author    Dimitry Andric <dim@FreeBSD.org>    2016-07-23 20:41:05 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2016-07-23 20:41:05 +0000
commit    01095a5d43bbfde13731688ddcf6048ebb8b7721 (patch)
tree      4def12e759965de927d963ac65840d663ef9d1ea /test/CodeGen
parent    f0f4822ed4b66e3579e92a89f368f8fb860e218e (diff)
download  src-test2-01095a5d43bbfde13731688ddcf6048ebb8b7721.tar.gz
          src-test2-01095a5d43bbfde13731688ddcf6048ebb8b7721.zip
Diffstat (limited to 'test/CodeGen'); each entry lists the file mode, the file path, and the number of lines changed:
-rw-r--r--test/CodeGen/AArch64/128bit_load_store.ll2
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll63
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir329
-rw-r--r--test/CodeGen/AArch64/a57-csel.ll5
-rw-r--r--test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll5
-rw-r--r--test/CodeGen/AArch64/aarch64-DAGCombine-findBetterNeighborChains-crash.ll43
-rw-r--r--test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll18
-rw-r--r--test/CodeGen/AArch64/aarch64-be-bv.ll198
-rw-r--r--test/CodeGen/AArch64/aarch64-deferred-spilling.ll514
-rw-r--r--test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll223
-rw-r--r--test/CodeGen/AArch64/aarch64-gep-opt.ll3
-rw-r--r--test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll86
-rw-r--r--test/CodeGen/AArch64/aarch64-interleaved-accesses.ll12
-rw-r--r--test/CodeGen/AArch64/aarch64-smull.ll4
-rw-r--r--test/CodeGen/AArch64/aarch64-stp-cluster.ll149
-rw-r--r--test/CodeGen/AArch64/aarch64-tbz.ll98
-rw-r--r--test/CodeGen/AArch64/aarch64-tryBitfieldInsertOpFromOr-crash.ll36
-rw-r--r--test/CodeGen/AArch64/addsub.ll8
-rw-r--r--test/CodeGen/AArch64/alloca.ll18
-rw-r--r--test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll15
-rw-r--r--test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll8
-rw-r--r--test/CodeGen/AArch64/arm64-aapcs-be.ll3
-rw-r--r--test/CodeGen/AArch64/arm64-abi-varargs.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-abi.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-abi_align.ll32
-rw-r--r--test/CodeGen/AArch64/arm64-addrmode.ll10
-rw-r--r--test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-atomic-128.ll6
-rw-r--r--test/CodeGen/AArch64/arm64-atomic.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-bitfield-extract.ll10
-rw-r--r--test/CodeGen/AArch64/arm64-build-vector.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-builtins-linux.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-ccmp.ll249
-rw-r--r--test/CodeGen/AArch64/arm64-collect-loh.ll22
-rw-r--r--test/CodeGen/AArch64/arm64-const-addr.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-convert-v4f64.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-csldst-mmo.ll25
-rw-r--r--test/CodeGen/AArch64/arm64-detect-vec-redux.ll52
-rw-r--r--test/CodeGen/AArch64/arm64-extern-weak.ll15
-rw-r--r--test/CodeGen/AArch64/arm64-extract.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll16
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-alloca.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-call.ll6
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-gv.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll8
-rw-r--r--test/CodeGen/AArch64/arm64-fcopysign.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-fma-combines.ll136
-rw-r--r--test/CodeGen/AArch64/arm64-fml-combines.ll128
-rw-r--r--test/CodeGen/AArch64/arm64-fp128.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-hello.ll21
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm.ll16
-rw-r--r--test/CodeGen/AArch64/arm64-join-reserved.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-ldp-cluster.ll150
-rw-r--r--test/CodeGen/AArch64/arm64-memcpy-inline.ll6
-rw-r--r--test/CodeGen/AArch64/arm64-misched-basic-A53.ll1
-rw-r--r--test/CodeGen/AArch64/arm64-misched-memdep-bug.ll3
-rw-r--r--test/CodeGen/AArch64/arm64-misched-multimmo.ll23
-rw-r--r--test/CodeGen/AArch64/arm64-movi.ll98
-rw-r--r--test/CodeGen/AArch64/arm64-mul.ll62
-rw-r--r--test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll94
-rw-r--r--test/CodeGen/AArch64/arm64-neon-2velem-high.ll36
-rw-r--r--test/CodeGen/AArch64/arm64-neon-copy.ll41
-rw-r--r--test/CodeGen/AArch64/arm64-nvcast.ll6
-rw-r--r--test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll58
-rw-r--r--test/CodeGen/AArch64/arm64-patchpoint.ll19
-rw-r--r--test/CodeGen/AArch64/arm64-register-pairing.ll31
-rw-r--r--test/CodeGen/AArch64/arm64-regress-opt-cmp.mir42
-rw-r--r--test/CodeGen/AArch64/arm64-rev.ll27
-rw-r--r--test/CodeGen/AArch64/arm64-shrink-wrapping.ll70
-rw-r--r--test/CodeGen/AArch64/arm64-stp-aa.ll34
-rw-r--r--test/CodeGen/AArch64/arm64-stp.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-this-return.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-tls-dynamic-together.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-variadic-aapcs.ll6
-rw-r--r--test/CodeGen/AArch64/arm64-vclz.ll28
-rw-r--r--test/CodeGen/AArch64/arm64-vecCmpBr.ll8
-rw-r--r--test/CodeGen/AArch64/arm64-vector-ext.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-vector-imm.ll18
-rw-r--r--test/CodeGen/AArch64/arm64-virtual_base.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-vshift.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-vshuffle.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll72
-rw-r--r--test/CodeGen/AArch64/atomic-ops.ll2
-rw-r--r--test/CodeGen/AArch64/bitfield-extract.ll98
-rw-r--r--test/CodeGen/AArch64/bitfield-insert.ll245
-rw-r--r--test/CodeGen/AArch64/bitfield.ll2
-rw-r--r--test/CodeGen/AArch64/bitreverse.ll17
-rw-r--r--test/CodeGen/AArch64/branch-folder-merge-mmos.ll2
-rw-r--r--test/CodeGen/AArch64/bswap-known-bits.ll44
-rw-r--r--test/CodeGen/AArch64/cmpxchg-O0.ll75
-rw-r--r--test/CodeGen/AArch64/combine-comparisons-by-cse.ll44
-rw-r--r--test/CodeGen/AArch64/complex-int-to-fp.ll2
-rw-r--r--test/CodeGen/AArch64/cond-sel.ll40
-rw-r--r--test/CodeGen/AArch64/cpus.ll3
-rw-r--r--test/CodeGen/AArch64/cxx-tlscc.ll127
-rw-r--r--test/CodeGen/AArch64/dag-combine-invaraints.ll2
-rw-r--r--test/CodeGen/AArch64/directcond.ll2
-rw-r--r--test/CodeGen/AArch64/div_minsize.ll45
-rw-r--r--test/CodeGen/AArch64/emutls.ll121
-rw-r--r--test/CodeGen/AArch64/emutls_generic.ll9
-rw-r--r--test/CodeGen/AArch64/extern-weak.ll15
-rw-r--r--test/CodeGen/AArch64/f16-instructions.ll38
-rw-r--r--test/CodeGen/AArch64/fast-isel-branch-cond-split.ll4
-rw-r--r--test/CodeGen/AArch64/fast-isel-cmp-vec.ll12
-rw-r--r--test/CodeGen/AArch64/fast-isel-gep.ll2
-rw-r--r--test/CodeGen/AArch64/fast-isel-tbz.ll4
-rw-r--r--test/CodeGen/AArch64/fastcc.ll147
-rw-r--r--test/CodeGen/AArch64/fcvt-int.ll25
-rw-r--r--test/CodeGen/AArch64/fcvt_combine.ll8
-rw-r--r--test/CodeGen/AArch64/fdiv-combine.ll30
-rw-r--r--test/CodeGen/AArch64/fdiv_combine.ll4
-rw-r--r--test/CodeGen/AArch64/fp-cond-sel.ll6
-rw-r--r--test/CodeGen/AArch64/fp16-v4-instructions.ll133
-rw-r--r--test/CodeGen/AArch64/fp16-vector-nvcast.ll12
-rw-r--r--test/CodeGen/AArch64/fpimm.ll13
-rw-r--r--test/CodeGen/AArch64/func-argpassing.ll2
-rw-r--r--test/CodeGen/AArch64/func-calls.ll14
-rw-r--r--test/CodeGen/AArch64/gep-nullptr.ll23
-rw-r--r--test/CodeGen/AArch64/global-merge-3.ll4
-rw-r--r--test/CodeGen/AArch64/global-merge-group-by-use.ll4
-rw-r--r--test/CodeGen/AArch64/half.ll12
-rw-r--r--test/CodeGen/AArch64/hints.ll2
-rw-r--r--test/CodeGen/AArch64/inlineasm-X-allocation.ll17
-rw-r--r--test/CodeGen/AArch64/inlineasm-X-constraint.ll152
-rw-r--r--test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll4
-rw-r--r--test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll125
-rw-r--r--test/CodeGen/AArch64/lit.local.cfg2
-rw-r--r--test/CodeGen/AArch64/local_vars.ll15
-rw-r--r--test/CodeGen/AArch64/logical-imm.ll4
-rw-r--r--test/CodeGen/AArch64/lower-range-metadata-func-call.ll44
-rw-r--r--test/CodeGen/AArch64/machine-combiner.ll2
-rw-r--r--test/CodeGen/AArch64/machine-copy-remove.ll94
-rw-r--r--test/CodeGen/AArch64/merge-store-dependency.ll63
-rw-r--r--test/CodeGen/AArch64/merge-store.ll13
-rw-r--r--test/CodeGen/AArch64/misched-fusion.ll10
-rw-r--r--test/CodeGen/AArch64/movimm-wzr.mir46
-rw-r--r--test/CodeGen/AArch64/movw-consts.ll10
-rw-r--r--test/CodeGen/AArch64/neg-imm.ll46
-rw-r--r--test/CodeGen/AArch64/neon-compare-instructions.ll57
-rw-r--r--test/CodeGen/AArch64/neon-mov.ll2
-rw-r--r--test/CodeGen/AArch64/neon-perm.ll2
-rw-r--r--test/CodeGen/AArch64/no-quad-ldp-stp.ll29
-rw-r--r--test/CodeGen/AArch64/nontemporal.ll6
-rw-r--r--test/CodeGen/AArch64/nzcv-save.ll2
-rw-r--r--test/CodeGen/AArch64/optimize-cond-branch.ll48
-rw-r--r--test/CodeGen/AArch64/pie.ll14
-rw-r--r--test/CodeGen/AArch64/preferred-alignment.ll28
-rw-r--r--test/CodeGen/AArch64/preserve_mostcc.ll40
-rw-r--r--test/CodeGen/AArch64/recp-fastmath.ll79
-rw-r--r--test/CodeGen/AArch64/regress-tblgen-chains.ll2
-rw-r--r--test/CodeGen/AArch64/rem_crash.ll257
-rw-r--r--test/CodeGen/AArch64/remat.ll10
-rw-r--r--test/CodeGen/AArch64/sibling-call.ll2
-rw-r--r--test/CodeGen/AArch64/special-reg.ll2
-rw-r--r--test/CodeGen/AArch64/sqrt-fastmath.ll160
-rw-r--r--test/CodeGen/AArch64/stack-guard-remat-bitcast.ll8
-rw-r--r--test/CodeGen/AArch64/stack-protector-target.ll19
-rw-r--r--test/CodeGen/AArch64/stackmap-frame-setup.ll4
-rw-r--r--test/CodeGen/AArch64/stackmap-liveness.ll2
-rw-r--r--test/CodeGen/AArch64/subs-to-sub-opt.ll23
-rw-r--r--test/CodeGen/AArch64/swifterror.ll385
-rw-r--r--test/CodeGen/AArch64/swiftself.ll67
-rw-r--r--test/CodeGen/AArch64/tailcall-ccmismatch.ll24
-rw-r--r--test/CodeGen/AArch64/tailcall-implicit-sret.ll2
-rw-r--r--test/CodeGen/AArch64/tailcall_misched_graph.ll6
-rw-r--r--test/CodeGen/AArch64/tailmerging_in_mbp.ll63
-rw-r--r--test/CodeGen/AArch64/vcvt-oversize.ll5
-rw-r--r--test/CodeGen/AArch64/vector-fcopysign.ll28
-rw-r--r--test/CodeGen/AArch64/vector_merge_dep_check.ll41
-rw-r--r--test/CodeGen/AMDGPU/32-bit-local-address-space.ll6
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll12
-rw-r--r--test/CodeGen/AMDGPU/add.ll27
-rw-r--r--test/CodeGen/AMDGPU/add_i64.ll8
-rw-r--r--test/CodeGen/AMDGPU/address-space.ll32
-rw-r--r--test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll106
-rw-r--r--test/CodeGen/AMDGPU/addrspacecast.ll235
-rw-r--r--test/CodeGen/AMDGPU/amdgcn.private-memory.ll31
-rw-r--r--test/CodeGen/AMDGPU/amdgcn.work-item-intrinsics.ll114
-rw-r--r--test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll8
-rw-r--r--test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll21
-rw-r--r--test/CodeGen/AMDGPU/amdgpu.private-memory.ll530
-rw-r--r--test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll (renamed from test/CodeGen/AMDGPU/work-item-intrinsics.ll)201
-rw-r--r--test/CodeGen/AMDGPU/and-gcn.ll27
-rw-r--r--test/CodeGen/AMDGPU/and.ll289
-rw-r--r--test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll238
-rw-r--r--test/CodeGen/AMDGPU/annotate-kernel-features.ll8
-rw-r--r--test/CodeGen/AMDGPU/array-ptr-calc-i32.ll38
-rw-r--r--test/CodeGen/AMDGPU/array-ptr-calc-i64.ll10
-rw-r--r--test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll14
-rw-r--r--test/CodeGen/AMDGPU/atomic_load_sub.ll4
-rw-r--r--test/CodeGen/AMDGPU/basic-branch.ll49
-rw-r--r--test/CodeGen/AMDGPU/bfm.ll24
-rw-r--r--test/CodeGen/AMDGPU/big_alu.ll2345
-rw-r--r--test/CodeGen/AMDGPU/bitcast.ll36
-rw-r--r--test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll158
-rw-r--r--test/CodeGen/AMDGPU/bitreverse.ll10
-rw-r--r--test/CodeGen/AMDGPU/branch-uniformity.ll41
-rw-r--r--test/CodeGen/AMDGPU/bug-vopc-commute.ll49
-rw-r--r--test/CodeGen/AMDGPU/call.ll18
-rw-r--r--test/CodeGen/AMDGPU/call_fs.ll4
-rw-r--r--test/CodeGen/AMDGPU/captured-frame-index.ll166
-rw-r--r--test/CodeGen/AMDGPU/cayman-loop-bug.ll14
-rw-r--r--test/CodeGen/AMDGPU/cf-loop-on-constant.ll121
-rw-r--r--test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll2
-rw-r--r--test/CodeGen/AMDGPU/cgp-addressing-modes.ll118
-rw-r--r--test/CodeGen/AMDGPU/cgp-bitfield-extract.ll301
-rw-r--r--test/CodeGen/AMDGPU/ci-use-flat-for-global.ll19
-rw-r--r--test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll56
-rw-r--r--test/CodeGen/AMDGPU/coalescer_distribute.ll53
-rw-r--r--test/CodeGen/AMDGPU/commute-compares.ll104
-rw-r--r--test/CodeGen/AMDGPU/commute-shifts.ll36
-rw-r--r--test/CodeGen/AMDGPU/commute_modifiers.ll68
-rw-r--r--test/CodeGen/AMDGPU/complex-folding.ll8
-rw-r--r--test/CodeGen/AMDGPU/convergent-inlineasm.ll45
-rw-r--r--test/CodeGen/AMDGPU/copy-illegal-type.ll104
-rw-r--r--test/CodeGen/AMDGPU/ctlz.ll10
-rw-r--r--test/CodeGen/AMDGPU/ctlz_zero_undef.ll8
-rw-r--r--test/CodeGen/AMDGPU/ctpop.ll12
-rw-r--r--test/CodeGen/AMDGPU/ctpop64.ll65
-rw-r--r--test/CodeGen/AMDGPU/cube.ll46
-rw-r--r--test/CodeGen/AMDGPU/cvt_f32_ubyte.ll130
-rw-r--r--test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll33
-rw-r--r--test/CodeGen/AMDGPU/debugger-emit-prologue.ll80
-rw-r--r--test/CodeGen/AMDGPU/debugger-insert-nops.ll71
-rw-r--r--test/CodeGen/AMDGPU/debugger-reserve-regs.ll62
-rw-r--r--test/CodeGen/AMDGPU/default-fp-mode.ll78
-rw-r--r--test/CodeGen/AMDGPU/detect-dead-lanes.mir428
-rw-r--r--test/CodeGen/AMDGPU/dot4-folding.ll27
-rw-r--r--test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll12
-rw-r--r--test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll30
-rw-r--r--test/CodeGen/AMDGPU/ds-sub-offset.ll17
-rw-r--r--test/CodeGen/AMDGPU/ds_read2.ll52
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_offset_order.ll5
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_superreg.ll86
-rw-r--r--test/CodeGen/AMDGPU/ds_read2st64.ll54
-rw-r--r--test/CodeGen/AMDGPU/ds_write2.ll81
-rw-r--r--test/CodeGen/AMDGPU/ds_write2st64.ll37
-rw-r--r--test/CodeGen/AMDGPU/dynamic_stackalloc.ll2
-rw-r--r--test/CodeGen/AMDGPU/elf.ll6
-rw-r--r--test/CodeGen/AMDGPU/endcf-loop-header.ll7
-rw-r--r--test/CodeGen/AMDGPU/extload-private.ll6
-rw-r--r--test/CodeGen/AMDGPU/extload.ll54
-rw-r--r--test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll126
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-f64.ll29
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-i16.ll86
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-i64.ll (renamed from test/CodeGen/AMDGPU/extract-vector-elt-i64.ll)29
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-i8.ll151
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt_i16.ll30
-rw-r--r--test/CodeGen/AMDGPU/extractelt-to-trunc.ll77
-rw-r--r--test/CodeGen/AMDGPU/fabs.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/fadd.ll4
-rw-r--r--test/CodeGen/AMDGPU/fcanonicalize.ll351
-rw-r--r--test/CodeGen/AMDGPU/fceil64.ll15
-rw-r--r--test/CodeGen/AMDGPU/fcopysign.f64.ll4
-rw-r--r--test/CodeGen/AMDGPU/fdiv.f64.ll8
-rw-r--r--test/CodeGen/AMDGPU/fdiv.ll207
-rw-r--r--test/CodeGen/AMDGPU/fetch-limits.r600.ll70
-rw-r--r--test/CodeGen/AMDGPU/fetch-limits.r700+.ll84
-rw-r--r--test/CodeGen/AMDGPU/ffloor.f64.ll33
-rw-r--r--test/CodeGen/AMDGPU/flat-address-space.ll5
-rw-r--r--test/CodeGen/AMDGPU/flat_atomics.ll968
-rw-r--r--test/CodeGen/AMDGPU/flat_atomics_i64.ll975
-rw-r--r--test/CodeGen/AMDGPU/floor.ll7
-rw-r--r--test/CodeGen/AMDGPU/fma-combine.ll144
-rw-r--r--test/CodeGen/AMDGPU/fma.ll2
-rw-r--r--test/CodeGen/AMDGPU/fmad.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmax.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmax3.f64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmax3.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmax_legacy.f64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmax_legacy.ll24
-rw-r--r--test/CodeGen/AMDGPU/fmaxnum.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmed3.ll154
-rw-r--r--test/CodeGen/AMDGPU/fmin.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmin3.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmin_legacy.f64.ll12
-rw-r--r--test/CodeGen/AMDGPU/fmin_legacy.ll53
-rw-r--r--test/CodeGen/AMDGPU/fminnum.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll16
-rw-r--r--test/CodeGen/AMDGPU/fmul.ll51
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.ll52
-rw-r--r--test/CodeGen/AMDGPU/fneg-fabs.f64.ll59
-rw-r--r--test/CodeGen/AMDGPU/fneg.f64.ll2
-rw-r--r--test/CodeGen/AMDGPU/fp-classify.ll5
-rw-r--r--test/CodeGen/AMDGPU/fp_to_sint.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/fp_to_sint.ll4
-rw-r--r--test/CodeGen/AMDGPU/fp_to_uint.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/fpext.ll12
-rw-r--r--test/CodeGen/AMDGPU/fract.f64.ll111
-rw-r--r--test/CodeGen/AMDGPU/fract.ll74
-rw-r--r--test/CodeGen/AMDGPU/frem.ll8
-rw-r--r--test/CodeGen/AMDGPU/fsqrt.f64.ll26
-rw-r--r--test/CodeGen/AMDGPU/fsqrt.ll154
-rw-r--r--test/CodeGen/AMDGPU/fsub.ll7
-rw-r--r--test/CodeGen/AMDGPU/fsub64.ll4
-rw-r--r--test/CodeGen/AMDGPU/ftrunc.f64.ll10
-rw-r--r--test/CodeGen/AMDGPU/ftrunc.ll4
-rw-r--r--test/CodeGen/AMDGPU/global-constant.ll2
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i1.ll302
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i16.ll302
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i32.ll308
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i8.ll299
-rw-r--r--test/CodeGen/AMDGPU/global-variable-relocs.ll203
-rw-r--r--test/CodeGen/AMDGPU/global-zero-initializer.ll13
-rw-r--r--test/CodeGen/AMDGPU/global_atomics.ll687
-rw-r--r--test/CodeGen/AMDGPU/global_atomics_i64.ll1037
-rw-r--r--test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll57
-rw-r--r--test/CodeGen/AMDGPU/gv-const-addrspace.ll37
-rw-r--r--test/CodeGen/AMDGPU/gv-offset-folding.ll21
-rw-r--r--test/CodeGen/AMDGPU/half.ll105
-rw-r--r--test/CodeGen/AMDGPU/hsa-default-device.ll11
-rw-r--r--test/CodeGen/AMDGPU/hsa-fp-mode.ll68
-rw-r--r--test/CodeGen/AMDGPU/hsa-func.ll61
-rw-r--r--test/CodeGen/AMDGPU/hsa-globals.ll146
-rw-r--r--test/CodeGen/AMDGPU/hsa-note-no-func.ll2
-rw-r--r--test/CodeGen/AMDGPU/hsa.ll19
-rw-r--r--test/CodeGen/AMDGPU/i1-copy-implicit-def.ll5
-rw-r--r--test/CodeGen/AMDGPU/i1-copy-phi.ll13
-rw-r--r--test/CodeGen/AMDGPU/imm.ll44
-rw-r--r--test/CodeGen/AMDGPU/indirect-addressing-si.ll325
-rw-r--r--test/CodeGen/AMDGPU/indirect-addressing-undef.mir327
-rw-r--r--test/CodeGen/AMDGPU/indirect-private-64.ll101
-rw-r--r--test/CodeGen/AMDGPU/inline-asm.ll174
-rw-r--r--test/CodeGen/AMDGPU/input-mods.ll8
-rw-r--r--test/CodeGen/AMDGPU/insert_vector_elt.ll358
-rw-r--r--test/CodeGen/AMDGPU/invalid-addrspacecast.ll8
-rw-r--r--test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll8
-rw-r--r--test/CodeGen/AMDGPU/jump-address.ll8
-rw-r--r--test/CodeGen/AMDGPU/kcache-fold.ll24
-rw-r--r--test/CodeGen/AMDGPU/kernarg-stack-alignment.ll44
-rw-r--r--test/CodeGen/AMDGPU/kernel-args.ll54
-rw-r--r--test/CodeGen/AMDGPU/large-alloca-compute.ll24
-rw-r--r--test/CodeGen/AMDGPU/large-alloca-graphics.ll33
-rw-r--r--test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll117
-rw-r--r--test/CodeGen/AMDGPU/large-work-group-registers.ll41
-rw-r--r--test/CodeGen/AMDGPU/lds-alignment.ll268
-rw-r--r--test/CodeGen/AMDGPU/lds-initializer.ll2
-rw-r--r--test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll47
-rw-r--r--test/CodeGen/AMDGPU/lds-output-queue.ll6
-rw-r--r--test/CodeGen/AMDGPU/lds-size.ll14
-rw-r--r--test/CodeGen/AMDGPU/lds-zero-initializer.ll2
-rw-r--r--test/CodeGen/AMDGPU/literals.ll4
-rw-r--r--test/CodeGen/AMDGPU/liveness.mir32
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll47
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll30
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll31
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll42
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll60
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll13
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll88
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll43
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll60
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll65
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll22
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll16
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll7
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll13
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll50
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll42
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll38
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll48
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll48
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll18
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.gather4.ll328
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.getlod.ll27
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.ll21
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll (renamed from test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll)46
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.sample.ll123
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll123
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.imageload.ll132
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.load.dword.ll11
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.packf16.ll7
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.resinfo.ll111
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sample.ll160
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sampled.ll143
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll21
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll10
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.tid.ll18
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll387
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll383
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll126
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll133
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll119
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll95
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll95
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.class.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll)140
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll)8
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll)42
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll)144
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll24
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll34
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll64
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll64
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll56
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll123
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.image.ll110
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll9
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll34
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll31
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll14
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll66
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll59
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll19
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll128
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.read.workdim.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll)31
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll49
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll39
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll68
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll28
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll6
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll16
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll22
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll45
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll38
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll15
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll (renamed from test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll)6
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll107
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll56
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll4
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll13
-rw-r--r--test/CodeGen/AMDGPU/llvm.cos.ll8
-rw-r--r--test/CodeGen/AMDGPU/llvm.dbg.value.ll7
-rw-r--r--test/CodeGen/AMDGPU/llvm.memcpy.ll224
-rw-r--r--test/CodeGen/AMDGPU/llvm.pow.ll12
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.dot4.ll (renamed from test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll)4
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll31
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll36
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll11
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll28
-rw-r--r--test/CodeGen/AMDGPU/llvm.r600.tex.ll65
-rw-r--r--test/CodeGen/AMDGPU/llvm.rint.ll16
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.f64.ll6
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.ll6
-rw-r--r--test/CodeGen/AMDGPU/llvm.sin.ll90
-rw-r--r--test/CodeGen/AMDGPU/llvm.sqrt.ll105
-rw-r--r--test/CodeGen/AMDGPU/load-constant-f64.ll15
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i1.ll371
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i16.ll441
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i32.ll380
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i64.ll84
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i8.ll567
-rw-r--r--test/CodeGen/AMDGPU/load-global-f32.ll93
-rw-r--r--test/CodeGen/AMDGPU/load-global-f64.ll94
-rw-r--r--test/CodeGen/AMDGPU/load-global-i1.ll371
-rw-r--r--test/CodeGen/AMDGPU/load-global-i16.ll476
-rw-r--r--test/CodeGen/AMDGPU/load-global-i32.ll521
-rw-r--r--test/CodeGen/AMDGPU/load-global-i64.ll122
-rw-r--r--test/CodeGen/AMDGPU/load-global-i8.ll564
-rw-r--r--test/CodeGen/AMDGPU/load-i1.ll149
-rw-r--r--test/CodeGen/AMDGPU/load-input-fold.ll13
-rw-r--r--test/CodeGen/AMDGPU/load-local-f32.ll110
-rw-r--r--test/CodeGen/AMDGPU/load-local-f64.ll154
-rw-r--r--test/CodeGen/AMDGPU/load-local-i1.ll371
-rw-r--r--test/CodeGen/AMDGPU/load-local-i16.ll454
-rw-r--r--test/CodeGen/AMDGPU/load-local-i32.ll182
-rw-r--r--test/CodeGen/AMDGPU/load-local-i64.ll154
-rw-r--r--test/CodeGen/AMDGPU/load-local-i8.ll556
-rw-r--r--test/CodeGen/AMDGPU/load-weird-sizes.ll31
-rw-r--r--test/CodeGen/AMDGPU/load.ll737
-rw-r--r--test/CodeGen/AMDGPU/load.vec.ll25
-rw-r--r--test/CodeGen/AMDGPU/load64.ll31
-rw-r--r--test/CodeGen/AMDGPU/local-64.ll18
-rw-r--r--test/CodeGen/AMDGPU/local-atomics.ll87
-rw-r--r--test/CodeGen/AMDGPU/local-atomics64.ll154
-rw-r--r--test/CodeGen/AMDGPU/local-memory-two-objects.ll63
-rw-r--r--test/CodeGen/AMDGPU/local-memory.amdgcn.ll92
-rw-r--r--test/CodeGen/AMDGPU/local-memory.ll69
-rw-r--r--test/CodeGen/AMDGPU/local-memory.r600.ll87
-rw-r--r--test/CodeGen/AMDGPU/local-stack-slot-bug.ll22
-rw-r--r--test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll46
-rw-r--r--test/CodeGen/AMDGPU/m0-spill.ll2
-rw-r--r--test/CodeGen/AMDGPU/mad-combine.ll168
-rw-r--r--test/CodeGen/AMDGPU/mad-sub.ll72
-rw-r--r--test/CodeGen/AMDGPU/mad24-get-global-id.ll36
-rw-r--r--test/CodeGen/AMDGPU/mad_int24.ll15
-rw-r--r--test/CodeGen/AMDGPU/mad_uint24.ll4
-rw-r--r--test/CodeGen/AMDGPU/madak.ll59
-rw-r--r--test/CodeGen/AMDGPU/madmk.ll58
-rw-r--r--test/CodeGen/AMDGPU/max-literals.ll17
-rw-r--r--test/CodeGen/AMDGPU/max.ll183
-rw-r--r--test/CodeGen/AMDGPU/max3.ll8
-rw-r--r--test/CodeGen/AMDGPU/merge-stores.ll46
-rw-r--r--test/CodeGen/AMDGPU/min.ll209
-rw-r--r--test/CodeGen/AMDGPU/min3.ll12
-rw-r--r--test/CodeGen/AMDGPU/missing-store.ll8
-rw-r--r--test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll7
-rw-r--r--test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll6
-rw-r--r--test/CodeGen/AMDGPU/mubuf.ll41
-rw-r--r--test/CodeGen/AMDGPU/mul.ll6
-rw-r--r--test/CodeGen/AMDGPU/mul_int24.ll4
-rw-r--r--test/CodeGen/AMDGPU/mul_uint24.ll4
-rw-r--r--test/CodeGen/AMDGPU/multilevel-break.ll41
-rw-r--r--test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll14
-rw-r--r--test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll15
-rw-r--r--test/CodeGen/AMDGPU/no-shrink-extloads.ll32
-rw-r--r--test/CodeGen/AMDGPU/opencl-image-metadata.ll2
-rw-r--r--test/CodeGen/AMDGPU/operand-folding.ll17
-rw-r--r--test/CodeGen/AMDGPU/or.ll2
-rw-r--r--test/CodeGen/AMDGPU/over-max-lds-size.ll14
-rw-r--r--test/CodeGen/AMDGPU/parallelandifcollapse.ll3
-rw-r--r--test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll4
-rw-r--r--test/CodeGen/AMDGPU/predicate-dp4.ll11
-rw-r--r--test/CodeGen/AMDGPU/predicates.ll20
-rw-r--r--test/CodeGen/AMDGPU/private-element-size.ll252
-rw-r--r--test/CodeGen/AMDGPU/private-memory-atomics.ll12
-rw-r--r--test/CodeGen/AMDGPU/private-memory-broken.ll2
-rw-r--r--test/CodeGen/AMDGPU/private-memory-r600.ll300
-rw-r--r--test/CodeGen/AMDGPU/private-memory.ll325
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll50
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll3
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-globals.ll35
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll25
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-lifetime.ll24
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll65
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-no-opts.ll38
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll130
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-shaders.ll29
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll35
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll64
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll204
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll133
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll24
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-volatile.ll45
-rw-r--r--test/CodeGen/AMDGPU/pv-packing.ll13
-rw-r--r--test/CodeGen/AMDGPU/pv.ll49
-rw-r--r--test/CodeGen/AMDGPU/r600-encoding.ll8
-rw-r--r--test/CodeGen/AMDGPU/r600-export-fix.ll22
-rw-r--r--test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll94
-rw-r--r--test/CodeGen/AMDGPU/r600.private-memory.ll26
-rw-r--r--test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll107
-rw-r--r--test/CodeGen/AMDGPU/r600cfg.ll18
-rw-r--r--test/CodeGen/AMDGPU/rcp-pattern.ll11
-rw-r--r--test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll14
-rw-r--r--test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll14
-rw-r--r--test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll13
-rw-r--r--test/CodeGen/AMDGPU/read_register.ll81
-rw-r--r--test/CodeGen/AMDGPU/readcyclecounter.ll25
-rw-r--r--test/CodeGen/AMDGPU/reciprocal.ll8
-rw-r--r--test/CodeGen/AMDGPU/reduce-load-width-alignment.ll38
-rw-r--r--test/CodeGen/AMDGPU/reduce-store-width-alignment.ll53
-rw-r--r--test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll43
-rw-r--r--test/CodeGen/AMDGPU/register-count-comments.ll6
-rw-r--r--test/CodeGen/AMDGPU/rename-disconnected-bug.ll33
-rw-r--r--test/CodeGen/AMDGPU/rename-independent-subregs.mir30
-rw-r--r--test/CodeGen/AMDGPU/reorder-stores.ll6
-rw-r--r--test/CodeGen/AMDGPU/ret.ll43
-rw-r--r--test/CodeGen/AMDGPU/ret_jump.ll63
-rw-r--r--test/CodeGen/AMDGPU/rotl.ll4
-rw-r--r--test/CodeGen/AMDGPU/rsq.ll14
-rw-r--r--test/CodeGen/AMDGPU/runtime-metadata.ll848
-rw-r--r--test/CodeGen/AMDGPU/rv7x0_count3.ll79
-rw-r--r--test/CodeGen/AMDGPU/s_addk_i32.ll93
-rw-r--r--test/CodeGen/AMDGPU/s_mulk_i32.ll41
-rw-r--r--test/CodeGen/AMDGPU/salu-to-valu.ll106
-rw-r--r--test/CodeGen/AMDGPU/scalar_to_vector.ll21
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll17
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll14
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop.ll14
-rw-r--r--test/CodeGen/AMDGPU/schedule-global-loads.ll10
-rw-r--r--test/CodeGen/AMDGPU/schedule-if-2.ll4
-rw-r--r--test/CodeGen/AMDGPU/schedule-if.ll4
-rw-r--r--test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll17
-rw-r--r--test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll51
-rw-r--r--test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll10
-rw-r--r--test/CodeGen/AMDGPU/scratch-buffer.ll22
-rw-r--r--test/CodeGen/AMDGPU/sdiv.ll60
-rw-r--r--test/CodeGen/AMDGPU/sdivrem24.ll124
-rw-r--r--test/CodeGen/AMDGPU/sdivrem64.ll78
-rw-r--r--test/CodeGen/AMDGPU/select-i1.ll14
-rw-r--r--test/CodeGen/AMDGPU/select-vectors.ll81
-rw-r--r--test/CodeGen/AMDGPU/selected-stack-object.ll15
-rw-r--r--test/CodeGen/AMDGPU/setcc-opt.ll117
-rw-r--r--test/CodeGen/AMDGPU/setcc.ll36
-rw-r--r--test/CodeGen/AMDGPU/setcc64.ll12
-rw-r--r--test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll22
-rw-r--r--test/CodeGen/AMDGPU/sext-in-reg.ll39
-rw-r--r--test/CodeGen/AMDGPU/sgpr-control-flow.ll10
-rw-r--r--test/CodeGen/AMDGPU/sgpr-copy.ll454
-rw-r--r--test/CodeGen/AMDGPU/shared-op-cycle.ll13
-rw-r--r--test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll118
-rw-r--r--test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll386
-rw-r--r--test/CodeGen/AMDGPU/shift-i64-opts.ll245
-rw-r--r--test/CodeGen/AMDGPU/shl.ll171
-rw-r--r--test/CodeGen/AMDGPU/shl_add_constant.ll16
-rw-r--r--test/CodeGen/AMDGPU/shl_add_ptr.ll38
-rw-r--r--test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll25
-rw-r--r--test/CodeGen/AMDGPU/si-annotate-cf.ll95
-rw-r--r--test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll24
-rw-r--r--test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll8
-rw-r--r--test/CodeGen/AMDGPU/si-literal-folding.ll4
-rw-r--r--test/CodeGen/AMDGPU/si-lod-bias.ll60
-rw-r--r--test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll56
-rw-r--r--test/CodeGen/AMDGPU/si-scheduler.ll61
-rw-r--r--test/CodeGen/AMDGPU/si-sgpr-spill.ll3040
-rw-r--r--test/CodeGen/AMDGPU/si-spill-cf.ll522
-rw-r--r--test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll60
-rw-r--r--test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll74
-rw-r--r--test/CodeGen/AMDGPU/si-vector-hang.ll2
-rw-r--r--test/CodeGen/AMDGPU/sign_extend.ll144
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.f64.ll18
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.i64.ll15
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.ll2
-rw-r--r--test/CodeGen/AMDGPU/skip-if-dead.ll390
-rw-r--r--test/CodeGen/AMDGPU/smed3.ll449
-rw-r--r--test/CodeGen/AMDGPU/sminmax.ll118
-rw-r--r--test/CodeGen/AMDGPU/smrd-vccz-bug.ll49
-rw-r--r--test/CodeGen/AMDGPU/smrd.ll15
-rw-r--r--test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll8
-rw-r--r--test/CodeGen/AMDGPU/spill-scavenge-offset.ll21
-rw-r--r--test/CodeGen/AMDGPU/split-scalar-i64-add.ll4
-rw-r--r--test/CodeGen/AMDGPU/split-smrd.ll46
-rw-r--r--test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll17
-rw-r--r--test/CodeGen/AMDGPU/sra.ll371
-rw-r--r--test/CodeGen/AMDGPU/store-barrier.ll17
-rw-r--r--test/CodeGen/AMDGPU/store-v3i64.ll121
-rw-r--r--test/CodeGen/AMDGPU/store.ll71
-rw-r--r--test/CodeGen/AMDGPU/structurize.ll2
-rw-r--r--test/CodeGen/AMDGPU/structurize1.ll2
-rw-r--r--test/CodeGen/AMDGPU/sub.ll24
-rw-r--r--test/CodeGen/AMDGPU/subreg-coalescer-crash.ll8
-rw-r--r--test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll18
-rw-r--r--test/CodeGen/AMDGPU/swizzle-export.ll21
-rw-r--r--test/CodeGen/AMDGPU/target-cpu.ll112
-rw-r--r--test/CodeGen/AMDGPU/tex-clause-antidep.ll14
-rw-r--r--test/CodeGen/AMDGPU/texture-input-merge.ll16
-rw-r--r--test/CodeGen/AMDGPU/trap.ll15
-rw-r--r--test/CodeGen/AMDGPU/trunc-bitcast-vector.ll92
-rw-r--r--test/CodeGen/AMDGPU/trunc-cmp-constant.ll17
-rw-r--r--test/CodeGen/AMDGPU/trunc-store.ll34
-rw-r--r--test/CodeGen/AMDGPU/trunc.ll2
-rw-r--r--test/CodeGen/AMDGPU/udiv.ll56
-rw-r--r--test/CodeGen/AMDGPU/udivrem.ll72
-rw-r--r--test/CodeGen/AMDGPU/udivrem24.ll116
-rw-r--r--test/CodeGen/AMDGPU/udivrem64.ll18
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.f64.ll18
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.i64.ll2
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.ll2
-rw-r--r--test/CodeGen/AMDGPU/umed3.ll484
-rw-r--r--test/CodeGen/AMDGPU/unaligned-load-store.ll549
-rw-r--r--test/CodeGen/AMDGPU/undefined-subreg-liverange.ll90
-rw-r--r--test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll16
-rw-r--r--test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll27
-rw-r--r--test/CodeGen/AMDGPU/uniform-cfg.ll439
-rw-r--r--test/CodeGen/AMDGPU/uniform-crash.ll57
-rw-r--r--test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll67
-rw-r--r--test/CodeGen/AMDGPU/unknown-processor.ll20
-rw-r--r--test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll42
-rw-r--r--test/CodeGen/AMDGPU/v_cndmask.ll6
-rw-r--r--test/CodeGen/AMDGPU/v_mac.ll99
-rw-r--r--test/CodeGen/AMDGPU/valu-i1.ll51
-rw-r--r--test/CodeGen/AMDGPU/vector-alloca.ll2
-rw-r--r--test/CodeGen/AMDGPU/vector-extract-insert.ll84
-rw-r--r--test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll48
-rw-r--r--test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll26
-rw-r--r--test/CodeGen/AMDGPU/vi-removed-intrinsics.ll24
-rw-r--r--test/CodeGen/AMDGPU/vop-shrink.ll10
-rw-r--r--test/CodeGen/AMDGPU/vselect.ll56
-rw-r--r--test/CodeGen/AMDGPU/wait.ll11
-rw-r--r--test/CodeGen/AMDGPU/waitcnt-flat.ll16
-rw-r--r--test/CodeGen/AMDGPU/wqm.ll366
-rw-r--r--test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll22
-rw-r--r--test/CodeGen/AMDGPU/write_register.ll80
-rw-r--r--test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll2
-rw-r--r--test/CodeGen/AMDGPU/xor.ll4
-rw-r--r--test/CodeGen/AMDGPU/zero_extend.ll4
-rw-r--r--test/CodeGen/AMDGPU/zext-i64-bit-operand.ll41
-rw-r--r--test/CodeGen/ARM/2009-08-31-LSDA-Name.ll8
-rw-r--r--test/CodeGen/ARM/2009-10-16-Scope.ll5
-rw-r--r--test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll5
-rw-r--r--test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll7
-rw-r--r--test/CodeGen/ARM/2010-08-04-StackVariable.ll20
-rw-r--r--test/CodeGen/ARM/2010-11-29-PrologueBug.ll25
-rw-r--r--test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll13
-rw-r--r--test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll2
-rw-r--r--test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll4
-rw-r--r--test/CodeGen/ARM/2011-04-26-SchedTweak.ll4
-rw-r--r--test/CodeGen/ARM/2011-06-09-TailCallByVal.ll2
-rw-r--r--test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll13
-rw-r--r--test/CodeGen/ARM/2011-08-25-ldmia_ret.ll2
-rw-r--r--test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll3
-rw-r--r--test/CodeGen/ARM/2016-05-01-RegScavengerAssert.ll192
-rw-r--r--test/CodeGen/ARM/ARMLoadStoreDBG.mir (renamed from test/CodeGen/MIR/ARM/ARMLoadStoreDBG.mir)75
-rw-r--r--test/CodeGen/ARM/Windows/builtin_longjmp.ll15
-rw-r--r--test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll2
-rw-r--r--test/CodeGen/ARM/Windows/chkstk.ll4
-rw-r--r--test/CodeGen/ARM/Windows/dbzchk.ll192
-rw-r--r--test/CodeGen/ARM/Windows/division.ll23
-rw-r--r--test/CodeGen/ARM/Windows/dllexport.ll75
-rw-r--r--test/CodeGen/ARM/Windows/long-calls.ll4
-rw-r--r--test/CodeGen/ARM/Windows/no-aeabi.ll2
-rw-r--r--test/CodeGen/ARM/Windows/overflow.ll77
-rw-r--r--test/CodeGen/ARM/Windows/tls.ll157
-rw-r--r--test/CodeGen/ARM/align.ll20
-rw-r--r--test/CodeGen/ARM/arm-and-tst-peephole.ll2
-rw-r--r--test/CodeGen/ARM/arm-eabi.ll8
-rw-r--r--test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll86
-rw-r--r--test/CodeGen/ARM/arm-interleaved-accesses.ll12
-rw-r--r--test/CodeGen/ARM/arm-shrink-wrapping.ll4
-rw-r--r--test/CodeGen/ARM/atomic-64bit.ll186
-rw-r--r--test/CodeGen/ARM/atomic-op.ll111
-rw-r--r--test/CodeGen/ARM/atomic-ops-v8.ll76
-rw-r--r--test/CodeGen/ARM/bfx.ll16
-rw-r--r--test/CodeGen/ARM/build-attributes-encoding.s11
-rw-r--r--test/CodeGen/ARM/build-attributes.ll203
-rw-r--r--test/CodeGen/ARM/byval_load_align.ll2
-rw-r--r--test/CodeGen/ARM/call-tc.ll18
-rw-r--r--test/CodeGen/ARM/call.ll2
-rw-r--r--test/CodeGen/ARM/carry.ll7
-rw-r--r--test/CodeGen/ARM/cdp.ll13
-rw-r--r--test/CodeGen/ARM/cdp2.ll13
-rw-r--r--test/CodeGen/ARM/cmpxchg-O0.ll113
-rw-r--r--test/CodeGen/ARM/cmpxchg-idioms.ll65
-rw-r--r--test/CodeGen/ARM/cmpxchg-weak.ll6
-rw-r--r--test/CodeGen/ARM/coalesce-dbgvalue.ll5
-rw-r--r--test/CodeGen/ARM/code-placement.ll6
-rw-r--r--test/CodeGen/ARM/crash-greedy.ll2
-rw-r--r--test/CodeGen/ARM/cxx-tlscc.ll109
-rw-r--r--test/CodeGen/ARM/dagcombine-anyexttozeroext.ll38
-rw-r--r--test/CodeGen/ARM/darwin-tls.ll17
-rw-r--r--test/CodeGen/ARM/debug-frame-vararg.ll5
-rw-r--r--test/CodeGen/ARM/debug-frame.ll5
-rw-r--r--test/CodeGen/ARM/debug-info-arg.ll5
-rw-r--r--test/CodeGen/ARM/debug-info-blocks.ll5
-rw-r--r--test/CodeGen/ARM/debug-info-branch-folding.ll9
-rw-r--r--test/CodeGen/ARM/debug-info-d16-reg.ll9
-rw-r--r--test/CodeGen/ARM/debug-info-no-frame.ll4
-rw-r--r--test/CodeGen/ARM/debug-info-qreg.ll9
-rw-r--r--test/CodeGen/ARM/debug-info-s16-reg.ll9
-rw-r--r--test/CodeGen/ARM/debug-info-sreg2.ll5
-rw-r--r--test/CodeGen/ARM/debug-segmented-stacks.ll5
-rw-r--r--test/CodeGen/ARM/debugtrap.ll2
-rw-r--r--test/CodeGen/ARM/default-float-abi.ll3
-rw-r--r--test/CodeGen/ARM/default-reloc.ll5
-rw-r--r--test/CodeGen/ARM/divmod-eabi.ll186
-rw-r--r--test/CodeGen/ARM/eh-resume-darwin.ll7
-rw-r--r--test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll4
-rw-r--r--test/CodeGen/ARM/ehabi-handlerdata.ll4
-rw-r--r--test/CodeGen/ARM/ehabi.ll16
-rw-r--r--test/CodeGen/ARM/emutls.ll74
-rw-r--r--test/CodeGen/ARM/emutls_generic.ll18
-rw-r--r--test/CodeGen/ARM/fast-isel-call.ll26
-rw-r--r--test/CodeGen/ARM/fast-isel-deadcode.ll2
-rw-r--r--test/CodeGen/ARM/fast-isel-intrinsic.ll97
-rw-r--r--test/CodeGen/ARM/fast-isel-pie.ll19
-rw-r--r--test/CodeGen/ARM/fast-isel.ll14
-rw-r--r--test/CodeGen/ARM/fast-tail-call.ll1
-rw-r--r--test/CodeGen/ARM/fp16-promote.ll74
-rw-r--r--test/CodeGen/ARM/fp16-v3.ll26
-rw-r--r--test/CodeGen/ARM/fp16.ll15
-rw-r--r--test/CodeGen/ARM/globals.ll12
-rw-r--r--test/CodeGen/ARM/half.ll8
-rw-r--r--test/CodeGen/ARM/hello.ll2
-rw-r--r--test/CodeGen/ARM/ifcvt-iter-indbr.ll2
-rw-r--r--test/CodeGen/ARM/inlineasm-X-allocation.ll21
-rw-r--r--test/CodeGen/ARM/inlineasm-X-constraint.ll157
-rw-r--r--test/CodeGen/ARM/inlineasm-ldr-pseudo.ll4
-rw-r--r--test/CodeGen/ARM/inlineasm3.ll25
-rw-r--r--test/CodeGen/ARM/interrupt-attr.ll8
-rw-r--r--test/CodeGen/ARM/interval-update-remat.ll162
-rw-r--r--test/CodeGen/ARM/intrinsics-coprocessor.ll79
-rw-r--r--test/CodeGen/ARM/intrinsics.ll39
-rw-r--r--test/CodeGen/ARM/invalidated-save-point.ll27
-rw-r--r--test/CodeGen/ARM/ldc2l.ll11
-rw-r--r--test/CodeGen/ARM/ldm-base-writeback.ll21
-rw-r--r--test/CodeGen/ARM/ldr_frame.ll11
-rw-r--r--test/CodeGen/ARM/ldrd.ll52
-rw-r--r--test/CodeGen/ARM/ldstrex-m.ll2
-rw-r--r--test/CodeGen/ARM/legalize-unaligned-load.ll2
-rw-r--r--test/CodeGen/ARM/litpool-licm.ll46
-rw-r--r--test/CodeGen/ARM/local-call.ll20
-rw-r--r--test/CodeGen/ARM/longMAC.ll29
-rw-r--r--test/CodeGen/ARM/lsr-code-insertion.ll4
-rw-r--r--test/CodeGen/ARM/macho-frame-offset.ll12
-rw-r--r--test/CodeGen/ARM/memcpy-no-inline.ll33
-rw-r--r--test/CodeGen/ARM/memfunc.ll29
-rw-r--r--test/CodeGen/ARM/minsize-call-cse.ll28
-rw-r--r--test/CodeGen/ARM/movt.ll6
-rw-r--r--test/CodeGen/ARM/msr-it-block.ll55
-rw-r--r--test/CodeGen/ARM/none-macho.ll10
-rw-r--r--test/CodeGen/ARM/pic.ll4
-rw-r--r--test/CodeGen/ARM/pie.ll18
-rw-r--r--test/CodeGen/ARM/plt-relative-reloc.ll16
-rw-r--r--test/CodeGen/ARM/popcnt.ll16
-rw-r--r--test/CodeGen/ARM/pr26669.ll31
-rw-r--r--test/CodeGen/ARM/preferred-align.ll8
-rw-r--r--test/CodeGen/ARM/rem_crash.ll257
-rw-r--r--test/CodeGen/ARM/returned-ext.ll32
-rw-r--r--test/CodeGen/ARM/sincos.ll18
-rw-r--r--test/CodeGen/ARM/sjlj-prepare-critical-edge.ll4
-rw-r--r--test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll6
-rw-r--r--test/CodeGen/ARM/smul.ll111
-rw-r--r--test/CodeGen/ARM/smulw.ll26
-rw-r--r--test/CodeGen/ARM/special-reg-v8m-base.ll142
-rw-r--r--test/CodeGen/ARM/special-reg-v8m-main.ll214
-rw-r--r--test/CodeGen/ARM/ssat.ll215
-rw-r--r--test/CodeGen/ARM/static-addr-hoisting.ll22
-rw-r--r--test/CodeGen/ARM/stc2.ll11
-rw-r--r--test/CodeGen/ARM/struct_byval.ll31
-rw-r--r--test/CodeGen/ARM/swift-ios.ll68
-rw-r--r--test/CodeGen/ARM/swift-return.ll133
-rw-r--r--test/CodeGen/ARM/swift-vldm.ll1
-rw-r--r--test/CodeGen/ARM/swifterror.ll381
-rw-r--r--test/CodeGen/ARM/swiftself.ll65
-rw-r--r--test/CodeGen/ARM/t2-shrink-ldrpost.ll52
-rw-r--r--test/CodeGen/ARM/tail-call-builtin.ll37
-rw-r--r--test/CodeGen/ARM/tail-call-weak.ll7
-rw-r--r--test/CodeGen/ARM/this-return.ll10
-rw-r--r--test/CodeGen/ARM/thread_pointer.ll4
-rw-r--r--test/CodeGen/ARM/thumb-alignment.ll6
-rw-r--r--test/CodeGen/ARM/thumb-stub.ll10
-rw-r--r--test/CodeGen/ARM/thumb1-ldst-opt.ll2
-rw-r--r--test/CodeGen/ARM/thumb1-varalloc.ll12
-rw-r--r--test/CodeGen/ARM/thumb2-size-opt.ll16
-rw-r--r--test/CodeGen/ARM/tls-models.ll4
-rw-r--r--test/CodeGen/ARM/tls3.ll2
-rw-r--r--test/CodeGen/ARM/trap.ll67
-rw-r--r--test/CodeGen/ARM/truncstore-dag-combine.ll10
-rw-r--r--test/CodeGen/ARM/twoaddrinstr.ll1
-rw-r--r--test/CodeGen/ARM/urem-opt-size.ll45
-rw-r--r--test/CodeGen/ARM/v7k-libcalls.ll9
-rw-r--r--test/CodeGen/ARM/v7k-sincos.ll2
-rw-r--r--test/CodeGen/ARM/vcnt.ll78
-rw-r--r--test/CodeGen/ARM/vcvt_combine.ll8
-rw-r--r--test/CodeGen/ARM/vdiv_combine.ll8
-rw-r--r--test/CodeGen/ARM/vfp-libcalls.ll2
-rw-r--r--test/CodeGen/ARM/vfp-regs-dwarf.ll5
-rw-r--r--test/CodeGen/ARM/vminmax.ll17
-rw-r--r--test/CodeGen/ARM/warn-stack.ll2
-rw-r--r--test/CodeGen/ARM/wide-compares.ll52
-rw-r--r--test/CodeGen/ARM/widen-vmovs.ll2
-rw-r--r--test/CodeGen/ARM/zero-cycle-zero.ll2
-rw-r--r--test/CodeGen/BPF/sdiv_error.ll9
-rw-r--r--test/CodeGen/CPP/2007-06-16-Funcname.ll7
-rw-r--r--test/CodeGen/CPP/2009-05-01-Long-Double.ll13
-rw-r--r--test/CodeGen/CPP/2009-05-04-CondBr.ll28
-rw-r--r--test/CodeGen/CPP/2012-02-05-UnitVarCrash.ll6
-rw-r--r--test/CodeGen/CPP/atomic.ll89
-rw-r--r--test/CodeGen/CPP/attributes.ll7
-rw-r--r--test/CodeGen/CPP/gep.ll10
-rw-r--r--test/CodeGen/CPP/lit.local.cfg3
-rw-r--r--test/CodeGen/Generic/MachineBranchProb.ll16
-rw-r--r--test/CodeGen/Generic/Makefile23
-rw-r--r--test/CodeGen/Generic/dont-remove-empty-preheader.ll39
-rw-r--r--test/CodeGen/Generic/run-pass.ll7
-rw-r--r--test/CodeGen/Generic/stop-after.ll2
-rw-r--r--test/CodeGen/Generic/vector-redux.ll237
-rw-r--r--test/CodeGen/Hexagon/Atomics.ll13
-rw-r--r--test/CodeGen/Hexagon/absaddr-store.ll1
-rw-r--r--test/CodeGen/Hexagon/adde.ll2
-rw-r--r--test/CodeGen/Hexagon/avoid-predspill-calleesaved.ll49
-rw-r--r--test/CodeGen/Hexagon/avoid-predspill.ll42
-rw-r--r--test/CodeGen/Hexagon/bit-extractu-half.ll13
-rw-r--r--test/CodeGen/Hexagon/bitconvert-vector.ll27
-rw-r--r--test/CodeGen/Hexagon/block-addr.ll2
-rw-r--r--test/CodeGen/Hexagon/block-ranges-nodef.ll55
-rw-r--r--test/CodeGen/Hexagon/branch-non-mbb.ll46
-rw-r--r--test/CodeGen/Hexagon/brev_ld.ll26
-rw-r--r--test/CodeGen/Hexagon/brev_st.ll19
-rw-r--r--test/CodeGen/Hexagon/builtin-prefetch-offset.ll28
-rw-r--r--test/CodeGen/Hexagon/builtin-prefetch.ll29
-rw-r--r--test/CodeGen/Hexagon/callr-dep-edge.ll20
-rw-r--r--test/CodeGen/Hexagon/cext-check.ll2
-rw-r--r--test/CodeGen/Hexagon/cfi-late.ll5
-rw-r--r--test/CodeGen/Hexagon/cfi-offset.ll43
-rw-r--r--test/CodeGen/Hexagon/circ-load-isel.ll18
-rw-r--r--test/CodeGen/Hexagon/circ_ld.ll26
-rw-r--r--test/CodeGen/Hexagon/circ_st.ll20
-rw-r--r--test/CodeGen/Hexagon/clr_set_toggle.ll17
-rw-r--r--test/CodeGen/Hexagon/const64.ll18
-rw-r--r--test/CodeGen/Hexagon/csr-func-usedef.ll72
-rw-r--r--test/CodeGen/Hexagon/eliminate-pred-spill.ll144
-rw-r--r--test/CodeGen/Hexagon/expand-condsets-pred-undef.ll22
-rw-r--r--test/CodeGen/Hexagon/extload-combine.ll2
-rw-r--r--test/CodeGen/Hexagon/gp-plus-offset-load.ll4
-rw-r--r--test/CodeGen/Hexagon/hwloop-dbg.ll5
-rw-r--r--test/CodeGen/Hexagon/ifcvt-diamond-bad.ll43
-rw-r--r--test/CodeGen/Hexagon/inline-asm-qv.ll19
-rw-r--r--test/CodeGen/Hexagon/insert4.ll112
-rw-r--r--test/CodeGen/Hexagon/intrinsics/system_user.ll13
-rw-r--r--test/CodeGen/Hexagon/memops-stack.ll147
-rw-r--r--test/CodeGen/Hexagon/memops.ll126
-rw-r--r--test/CodeGen/Hexagon/misched-top-rptracker-sync.ll151
-rw-r--r--test/CodeGen/Hexagon/newvaluestore.ll21
-rw-r--r--test/CodeGen/Hexagon/opt-addr-mode.ll107
-rw-r--r--test/CodeGen/Hexagon/packetize-tailcall-arg.ll22
-rw-r--r--test/CodeGen/Hexagon/peephole-op-swap.ll30
-rw-r--r--test/CodeGen/Hexagon/pic-local.ll19
-rw-r--r--test/CodeGen/Hexagon/pic-regusage.ll69
-rw-r--r--test/CodeGen/Hexagon/rdf-copy-undef2.ll55
-rw-r--r--test/CodeGen/Hexagon/rdf-copy.ll2
-rw-r--r--test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll37
-rw-r--r--test/CodeGen/Hexagon/rdf-inline-asm.ll36
-rw-r--r--test/CodeGen/Hexagon/rdf-reset-kills.ll28
-rw-r--r--test/CodeGen/Hexagon/reg-scavengebug-3.ll80
-rw-r--r--test/CodeGen/Hexagon/reg-scavenger-valid-slot.ll100
-rw-r--r--test/CodeGen/Hexagon/restore-single-reg.ll42
-rw-r--r--test/CodeGen/Hexagon/ret-struct-by-val.ll18
-rw-r--r--test/CodeGen/Hexagon/runtime-stkchk.ll44
-rw-r--r--test/CodeGen/Hexagon/sdata-array.ll13
-rw-r--r--test/CodeGen/Hexagon/sdata-basic.ll16
-rw-r--r--test/CodeGen/Hexagon/section_7275.ll54
-rw-r--r--test/CodeGen/Hexagon/select-instr-align.ll31
-rw-r--r--test/CodeGen/Hexagon/static.ll8
-rw-r--r--test/CodeGen/Hexagon/store-shift.ll50
-rw-r--r--test/CodeGen/Hexagon/storerinewabs.ll17
-rw-r--r--test/CodeGen/Hexagon/struct_args_large.ll2
-rw-r--r--test/CodeGen/Hexagon/sube.ll4
-rw-r--r--test/CodeGen/Hexagon/tail-dup-subreg-map.ll67
-rw-r--r--test/CodeGen/Hexagon/tls_pic.ll37
-rw-r--r--test/CodeGen/Hexagon/tls_static.ll28
-rw-r--r--test/CodeGen/Hexagon/v60-cur.ll62
-rw-r--r--test/CodeGen/Hexagon/v60Intrins.ll2
-rw-r--r--test/CodeGen/Hexagon/vec-pred-spill1.ll80
-rw-r--r--test/CodeGen/Hexagon/vector-align.ll38
-rw-r--r--test/CodeGen/Hexagon/vload-postinc-sel.ll52
-rw-r--r--test/CodeGen/Hexagon/vselect-pseudo.ll33
-rw-r--r--test/CodeGen/Hexagon/vsplat-isel.ll10
-rw-r--r--test/CodeGen/Hexagon/zextloadi1.ll21
-rw-r--r--test/CodeGen/Inputs/DbgValueOtherTargets.ll5
-rw-r--r--test/CodeGen/Lanai/codemodel.ll30
-rw-r--r--test/CodeGen/Lanai/comparisons_i32.ll96
-rw-r--r--test/CodeGen/Lanai/comparisons_i64.ll108
-rw-r--r--test/CodeGen/Lanai/constant_multiply.ll107
-rw-r--r--test/CodeGen/Lanai/delay_filler.ll41
-rw-r--r--test/CodeGen/Lanai/i32.ll145
-rw-r--r--test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll55
-rw-r--r--test/CodeGen/Lanai/lit.local.cfg3
-rw-r--r--test/CodeGen/Lanai/mem_alu_combiner.ll35
-rw-r--r--test/CodeGen/Lanai/multiply.ll60
-rw-r--r--test/CodeGen/Lanai/rshift64.ll12
-rw-r--r--test/CodeGen/Lanai/select.ll41
-rw-r--r--test/CodeGen/Lanai/set_and_hi.ll15
-rw-r--r--test/CodeGen/Lanai/shift.ll28
-rw-r--r--test/CodeGen/Lanai/stack-frame.ll14
-rw-r--r--test/CodeGen/Lanai/sub-cmp-peephole.ll109
-rw-r--r--test/CodeGen/Lanai/subword.ll29
-rw-r--r--test/CodeGen/MIR/AArch64/cfi-def-cfa.mir2
-rw-r--r--test/CodeGen/MIR/AArch64/expected-target-flag-name.mir2
-rw-r--r--test/CodeGen/MIR/AArch64/generic-virtual-registers-error.mir43
-rw-r--r--test/CodeGen/MIR/AArch64/invalid-target-flag-name.mir2
-rw-r--r--test/CodeGen/MIR/AArch64/machine-dead-copy.mir71
-rw-r--r--test/CodeGen/MIR/AArch64/machine-scheduler.mir35
-rw-r--r--test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir2
-rw-r--r--test/CodeGen/MIR/AArch64/stack-object-local-offset.mir4
-rw-r--r--test/CodeGen/MIR/AArch64/target-flags.mir2
-rw-r--r--test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir2
-rw-r--r--test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir2
-rw-r--r--test/CodeGen/MIR/AMDGPU/target-index-operands.mir2
-rw-r--r--test/CodeGen/MIR/ARM/bundled-instructions.mir2
-rw-r--r--test/CodeGen/MIR/ARM/cfi-same-value.mir2
-rw-r--r--test/CodeGen/MIR/ARM/expected-closing-brace.mir2
-rw-r--r--test/CodeGen/MIR/ARM/extraneous-closing-brace-error.mir2
-rw-r--r--test/CodeGen/MIR/ARM/imm-peephole-arm.mir60
-rw-r--r--test/CodeGen/MIR/ARM/imm-peephole-thumb.mir59
-rw-r--r--test/CodeGen/MIR/ARM/nested-instruction-bundle-error.mir2
-rw-r--r--test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir65
-rw-r--r--test/CodeGen/MIR/Generic/basic-blocks.mir2
-rw-r--r--test/CodeGen/MIR/Generic/expected-colon-after-basic-block.mir2
-rw-r--r--test/CodeGen/MIR/Generic/expected-mbb-reference-for-successor-mbb.mir2
-rw-r--r--test/CodeGen/MIR/Generic/frame-info.mir2
-rw-r--r--test/CodeGen/MIR/Generic/function-missing-machine-function.mir2
-rw-r--r--test/CodeGen/MIR/Generic/invalid-jump-table-kind.mir2
-rw-r--r--test/CodeGen/MIR/Generic/llvm-ir-error-reported.mir2
-rw-r--r--test/CodeGen/MIR/Generic/llvmIR.mir2
-rw-r--r--test/CodeGen/MIR/Generic/llvmIRMissing.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-basic-block-redefinition-error.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-basic-block-undefined-ir-block.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-basic-block-unknown-name.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-function-missing-function.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-function-missing-name.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-function-redefinition-error.mir2
-rw-r--r--test/CodeGen/MIR/Generic/machine-function.mir2
-rw-r--r--test/CodeGen/MIR/Generic/multiRunPass.mir20
-rw-r--r--test/CodeGen/MIR/Generic/register-info.mir2
-rw-r--r--test/CodeGen/MIR/Hexagon/anti-dep-partial.mir35
-rw-r--r--test/CodeGen/MIR/Hexagon/lit.local.cfg2
-rw-r--r--test/CodeGen/MIR/Lanai/lit.local.cfg2
-rw-r--r--test/CodeGen/MIR/Lanai/peephole-compare.mir714
-rw-r--r--test/CodeGen/MIR/Mips/expected-global-value-or-symbol-after-call-entry.mir2
-rw-r--r--test/CodeGen/MIR/Mips/memory-operands.mir14
-rw-r--r--test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir2
-rw-r--r--test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir2
-rw-r--r--test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir2
-rw-r--r--test/CodeGen/MIR/PowerPC/unordered-implicit-registers.mir2
-rw-r--r--test/CodeGen/MIR/X86/basic-block-liveins.mir2
-rw-r--r--test/CodeGen/MIR/X86/basic-block-not-at-start-of-line-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/block-address-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/callee-saved-info.mir2
-rw-r--r--test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir2
-rw-r--r--test/CodeGen/MIR/X86/cfi-def-cfa-register.mir2
-rw-r--r--test/CodeGen/MIR/X86/cfi-offset.mir2
-rw-r--r--test/CodeGen/MIR/X86/constant-pool-item-redefinition-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/constant-pool.mir2
-rw-r--r--test/CodeGen/MIR/X86/constant-value-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/dead-register-flag.mir2
-rw-r--r--test/CodeGen/MIR/X86/def-register-already-tied-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/duplicate-memory-operand-flag.mir2
-rw-r--r--test/CodeGen/MIR/X86/duplicate-register-flag-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/early-clobber-register-flag.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-align-in-memory-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-alignment-after-align-in-memory-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-basic-block-at-start-of-body.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-block-reference-in-blockaddress.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-comma-after-cfi-register.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-comma-after-memory-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-different-implicit-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-from-in-memory-operand.mir24
-rw-r--r--test/CodeGen/MIR/X86/expected-function-reference-after-blockaddress.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-global-value-after-blockaddress.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-integer-after-offset-sign.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-integer-in-successor-weight.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-load-or-store-in-memory-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-machine-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-metadata-node-after-debug-location.mir7
-rw-r--r--test/CodeGen/MIR/X86/expected-metadata-node-after-exclaim.mir7
-rw-r--r--test/CodeGen/MIR/X86/expected-metadata-node-in-stack-object.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-named-register-in-callee-saved-register.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-named-register-in-functions-livein.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-named-register-livein.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-newline-at-end-of-list.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-number-after-bb.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-offset-after-cfi-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-pointer-value-in-memory-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-positive-alignment-after-align.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-register-after-cfi-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-register-after-flags.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-stack-object.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-subregister-after-colon.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-target-flag-name.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-value-in-memory-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/expected-virtual-register-in-functions-livein.mir2
-rw-r--r--test/CodeGen/MIR/X86/external-symbol-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/fixed-stack-memory-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/fixed-stack-object-redefinition-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/fixed-stack-objects.mir2
-rw-r--r--test/CodeGen/MIR/X86/frame-info-save-restore-points.mir2
-rw-r--r--test/CodeGen/MIR/X86/frame-info-stack-references.mir2
-rw-r--r--test/CodeGen/MIR/X86/frame-setup-instruction-flag.mir2
-rw-r--r--test/CodeGen/MIR/X86/function-liveins.mir2
-rw-r--r--test/CodeGen/MIR/X86/generic-instr-type-error.mir15
-rw-r--r--test/CodeGen/MIR/X86/generic-virtual-registers.mir48
-rw-r--r--test/CodeGen/MIR/X86/global-value-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/immediate-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/implicit-register-flag.mir2
-rw-r--r--test/CodeGen/MIR/X86/inline-asm-registers.mir2
-rw-r--r--test/CodeGen/MIR/X86/instructions-debug-location.mir19
-rw-r--r--test/CodeGen/MIR/X86/invalid-constant-pool-item.mir2
-rw-r--r--test/CodeGen/MIR/X86/invalid-metadata-node-type.mir8
-rw-r--r--test/CodeGen/MIR/X86/invalid-target-flag-name.mir2
-rw-r--r--test/CodeGen/MIR/X86/invalid-tied-def-index-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/jump-table-info.mir4
-rw-r--r--test/CodeGen/MIR/X86/jump-table-redefinition-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/killed-register-flag.mir2
-rw-r--r--test/CodeGen/MIR/X86/large-cfi-offset-number-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/large-immediate-operand-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/large-index-number-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/large-offset-number-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/large-size-in-memory-operand-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/liveout-register-mask.mir2
-rw-r--r--test/CodeGen/MIR/X86/machine-basic-block-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/machine-instructions.mir2
-rw-r--r--test/CodeGen/MIR/X86/machine-verifier.mir2
-rw-r--r--test/CodeGen/MIR/X86/memory-operands.mir30
-rw-r--r--test/CodeGen/MIR/X86/metadata-operands.mir9
-rw-r--r--test/CodeGen/MIR/X86/missing-closing-quote.mir2
-rw-r--r--test/CodeGen/MIR/X86/missing-comma.mir2
-rw-r--r--test/CodeGen/MIR/X86/missing-implicit-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/named-registers.mir2
-rw-r--r--test/CodeGen/MIR/X86/newline-handling.mir2
-rw-r--r--test/CodeGen/MIR/X86/null-register-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/register-mask-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/register-operands-target-flag-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/simple-register-allocation-hints.mir2
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir2
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir2
-rw-r--r--test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir2
-rw-r--r--test/CodeGen/MIR/X86/stack-object-debug-info.mir13
-rw-r--r--test/CodeGen/MIR/X86/stack-object-invalid-name.mir2
-rw-r--r--test/CodeGen/MIR/X86/stack-object-operand-name-mismatch-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/stack-object-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/stack-object-redefinition-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/stack-objects.mir2
-rw-r--r--test/CodeGen/MIR/X86/standalone-register-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/subreg-on-physreg.mir12
-rw-r--r--test/CodeGen/MIR/X86/subregister-index-operands.mir32
-rw-r--r--test/CodeGen/MIR/X86/subregister-operands.mir2
-rw-r--r--test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir2
-rw-r--r--test/CodeGen/MIR/X86/successor-basic-blocks.mir2
-rw-r--r--test/CodeGen/MIR/X86/tied-def-operand-invalid.mir2
-rw-r--r--test/CodeGen/MIR/X86/undef-register-flag.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-fixed-stack-object.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-global-value.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-ir-block-in-blockaddress.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-ir-block-slot-in-blockaddress.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-jump-table-id.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-named-global-value.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-register-class.mir4
-rw-r--r--test/CodeGen/MIR/X86/undefined-stack-object.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-value-in-memory-operand.mir2
-rw-r--r--test/CodeGen/MIR/X86/undefined-virtual-register.mir2
-rw-r--r--test/CodeGen/MIR/X86/unknown-instruction.mir2
-rw-r--r--test/CodeGen/MIR/X86/unknown-machine-basic-block.mir2
-rw-r--r--test/CodeGen/MIR/X86/unknown-metadata-keyword.mir2
-rw-r--r--test/CodeGen/MIR/X86/unknown-metadata-node.mir7
-rw-r--r--test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir2
-rw-r--r--test/CodeGen/MIR/X86/unknown-register.mir2
-rw-r--r--test/CodeGen/MIR/X86/unknown-subregister-index-op.mir26
-rw-r--r--test/CodeGen/MIR/X86/unknown-subregister-index.mir2
-rw-r--r--test/CodeGen/MIR/X86/unrecognized-character.mir2
-rw-r--r--test/CodeGen/MIR/X86/used-physical-register-info.mir2
-rw-r--r--test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/variable-sized-stack-objects.mir2
-rw-r--r--test/CodeGen/MIR/X86/virtual-register-redefinition-error.mir2
-rw-r--r--test/CodeGen/MIR/X86/virtual-registers.mir2
-rw-r--r--test/CodeGen/MIR/lit.local.cfg2
-rw-r--r--test/CodeGen/MSP430/spill-to-stack.ll40
-rw-r--r--test/CodeGen/Mips/2010-07-20-Switch.ll6
-rw-r--r--test/CodeGen/Mips/Fast-ISel/callabi.ll12
-rw-r--r--test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll14
-rw-r--r--test/CodeGen/Mips/Fast-ISel/div1.ll4
-rw-r--r--test/CodeGen/Mips/Fast-ISel/fastalloca.ll2
-rw-r--r--test/CodeGen/Mips/Fast-ISel/fpcmpa.ll4
-rw-r--r--test/CodeGen/Mips/Fast-ISel/memtest1.ll4
-rw-r--r--test/CodeGen/Mips/Fast-ISel/rem1.ll4
-rw-r--r--test/CodeGen/Mips/Fast-ISel/shift.ll2
-rw-r--r--test/CodeGen/Mips/abicalls.ll8
-rw-r--r--test/CodeGen/Mips/adjust-callstack-sp.ll12
-rw-r--r--test/CodeGen/Mips/alloca.ll2
-rw-r--r--test/CodeGen/Mips/analyzebranch.ll19
-rw-r--r--test/CodeGen/Mips/assertzext-trunc.ll62
-rw-r--r--test/CodeGen/Mips/atomic.ll251
-rw-r--r--test/CodeGen/Mips/atomicCmpSwapPW.ll17
-rw-r--r--test/CodeGen/Mips/biggot.ll4
-rw-r--r--test/CodeGen/Mips/brdelayslot.ll19
-rw-r--r--test/CodeGen/Mips/brsize3.ll10
-rw-r--r--test/CodeGen/Mips/buildpairextractelementf64.ll12
-rw-r--r--test/CodeGen/Mips/call-optimization.ll4
-rw-r--r--test/CodeGen/Mips/cannot-copy-registers.ll24
-rw-r--r--test/CodeGen/Mips/cconv/arguments-float.ll16
-rw-r--r--test/CodeGen/Mips/cconv/arguments-fp128.ll8
-rw-r--r--test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll16
-rw-r--r--test/CodeGen/Mips/cconv/arguments-hard-float.ll16
-rw-r--r--test/CodeGen/Mips/cconv/arguments-hard-fp128.ll8
-rw-r--r--test/CodeGen/Mips/cconv/arguments-small-structures-bigger-than-32bits.ll8
-rw-r--r--test/CodeGen/Mips/cconv/arguments-struct.ll16
-rw-r--r--test/CodeGen/Mips/cconv/arguments-varargs.ll16
-rw-r--r--test/CodeGen/Mips/cconv/arguments.ll16
-rw-r--r--test/CodeGen/Mips/cconv/callee-saved-float.ll37
-rw-r--r--test/CodeGen/Mips/cconv/callee-saved-fpxx.ll16
-rw-r--r--test/CodeGen/Mips/cconv/callee-saved.ll32
-rw-r--r--test/CodeGen/Mips/cconv/memory-layout.ll54
-rw-r--r--test/CodeGen/Mips/cconv/reserved-space.ll16
-rw-r--r--test/CodeGen/Mips/cconv/return-float.ll16
-rw-r--r--test/CodeGen/Mips/cconv/return-hard-float.ll20
-rw-r--r--test/CodeGen/Mips/cconv/return-hard-fp128.ll8
-rw-r--r--test/CodeGen/Mips/cconv/return-hard-struct-f128.ll8
-rw-r--r--test/CodeGen/Mips/cconv/return-struct.ll19
-rw-r--r--test/CodeGen/Mips/cconv/return.ll16
-rw-r--r--test/CodeGen/Mips/cconv/roundl-call.ll40
-rw-r--r--test/CodeGen/Mips/cconv/stack-alignment.ll16
-rw-r--r--test/CodeGen/Mips/cfi_offset.ll12
-rw-r--r--test/CodeGen/Mips/check-adde-redundant-moves.ll18
-rwxr-xr-xtest/CodeGen/Mips/cmov.ll24
-rw-r--r--test/CodeGen/Mips/compactbranches/beqc-bnec-register-constraint.ll55
-rw-r--r--test/CodeGen/Mips/compactbranches/compact-branch-policy.ll28
-rw-r--r--test/CodeGen/Mips/compactbranches/compact-branches.ll206
-rw-r--r--test/CodeGen/Mips/compactbranches/no-beqzc-bnezc.ll53
-rw-r--r--test/CodeGen/Mips/const-mult.ll4
-rw-r--r--test/CodeGen/Mips/const4a.ll2
-rw-r--r--test/CodeGen/Mips/const6.ll4
-rw-r--r--test/CodeGen/Mips/const6a.ll2
-rw-r--r--test/CodeGen/Mips/countleading.ll21
-rw-r--r--test/CodeGen/Mips/cstmaterialization/stack.ll54
-rw-r--r--test/CodeGen/Mips/divrem.ll68
-rw-r--r--test/CodeGen/Mips/dsp-r1.ll3
-rw-r--r--test/CodeGen/Mips/dynamic-stack-realignment.ll36
-rw-r--r--test/CodeGen/Mips/eh-dwarf-cfa.ll14
-rw-r--r--test/CodeGen/Mips/eh-return32.ll6
-rw-r--r--test/CodeGen/Mips/eh-return64.ll8
-rw-r--r--test/CodeGen/Mips/eh.ll2
-rw-r--r--test/CodeGen/Mips/ehframe-indirect.ll18
-rw-r--r--test/CodeGen/Mips/elf_eflags.ll12
-rw-r--r--test/CodeGen/Mips/emergency-spill-slot-near-fp.ll2
-rw-r--r--test/CodeGen/Mips/emutls_generic.ll6
-rw-r--r--test/CodeGen/Mips/fastcc.ll8
-rw-r--r--test/CodeGen/Mips/fcmp.ll403
-rw-r--r--test/CodeGen/Mips/fcopysign-f32-f64.ll38
-rw-r--r--test/CodeGen/Mips/fcopysign.ll2
-rw-r--r--test/CodeGen/Mips/fmadd1.ll24
-rw-r--r--test/CodeGen/Mips/fp-indexed-ls.ll14
-rw-r--r--test/CodeGen/Mips/fp-spill-reload.ll2
-rw-r--r--test/CodeGen/Mips/fp16-promote.ll2
-rw-r--r--test/CodeGen/Mips/fp64a.ll12
-rw-r--r--test/CodeGen/Mips/fpbr.ll30
-rw-r--r--test/CodeGen/Mips/fpxx.ll20
-rw-r--r--test/CodeGen/Mips/gpreg-lazy-binding.ll2
-rw-r--r--test/CodeGen/Mips/hf16call32.ll8
-rw-r--r--test/CodeGen/Mips/hf16call32_body.ll36
-rw-r--r--test/CodeGen/Mips/hf1_body.ll27
-rw-r--r--test/CodeGen/Mips/i64arg.ll2
-rw-r--r--test/CodeGen/Mips/inlineasm-constraint_ZC_2.ll36
-rw-r--r--test/CodeGen/Mips/inlineasm-operand-code.ll57
-rw-r--r--test/CodeGen/Mips/inlineasm_constraint.ll4
-rw-r--r--test/CodeGen/Mips/inlineasm_constraint_R.ll2
-rw-r--r--test/CodeGen/Mips/inlineasm_constraint_ZC.ll6
-rw-r--r--test/CodeGen/Mips/inlineasm_constraint_m.ll2
-rw-r--r--test/CodeGen/Mips/inlineasmmemop.ll2
-rw-r--r--test/CodeGen/Mips/internalfunc.ll2
-rw-r--r--test/CodeGen/Mips/interrupt-attr-64-error.ll2
-rw-r--r--test/CodeGen/Mips/largeimm1.ll12
-rw-r--r--test/CodeGen/Mips/largeimmprinting.ll34
-rw-r--r--test/CodeGen/Mips/lazy-binding.ll2
-rw-r--r--test/CodeGen/Mips/lcb5.ll2
-rw-r--r--test/CodeGen/Mips/llvm-ir/add.ll359
-rw-r--r--test/CodeGen/Mips/llvm-ir/and.ll633
-rw-r--r--test/CodeGen/Mips/llvm-ir/ashr.ll105
-rw-r--r--test/CodeGen/Mips/llvm-ir/call.ll67
-rw-r--r--test/CodeGen/Mips/llvm-ir/indirectbr.ll27
-rw-r--r--test/CodeGen/Mips/llvm-ir/lh_lhu.ll32
-rw-r--r--test/CodeGen/Mips/llvm-ir/load-atomic.ll4
-rw-r--r--test/CodeGen/Mips/llvm-ir/lshr.ll102
-rw-r--r--test/CodeGen/Mips/llvm-ir/mul.ll110
-rw-r--r--test/CodeGen/Mips/llvm-ir/not.ll239
-rw-r--r--test/CodeGen/Mips/llvm-ir/or.ll648
-rw-r--r--test/CodeGen/Mips/llvm-ir/ret.ll32
-rw-r--r--test/CodeGen/Mips/llvm-ir/sdiv.ll115
-rw-r--r--test/CodeGen/Mips/llvm-ir/select-dbl.ll358
-rw-r--r--test/CodeGen/Mips/llvm-ir/select-flt.ll335
-rw-r--r--test/CodeGen/Mips/llvm-ir/select-int.ll270
-rw-r--r--test/CodeGen/Mips/llvm-ir/select.ll712
-rw-r--r--test/CodeGen/Mips/llvm-ir/shl.ll102
-rw-r--r--test/CodeGen/Mips/llvm-ir/srem.ll112
-rw-r--r--test/CodeGen/Mips/llvm-ir/store-atomic.ll4
-rw-r--r--test/CodeGen/Mips/llvm-ir/sub.ll108
-rw-r--r--test/CodeGen/Mips/llvm-ir/udiv.ll97
-rw-r--r--test/CodeGen/Mips/llvm-ir/urem.ll126
-rw-r--r--test/CodeGen/Mips/llvm-ir/xor.ll164
-rw-r--r--test/CodeGen/Mips/load-store-left-right.ll28
-rw-r--r--test/CodeGen/Mips/longbranch.ll27
-rw-r--r--test/CodeGen/Mips/lw16-base-reg.ll26
-rw-r--r--test/CodeGen/Mips/madd-msub.ll12
-rw-r--r--test/CodeGen/Mips/micromips-addiu.ll2
-rw-r--r--test/CodeGen/Mips/micromips-atomic1.ll3
-rw-r--r--test/CodeGen/Mips/micromips-delay-slot.ll4
-rw-r--r--test/CodeGen/Mips/micromips-lwc1-swc1.ll50
-rw-r--r--test/CodeGen/Mips/micromips-or16.ll29
-rw-r--r--test/CodeGen/Mips/micromips-shift.ll2
-rw-r--r--test/CodeGen/Mips/micromips-zero-mat-uses.ll8
-rw-r--r--test/CodeGen/Mips/mips-shf-gprel.s27
-rw-r--r--test/CodeGen/Mips/mips16fpe.ll3
-rw-r--r--test/CodeGen/Mips/mips64-f128.ll59
-rw-r--r--test/CodeGen/Mips/mips64extins.ll12
-rw-r--r--test/CodeGen/Mips/mips64fpldst.ll36
-rw-r--r--test/CodeGen/Mips/mips64instrs.ll8
-rw-r--r--test/CodeGen/Mips/mips64intldst.ll8
-rw-r--r--test/CodeGen/Mips/mips64muldiv.ll8
-rw-r--r--test/CodeGen/Mips/mips64r6/compatibility.ll4
-rw-r--r--test/CodeGen/Mips/mips64shift.ll29
-rw-r--r--test/CodeGen/Mips/mno-ldc1-sdc1.ll133
-rw-r--r--test/CodeGen/Mips/msa/2r.ll4
-rw-r--r--test/CodeGen/Mips/msa/2r_vector_scalar.ll18
-rw-r--r--test/CodeGen/Mips/msa/2rf.ll4
-rw-r--r--test/CodeGen/Mips/msa/2rf_float_int.ll4
-rw-r--r--test/CodeGen/Mips/msa/2rf_int_float.ll4
-rw-r--r--test/CodeGen/Mips/msa/3r-a.ll4
-rw-r--r--test/CodeGen/Mips/msa/3r-b.ll4
-rw-r--r--test/CodeGen/Mips/msa/3r-s.ll4
-rw-r--r--test/CodeGen/Mips/msa/3r_splat.ll4
-rw-r--r--test/CodeGen/Mips/msa/basic_operations.ll24
-rw-r--r--test/CodeGen/Mips/msa/basic_operations_float.ll12
-rw-r--r--test/CodeGen/Mips/msa/elm_copy.ll16
-rw-r--r--test/CodeGen/Mips/msa/elm_cxcmsa.ll4
-rw-r--r--test/CodeGen/Mips/msa/elm_insv.ll16
-rw-r--r--test/CodeGen/Mips/msa/frameindex.ll4
-rw-r--r--test/CodeGen/Mips/msa/i5-b.ll4
-rw-r--r--test/CodeGen/Mips/msa/i8.ll4
-rw-r--r--test/CodeGen/Mips/msa/vec.ll4
-rw-r--r--test/CodeGen/Mips/nacl-align.ll14
-rw-r--r--test/CodeGen/Mips/no-odd-spreg-msa.ll8
-rw-r--r--test/CodeGen/Mips/no-odd-spreg.ll10
-rw-r--r--test/CodeGen/Mips/o32_cc.ll7
-rw-r--r--test/CodeGen/Mips/o32_cc_byval.ll2
-rw-r--r--test/CodeGen/Mips/octeon.ll4
-rw-r--r--test/CodeGen/Mips/octeon_popcnt.ll2
-rw-r--r--test/CodeGen/Mips/optimize-pic-o0.ll2
-rw-r--r--test/CodeGen/Mips/prevent-hoisting.ll8
-rw-r--r--test/CodeGen/Mips/private-addr.ll14
-rw-r--r--test/CodeGen/Mips/private.ll2
-rw-r--r--test/CodeGen/Mips/return-vector.ll2
-rw-r--r--test/CodeGen/Mips/rotate.ll10
-rw-r--r--test/CodeGen/Mips/select.ll12
-rw-r--r--test/CodeGen/Mips/selectcc.ll8
-rw-r--r--test/CodeGen/Mips/selectiondag-optlevel.ll22
-rw-r--r--test/CodeGen/Mips/stackcoloring.ll2
-rw-r--r--test/CodeGen/Mips/start-asm-file.ll24
-rw-r--r--test/CodeGen/Mips/stchar.ll50
-rw-r--r--test/CodeGen/Mips/stldst.ll8
-rw-r--r--test/CodeGen/Mips/tailcall.ll15
-rw-r--r--test/CodeGen/Mips/thread-pointer.ll12
-rw-r--r--test/CodeGen/Mips/tls-models.ll2
-rw-r--r--test/CodeGen/Mips/tls.ll8
-rw-r--r--test/CodeGen/Mips/unalignedload.ll12
-rw-r--r--test/CodeGen/Mips/zeroreg.ll14
-rw-r--r--test/CodeGen/NVPTX/MachineSink-call.ll23
-rw-r--r--test/CodeGen/NVPTX/MachineSink-convergent.ll23
-rw-r--r--test/CodeGen/NVPTX/TailDuplication-convergent.ll45
-rw-r--r--test/CodeGen/NVPTX/access-non-generic.ll91
-rw-r--r--test/CodeGen/NVPTX/alias.ll7
-rw-r--r--test/CodeGen/NVPTX/arithmetic-int.ll24
-rw-r--r--test/CodeGen/NVPTX/bug22322.ll12
-rw-r--r--test/CodeGen/NVPTX/bug26185-2.ll34
-rw-r--r--test/CodeGen/NVPTX/bug26185.ll57
-rw-r--r--test/CodeGen/NVPTX/convergent-mir-call.ll27
-rw-r--r--test/CodeGen/NVPTX/debug-file-loc.ll43
-rw-r--r--test/CodeGen/NVPTX/disable-opt.ll12
-rw-r--r--test/CodeGen/NVPTX/global-ctor-empty.ll5
-rw-r--r--test/CodeGen/NVPTX/global-ctor.ll9
-rw-r--r--test/CodeGen/NVPTX/global-dtor.ll9
-rw-r--r--test/CodeGen/NVPTX/global-visibility.ll16
-rw-r--r--test/CodeGen/NVPTX/intrinsic-old.ll211
-rw-r--r--test/CodeGen/NVPTX/noduplicate-syncthreads.ll8
-rw-r--r--test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll13
-rw-r--r--test/CodeGen/NVPTX/shfl.ll90
-rw-r--r--test/CodeGen/NVPTX/sm-version-60.ll5
-rw-r--r--test/CodeGen/NVPTX/sm-version-61.ll5
-rw-r--r--test/CodeGen/NVPTX/sm-version-62.ll5
-rw-r--r--test/CodeGen/NVPTX/speculative-execution-divergent-target.ll24
-rw-r--r--test/CodeGen/NVPTX/zeroext-32bit.ll26
-rw-r--r--test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll21
-rw-r--r--test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll2
-rw-r--r--test/CodeGen/PowerPC/2008-07-15-Bswap.ll5
-rw-r--r--test/CodeGen/PowerPC/2010-02-04-EmptyGlobal.ll2
-rw-r--r--test/CodeGen/PowerPC/2012-11-16-mischedcall.ll8
-rw-r--r--test/CodeGen/PowerPC/2016-04-16-ADD8TLS.ll43
-rw-r--r--test/CodeGen/PowerPC/2016-04-17-combine.ll26
-rw-r--r--test/CodeGen/PowerPC/2016-04-28-setjmp.ll48
-rw-r--r--test/CodeGen/PowerPC/BreakableToken-reduced.ll2
-rw-r--r--test/CodeGen/PowerPC/aantidep-def-ec.mir1
-rw-r--r--test/CodeGen/PowerPC/addisdtprelha-nonr3.mir1
-rw-r--r--test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll24
-rw-r--r--test/CodeGen/PowerPC/align.ll20
-rw-r--r--test/CodeGen/PowerPC/andc.ll36
-rw-r--r--test/CodeGen/PowerPC/asm-constraints.ll62
-rw-r--r--test/CodeGen/PowerPC/asm-printer-topological-order.ll15
-rw-r--r--test/CodeGen/PowerPC/atomics-fences.ll4
-rw-r--r--test/CodeGen/PowerPC/available-externally.ll81
-rw-r--r--test/CodeGen/PowerPC/bdzlr.ll3
-rw-r--r--test/CodeGen/PowerPC/builtins-ppc-p8vector.ll52
-rw-r--r--test/CodeGen/PowerPC/cannonicalize-vector-shifts.ll27
-rw-r--r--test/CodeGen/PowerPC/code-align.ll60
-rw-r--r--test/CodeGen/PowerPC/combine-to-pre-index-store-crash.ll25
-rw-r--r--test/CodeGen/PowerPC/crsave.ll20
-rw-r--r--test/CodeGen/PowerPC/crypto_bifs.ll1
-rw-r--r--test/CodeGen/PowerPC/ctr-minmaxnum.ll231
-rw-r--r--test/CodeGen/PowerPC/ctrloop-udivti3.ll5
-rw-r--r--test/CodeGen/PowerPC/ctrloops-softfloat.ll129
-rw-r--r--test/CodeGen/PowerPC/ctrloops.ll15
-rw-r--r--test/CodeGen/PowerPC/cxx_tlscc64.ll43
-rw-r--r--test/CodeGen/PowerPC/dbg.ll5
-rw-r--r--test/CodeGen/PowerPC/direct-move-profit.ll83
-rw-r--r--test/CodeGen/PowerPC/ec-input.ll24
-rw-r--r--test/CodeGen/PowerPC/ext-bool-trunc-repl.ll38
-rw-r--r--test/CodeGen/PowerPC/fabs.ll31
-rw-r--r--test/CodeGen/PowerPC/fast-isel-fcmp-nan.ll187
-rw-r--r--test/CodeGen/PowerPC/fast-isel-fpconv.ll33
-rw-r--r--test/CodeGen/PowerPC/fast-isel-i64offset.ll12
-rw-r--r--test/CodeGen/PowerPC/fdiv-combine.ll10
-rw-r--r--test/CodeGen/PowerPC/fma-assoc.ll4
-rw-r--r--test/CodeGen/PowerPC/fma-ext.ll4
-rw-r--r--test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll36
-rw-r--r--test/CodeGen/PowerPC/fma-mutate.ll2
-rw-r--r--test/CodeGen/PowerPC/fma.ll8
-rw-r--r--test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll4
-rw-r--r--test/CodeGen/PowerPC/hello-reloc.s2
-rw-r--r--test/CodeGen/PowerPC/hidden-vis-2.ll21
-rw-r--r--test/CodeGen/PowerPC/indirect-hidden.ll21
-rw-r--r--test/CodeGen/PowerPC/inline-asm-scalar-to-vector-error.ll14
-rw-r--r--test/CodeGen/PowerPC/lbzux.ll5
-rw-r--r--test/CodeGen/PowerPC/load-two-flts.ll60
-rw-r--r--test/CodeGen/PowerPC/load-v4i8-improved.ll23
-rw-r--r--test/CodeGen/PowerPC/lsr-postinc-pos.ll12
-rw-r--r--test/CodeGen/PowerPC/machine-combiner.ll4
-rw-r--r--test/CodeGen/PowerPC/multi-return.ll21
-rw-r--r--test/CodeGen/PowerPC/no-rlwimi-trivial-commute.mir2
-rw-r--r--test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll23
-rw-r--r--test/CodeGen/PowerPC/opt-sub-inst-cr0-live.mir143
-rw-r--r--test/CodeGen/PowerPC/optcmp.ll8
-rw-r--r--test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll12
-rw-r--r--test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll3
-rw-r--r--test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll970
-rw-r--r--test/CodeGen/PowerPC/pie.ll16
-rw-r--r--test/CodeGen/PowerPC/popcnt.ll24
-rw-r--r--test/CodeGen/PowerPC/ppc-shrink-wrapping.ll7
-rw-r--r--test/CodeGen/PowerPC/ppc32-align-long-double-sf.ll21
-rw-r--r--test/CodeGen/PowerPC/ppc32-constant-BE-ppcf128.ll24
-rw-r--r--test/CodeGen/PowerPC/ppc32-i1-vaarg.ll4
-rw-r--r--test/CodeGen/PowerPC/ppc64-align-long-double.ll16
-rw-r--r--test/CodeGen/PowerPC/ppc64-byval-align.ll3
-rw-r--r--test/CodeGen/PowerPC/ppc64-calls.ll6
-rw-r--r--test/CodeGen/PowerPC/ppc64-fastcc.ll6
-rw-r--r--test/CodeGen/PowerPC/ppc64-linux-func-size.ll2
-rw-r--r--test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll46
-rw-r--r--test/CodeGen/PowerPC/ppc64-sibcall.ll191
-rw-r--r--test/CodeGen/PowerPC/ppc64-toc.ll2
-rw-r--r--test/CodeGen/PowerPC/ppcf128-endian.ll8
-rw-r--r--test/CodeGen/PowerPC/ppcf128sf.ll179
-rw-r--r--test/CodeGen/PowerPC/ppcsoftops.ll2
-rw-r--r--test/CodeGen/PowerPC/pr17168.ll59
-rw-r--r--test/CodeGen/PowerPC/pr24546.ll7
-rw-r--r--test/CodeGen/PowerPC/pr25802.ll52
-rw-r--r--test/CodeGen/PowerPC/pr26180.ll14
-rw-r--r--test/CodeGen/PowerPC/pr26378.ll6
-rw-r--r--test/CodeGen/PowerPC/pr26617.ll15
-rw-r--r--test/CodeGen/PowerPC/pr26690.ll8
-rw-r--r--test/CodeGen/PowerPC/pr27078.ll15
-rw-r--r--test/CodeGen/PowerPC/pr27350.ll26
-rw-r--r--test/CodeGen/PowerPC/pr28130.ll70
-rw-r--r--test/CodeGen/PowerPC/preincprep-invoke.ll8
-rw-r--r--test/CodeGen/PowerPC/qpx-bv-sint.ll6
-rw-r--r--test/CodeGen/PowerPC/qpx-load-splat.ll75
-rw-r--r--test/CodeGen/PowerPC/qpx-s-sel.ll5
-rw-r--r--test/CodeGen/PowerPC/qpx-sel.ll5
-rw-r--r--test/CodeGen/PowerPC/qpx-split-vsetcc.ll21
-rw-r--r--test/CodeGen/PowerPC/remove-redundant-moves.ll107
-rw-r--r--test/CodeGen/PowerPC/rlwinm-zero-ext.ll57
-rw-r--r--test/CodeGen/PowerPC/stack-protector.ll17
-rw-r--r--test/CodeGen/PowerPC/stackmap-frame-setup.ll4
-rw-r--r--test/CodeGen/PowerPC/stubs.ll17
-rw-r--r--test/CodeGen/PowerPC/stwux.ll5
-rw-r--r--test/CodeGen/PowerPC/subreg-postra-2.ll147
-rw-r--r--test/CodeGen/PowerPC/subreg-postra.ll31
-rw-r--r--test/CodeGen/PowerPC/subsumes-pred-regs.ll65
-rw-r--r--test/CodeGen/PowerPC/svr4-redzone.ll18
-rw-r--r--test/CodeGen/PowerPC/swaps-le-2.ll2
-rw-r--r--test/CodeGen/PowerPC/swaps-le-7.ll55
-rw-r--r--test/CodeGen/PowerPC/tailcall-string-rvo.ll47
-rw-r--r--test/CodeGen/PowerPC/thread-pointer.ll17
-rw-r--r--test/CodeGen/PowerPC/tls_get_addr_stackframe.ll21
-rw-r--r--test/CodeGen/PowerPC/unal-altivec.ll18
-rw-r--r--test/CodeGen/PowerPC/unal4-std.ll13
-rw-r--r--test/CodeGen/PowerPC/unwind-dw2-g.ll5
-rw-r--r--test/CodeGen/PowerPC/vec_abs.ll80
-rw-r--r--test/CodeGen/PowerPC/vec_cmp.ll6
-rw-r--r--test/CodeGen/PowerPC/vec_fneg.ll33
-rw-r--r--test/CodeGen/PowerPC/vrsave-spill.ll4
-rw-r--r--test/CodeGen/PowerPC/vsx-fma-m.ll1
-rw-r--r--test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll6
-rw-r--r--test/CodeGen/PowerPC/vsx-fma-sp.ll2
-rw-r--r--test/CodeGen/PowerPC/vsx-infl-copy1.ll19
-rw-r--r--test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll202
-rw-r--r--test/CodeGen/PowerPC/vsx-word-splats.ll147
-rw-r--r--test/CodeGen/PowerPC/vsx.ll94
-rw-r--r--test/CodeGen/PowerPC/weak_def_can_be_hidden.ll8
-rw-r--r--test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll16
-rwxr-xr-xtest/CodeGen/SPARC/2011-01-11-CC.ll4
-rw-r--r--test/CodeGen/SPARC/32abi.ll140
-rw-r--r--test/CodeGen/SPARC/64abi.ll185
-rw-r--r--test/CodeGen/SPARC/LeonFixCALLPassUT.ll20
-rwxr-xr-xtest/CodeGen/SPARC/LeonFixFSMULDPassUT.ll31
-rw-r--r--test/CodeGen/SPARC/LeonInsertNOPLoad.ll13
-rwxr-xr-xtest/CodeGen/SPARC/LeonInsertNOPLoadPassUT.ll43
-rw-r--r--test/CodeGen/SPARC/LeonInsertNOPsDoublePrecision.ll17
-rw-r--r--test/CodeGen/SPARC/LeonItinerariesUT.ll50
-rw-r--r--test/CodeGen/SPARC/LeonPreventRoundChangePassUT.ll65
-rwxr-xr-xtest/CodeGen/SPARC/LeonReplaceFMULSPassUT.ll19
-rw-r--r--test/CodeGen/SPARC/LeonReplaceSDIVPassUT.ll9
-rwxr-xr-xtest/CodeGen/SPARC/LeonSMACUMACInstructionUT.ll20
-rw-r--r--test/CodeGen/SPARC/atomics.ll170
-rw-r--r--test/CodeGen/SPARC/float.ll9
-rw-r--r--test/CodeGen/SPARC/fp128.ll117
-rw-r--r--test/CodeGen/SPARC/func-addr.ll51
-rw-r--r--test/CodeGen/SPARC/inlineasm.ll12
-rw-r--r--test/CodeGen/SPARC/missinglabel.ll4
-rwxr-xr-xtest/CodeGen/SPARC/sjlj.ll88
-rw-r--r--test/CodeGen/SPARC/soft-float.ll235
-rw-r--r--test/CodeGen/SPARC/stack-protector.ll33
-rw-r--r--test/CodeGen/SPARC/thread-pointer.ll11
-rw-r--r--test/CodeGen/SPARC/vector-call.ll33
-rw-r--r--test/CodeGen/SPARC/zerostructcall.ll51
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-01.py3
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-03.py3
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-04.py3
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-05.py3
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-06.py3
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-09.py3
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-10.py3
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-11.py3
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-12.py3
-rw-r--r--test/CodeGen/SystemZ/alloca-01.ll2
-rw-r--r--test/CodeGen/SystemZ/and-xor-01.ll14
-rw-r--r--test/CodeGen/SystemZ/args-09.ll53
-rw-r--r--test/CodeGen/SystemZ/args-10.ll50
-rw-r--r--test/CodeGen/SystemZ/asm-02.ll37
-rw-r--r--test/CodeGen/SystemZ/asm-03.ll40
-rw-r--r--test/CodeGen/SystemZ/asm-04.ll63
-rw-r--r--test/CodeGen/SystemZ/asm-05.ll3
-rw-r--r--test/CodeGen/SystemZ/atomic-fence-01.ll16
-rw-r--r--test/CodeGen/SystemZ/atomic-fence-02.ll13
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-add-01.ll24
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-add-02.ll30
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-and-01.ll33
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-and-02.ll34
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-minmax-01.ll66
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-minmax-02.ll66
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-minmax-03.ll36
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-minmax-04.ll30
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-nand-01.ll34
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-nand-02.ll34
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-or-01.ll34
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-or-02.ll34
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-sub-01.ll34
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-sub-02.ll34
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-xchg-01.ll18
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-xchg-02.ll18
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-xor-01.ll34
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-xor-02.ll34
-rw-r--r--test/CodeGen/SystemZ/backchain.ll84
-rw-r--r--test/CodeGen/SystemZ/branch-05.ll2
-rw-r--r--test/CodeGen/SystemZ/bswap-06.ll99
-rw-r--r--test/CodeGen/SystemZ/bswap-07.ll100
-rw-r--r--test/CodeGen/SystemZ/builtins.ll14
-rw-r--r--test/CodeGen/SystemZ/call-04.ll369
-rw-r--r--test/CodeGen/SystemZ/call-05.ll467
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-01.ll12
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-02.ll14
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-05.ll81
-rw-r--r--test/CodeGen/SystemZ/cond-li.ll23
-rw-r--r--test/CodeGen/SystemZ/cond-store-01.ll51
-rw-r--r--test/CodeGen/SystemZ/cond-store-02.ll51
-rw-r--r--test/CodeGen/SystemZ/cond-store-03.ll39
-rw-r--r--test/CodeGen/SystemZ/cond-store-04.ll21
-rw-r--r--test/CodeGen/SystemZ/cond-store-05.ll27
-rw-r--r--test/CodeGen/SystemZ/cond-store-06.ll27
-rw-r--r--test/CodeGen/SystemZ/dyn-alloca-offset.ll42
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-01.ll38
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-02.ll16
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-03.ll4
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-04.ll38
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-05.ll12
-rw-r--r--test/CodeGen/SystemZ/fp-copysign-01.ll18
-rw-r--r--test/CodeGen/SystemZ/fp-move-01.ll1
-rw-r--r--test/CodeGen/SystemZ/fp-move-09.ll5
-rw-r--r--test/CodeGen/SystemZ/fp-move-10.ll4
-rw-r--r--test/CodeGen/SystemZ/fp-move-12.ll33
-rw-r--r--test/CodeGen/SystemZ/fp-sqrt-01.ll4
-rw-r--r--test/CodeGen/SystemZ/fp-sqrt-02.ll4
-rw-r--r--test/CodeGen/SystemZ/frameaddr-01.ll28
-rw-r--r--test/CodeGen/SystemZ/htm-intrinsics.ll8
-rw-r--r--test/CodeGen/SystemZ/int-cmp-01.ll2
-rw-r--r--test/CodeGen/SystemZ/int-cmp-02.ll24
-rw-r--r--test/CodeGen/SystemZ/int-cmp-03.ll24
-rw-r--r--test/CodeGen/SystemZ/int-cmp-04.ll2
-rw-r--r--test/CodeGen/SystemZ/int-cmp-05.ll28
-rw-r--r--test/CodeGen/SystemZ/int-cmp-06.ll36
-rw-r--r--test/CodeGen/SystemZ/int-cmp-07.ll18
-rw-r--r--test/CodeGen/SystemZ/int-cmp-08.ll18
-rw-r--r--test/CodeGen/SystemZ/int-cmp-09.ll57
-rw-r--r--test/CodeGen/SystemZ/int-cmp-10.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-11.ll45
-rw-r--r--test/CodeGen/SystemZ/int-cmp-12.ll18
-rw-r--r--test/CodeGen/SystemZ/int-cmp-13.ll48
-rw-r--r--test/CodeGen/SystemZ/int-cmp-14.ll48
-rw-r--r--test/CodeGen/SystemZ/int-cmp-15.ll20
-rw-r--r--test/CodeGen/SystemZ/int-cmp-16.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-17.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-18.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-19.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-20.ll20
-rw-r--r--test/CodeGen/SystemZ/int-cmp-21.ll20
-rw-r--r--test/CodeGen/SystemZ/int-cmp-22.ll18
-rw-r--r--test/CodeGen/SystemZ/int-cmp-23.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-24.ll8
-rw-r--r--test/CodeGen/SystemZ/int-cmp-25.ll8
-rw-r--r--test/CodeGen/SystemZ/int-cmp-26.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-27.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-28.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-29.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-30.ll20
-rw-r--r--test/CodeGen/SystemZ/int-cmp-31.ll20
-rw-r--r--test/CodeGen/SystemZ/int-cmp-32.ll28
-rw-r--r--test/CodeGen/SystemZ/int-cmp-33.ll16
-rw-r--r--test/CodeGen/SystemZ/int-cmp-34.ll28
-rw-r--r--test/CodeGen/SystemZ/int-cmp-35.ll16
-rw-r--r--test/CodeGen/SystemZ/int-cmp-36.ll10
-rw-r--r--test/CodeGen/SystemZ/int-cmp-37.ll18
-rw-r--r--test/CodeGen/SystemZ/int-cmp-38.ll14
-rw-r--r--test/CodeGen/SystemZ/int-cmp-39.ll10
-rw-r--r--test/CodeGen/SystemZ/int-cmp-40.ll18
-rw-r--r--test/CodeGen/SystemZ/int-cmp-41.ll10
-rw-r--r--test/CodeGen/SystemZ/int-cmp-42.ll10
-rw-r--r--test/CodeGen/SystemZ/int-cmp-43.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-44.ll82
-rw-r--r--test/CodeGen/SystemZ/int-cmp-46.ll48
-rw-r--r--test/CodeGen/SystemZ/int-cmp-47.ll30
-rw-r--r--test/CodeGen/SystemZ/int-cmp-48.ll4
-rw-r--r--test/CodeGen/SystemZ/memchr-01.ll2
-rw-r--r--test/CodeGen/SystemZ/memchr-02.ll2
-rw-r--r--test/CodeGen/SystemZ/memchr-nobuiltin.ll16
-rw-r--r--test/CodeGen/SystemZ/memcmp-01.ll12
-rw-r--r--test/CodeGen/SystemZ/memcmp-02.ll10
-rw-r--r--test/CodeGen/SystemZ/memcmp-nobuiltin.ll191
-rw-r--r--test/CodeGen/SystemZ/pie.ll13
-rw-r--r--test/CodeGen/SystemZ/ret-addr-01.ll15
-rw-r--r--test/CodeGen/SystemZ/risbg-01.ll21
-rw-r--r--test/CodeGen/SystemZ/risbg-02.ll25
-rw-r--r--test/CodeGen/SystemZ/rot-01.ll35
-rw-r--r--test/CodeGen/SystemZ/rot-02.ll86
-rw-r--r--test/CodeGen/SystemZ/shift-11.ll63
-rw-r--r--test/CodeGen/SystemZ/shift-12.ll106
-rw-r--r--test/CodeGen/SystemZ/stack-guard.ll35
-rw-r--r--test/CodeGen/SystemZ/strcmp-01.ll4
-rw-r--r--test/CodeGen/SystemZ/strcmp-02.ll4
-rw-r--r--test/CodeGen/SystemZ/strcmp-nobuiltin.ll54
-rw-r--r--test/CodeGen/SystemZ/strcpy-nobuiltin.ll42
-rw-r--r--test/CodeGen/SystemZ/strlen-nobuiltin.ll25
-rw-r--r--test/CodeGen/SystemZ/swift-return.ll203
-rw-r--r--test/CodeGen/SystemZ/swifterror.ll358
-rw-r--r--test/CodeGen/SystemZ/swiftself.ll66
-rw-r--r--test/CodeGen/SystemZ/tdc-01.ll95
-rw-r--r--test/CodeGen/SystemZ/tdc-02.ll96
-rw-r--r--test/CodeGen/SystemZ/tdc-03.ll139
-rw-r--r--test/CodeGen/SystemZ/tdc-04.ll85
-rw-r--r--test/CodeGen/SystemZ/tdc-05.ll97
-rw-r--r--test/CodeGen/SystemZ/tdc-06.ll48
-rw-r--r--test/CodeGen/SystemZ/trap-01.ll179
-rw-r--r--test/CodeGen/SystemZ/vec-extract-02.ll2
-rw-r--r--test/CodeGen/SystemZ/vec-intrinsics.ll44
-rw-r--r--test/CodeGen/SystemZ/vec-sub-01.ll2
-rw-r--r--test/CodeGen/Thumb/2010-07-01-FuncAlign.ll2
-rw-r--r--test/CodeGen/Thumb/2010-07-15-debugOrdering.ll18
-rw-r--r--test/CodeGen/Thumb/and_neg.ll20
-rw-r--r--test/CodeGen/Thumb/barrier.ll2
-rw-r--r--test/CodeGen/Thumb/bic_imm.ll26
-rw-r--r--test/CodeGen/Thumb/constants.ll19
-rw-r--r--test/CodeGen/Thumb/ldm-merge-struct.ll2
-rw-r--r--test/CodeGen/Thumb/ldm-stm-postinc.ll81
-rw-r--r--test/CodeGen/Thumb/segmented-stacks.ll2
-rw-r--r--test/CodeGen/Thumb2/2009-09-01-PostRAProlog.ll2
-rw-r--r--test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll1
-rw-r--r--test/CodeGen/Thumb2/2010-02-11-phi-cycle.ll4
-rw-r--r--test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll2
-rw-r--r--test/CodeGen/Thumb2/aligned-constants.ll4
-rw-r--r--test/CodeGen/Thumb2/bicbfi.ll17
-rw-r--r--test/CodeGen/Thumb2/carry.ll8
-rw-r--r--test/CodeGen/Thumb2/emit-unwinding.ll2
-rw-r--r--test/CodeGen/Thumb2/ldr-str-imm12.ll20
-rw-r--r--test/CodeGen/Thumb2/thumb2-call.ll18
-rw-r--r--test/CodeGen/Thumb2/thumb2-cbnz.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-cpsr-liveness.ll41
-rw-r--r--test/CodeGen/Thumb2/thumb2-ldm.ll55
-rw-r--r--test/CodeGen/Thumb2/thumb2-tbb.ll2
-rw-r--r--test/CodeGen/Thumb2/tls2.ll4
-rw-r--r--test/CodeGen/Thumb2/v8_IT_5.ll25
-rw-r--r--test/CodeGen/WebAssembly/address-offsets.ll672
-rw-r--r--test/CodeGen/WebAssembly/byval.ll131
-rw-r--r--test/CodeGen/WebAssembly/call.ll5
-rw-r--r--test/CodeGen/WebAssembly/cfg-stackify.ll571
-rw-r--r--test/CodeGen/WebAssembly/comparisons_f32.ll2
-rw-r--r--test/CodeGen/WebAssembly/comparisons_f64.ll2
-rw-r--r--test/CodeGen/WebAssembly/comparisons_i32.ll3
-rw-r--r--test/CodeGen/WebAssembly/comparisons_i64.ll3
-rw-r--r--test/CodeGen/WebAssembly/conv.ll2
-rw-r--r--test/CodeGen/WebAssembly/cpus.ll4
-rw-r--r--test/CodeGen/WebAssembly/dead-vreg.ll2
-rw-r--r--test/CodeGen/WebAssembly/divrem-constant.ll62
-rw-r--r--test/CodeGen/WebAssembly/f32.ll2
-rw-r--r--test/CodeGen/WebAssembly/f64.ll2
-rw-r--r--test/CodeGen/WebAssembly/fast-isel.ll28
-rw-r--r--test/CodeGen/WebAssembly/frem.ll2
-rw-r--r--test/CodeGen/WebAssembly/func.ll5
-rw-r--r--test/CodeGen/WebAssembly/global.ll48
-rw-r--r--test/CodeGen/WebAssembly/i128.ll280
-rw-r--r--test/CodeGen/WebAssembly/i32-load-store-alignment.ll212
-rw-r--r--test/CodeGen/WebAssembly/i32.ll67
-rw-r--r--test/CodeGen/WebAssembly/i64-load-store-alignment.ll325
-rw-r--r--test/CodeGen/WebAssembly/i64.ll67
-rw-r--r--test/CodeGen/WebAssembly/immediates.ll37
-rw-r--r--test/CodeGen/WebAssembly/indirect-import.ll73
-rw-r--r--test/CodeGen/WebAssembly/inline-asm.ll6
-rw-r--r--test/CodeGen/WebAssembly/irreducible-cfg.ll94
-rw-r--r--test/CodeGen/WebAssembly/legalize.ll2
-rw-r--r--test/CodeGen/WebAssembly/load-ext.ll2
-rw-r--r--test/CodeGen/WebAssembly/load-store-i1.ll24
-rw-r--r--test/CodeGen/WebAssembly/load.ll3
-rw-r--r--test/CodeGen/WebAssembly/loop-idiom.ll53
-rw-r--r--test/CodeGen/WebAssembly/mem-intrinsics.ll140
-rw-r--r--test/CodeGen/WebAssembly/memory-addr32.ll12
-rw-r--r--test/CodeGen/WebAssembly/memory-addr64.ll12
-rw-r--r--test/CodeGen/WebAssembly/non-executable-stack.ll9
-rw-r--r--test/CodeGen/WebAssembly/offset-folding.ll23
-rw-r--r--test/CodeGen/WebAssembly/offset.ll88
-rw-r--r--test/CodeGen/WebAssembly/phi.ll2
-rw-r--r--test/CodeGen/WebAssembly/reg-stackify.ll404
-rw-r--r--test/CodeGen/WebAssembly/return-int32.ll26
-rw-r--r--test/CodeGen/WebAssembly/return-void.ll21
-rw-r--r--test/CodeGen/WebAssembly/returned.ll4
-rw-r--r--test/CodeGen/WebAssembly/select.ll28
-rw-r--r--test/CodeGen/WebAssembly/signext-zeroext.ll24
-rw-r--r--test/CodeGen/WebAssembly/store-results.ll19
-rw-r--r--test/CodeGen/WebAssembly/store-trunc.ll10
-rw-r--r--test/CodeGen/WebAssembly/store.ll11
-rw-r--r--test/CodeGen/WebAssembly/switch.ll6
-rw-r--r--test/CodeGen/WebAssembly/unreachable.ll2
-rw-r--r--test/CodeGen/WebAssembly/unused-argument.ll4
-rw-r--r--test/CodeGen/WebAssembly/userstack.ll277
-rw-r--r--test/CodeGen/WebAssembly/varargs.ll87
-rw-r--r--test/CodeGen/WinEH/wineh-asm.ll26
-rw-r--r--test/CodeGen/WinEH/wineh-cloning.ll5
-rw-r--r--test/CodeGen/WinEH/wineh-nested-unwind.ll55
-rw-r--r--test/CodeGen/WinEH/wineh-setjmp.ll75
-rw-r--r--test/CodeGen/WinEH/wineh-statenumbering-cleanups.ll1
-rw-r--r--test/CodeGen/WinEH/wineh-statenumbering.ll73
-rw-r--r--test/CodeGen/X86/2006-05-02-InstrSched1.ll4
-rw-r--r--test/CodeGen/X86/2006-11-12-CSRetCC.ll9
-rw-r--r--test/CodeGen/X86/2007-08-10-SignExtSubreg.ll5
-rw-r--r--test/CodeGen/X86/2007-08-13-AppendingLinkage.ll12
-rw-r--r--test/CodeGen/X86/2007-10-15-CoalescerCrash.ll2
-rw-r--r--test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll6
-rw-r--r--test/CodeGen/X86/2008-07-19-movups-spills.ll64
-rw-r--r--test/CodeGen/X86/2008-07-22-CombinerCrash.ll4
-rw-r--r--test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll2
-rw-r--r--test/CodeGen/X86/2008-08-19-SubAndFetch.ll12
-rw-r--r--test/CodeGen/X86/2008-09-11-CoalescerBug2.ll4
-rw-r--r--test/CodeGen/X86/2008-09-29-ReMatBug.ll2
-rw-r--r--test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll9
-rw-r--r--test/CodeGen/X86/2009-03-05-burr-list-crash.ll2
-rw-r--r--test/CodeGen/X86/2009-10-16-Scope.ll5
-rw-r--r--test/CodeGen/X86/2010-01-18-DbgValue.ll5
-rw-r--r--test/CodeGen/X86/2010-02-01-DbgValueCrash.ll5
-rw-r--r--test/CodeGen/X86/2010-05-25-DotDebugLoc.ll5
-rw-r--r--test/CodeGen/X86/2010-05-26-DotDebugLoc.ll9
-rw-r--r--test/CodeGen/X86/2010-05-28-Crash.ll7
-rw-r--r--test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll9
-rw-r--r--test/CodeGen/X86/2010-07-06-DbgCrash.ll7
-rw-r--r--test/CodeGen/X86/2010-08-04-StackVariable.ll8
-rw-r--r--test/CodeGen/X86/2010-09-16-EmptyFilename.ll7
-rw-r--r--test/CodeGen/X86/2010-11-02-DbgParameter.ll5
-rw-r--r--test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll7
-rw-r--r--test/CodeGen/X86/2011-06-14-PreschedRegalias.ll2
-rw-r--r--test/CodeGen/X86/2011-09-14-valcoalesce.ll2
-rw-r--r--test/CodeGen/X86/2011-10-21-widen-cmp.ll2
-rw-r--r--test/CodeGen/X86/2012-01-11-split-cv.ll17
-rw-r--r--test/CodeGen/X86/2012-01-12-extract-sv.ll9
-rw-r--r--test/CodeGen/X86/2012-04-26-sdglue.ll36
-rw-r--r--test/CodeGen/X86/2012-1-10-buildvector.ll2
-rw-r--r--test/CodeGen/X86/2012-11-30-handlemove-dbg.ll5
-rw-r--r--test/CodeGen/X86/2012-11-30-misched-dbg.ll10
-rw-r--r--test/CodeGen/X86/2012-11-30-regpres-dbg.ll7
-rw-r--r--test/CodeGen/X86/3addr-16bit.ll8
-rw-r--r--test/CodeGen/X86/AppendingLinkage.ll4
-rw-r--r--test/CodeGen/X86/GC/dynamic-frame-size.ll2
-rw-r--r--test/CodeGen/X86/GC/erlang-gc.ll4
-rw-r--r--test/CodeGen/X86/GC/ocaml-gc.ll4
-rw-r--r--test/CodeGen/X86/MachineSink-DbgValue.ll9
-rw-r--r--test/CodeGen/X86/MergeConsecutiveStores.ll24
-rw-r--r--test/CodeGen/X86/StackColoring-dbg.ll5
-rw-r--r--test/CodeGen/X86/StackColoring.ll175
-rw-r--r--test/CodeGen/X86/WidenArith.ll22
-rw-r--r--test/CodeGen/X86/abi-isel.ll72
-rw-r--r--test/CodeGen/X86/add-nsw-sext.ll6
-rw-r--r--test/CodeGen/X86/add.ll36
-rw-r--r--test/CodeGen/X86/alias-gep.ll22
-rw-r--r--test/CodeGen/X86/aligned-variadic.ll4
-rw-r--r--test/CodeGen/X86/alignment.ll4
-rw-r--r--test/CodeGen/X86/all-ones-vector.ll139
-rw-r--r--test/CodeGen/X86/and-encoding.ll27
-rw-r--r--test/CodeGen/X86/anyext.ll39
-rw-r--r--test/CodeGen/X86/atom-lea-sp.ll4
-rw-r--r--test/CodeGen/X86/atomic-eflags-reuse.ll179
-rw-r--r--test/CodeGen/X86/atomic-non-integer.ll2
-rw-r--r--test/CodeGen/X86/atomic128.ll11
-rw-r--r--test/CodeGen/X86/atomic16.ll56
-rw-r--r--test/CodeGen/X86/atomic8.ll48
-rw-r--r--test/CodeGen/X86/atomic_mi.ll17
-rw-r--r--test/CodeGen/X86/avoid-loop-align.ll2
-rw-r--r--test/CodeGen/X86/avx-basic.ll85
-rw-r--r--test/CodeGen/X86/avx-cast.ll11
-rw-r--r--test/CodeGen/X86/avx-intel-ocl.ll7
-rw-r--r--test/CodeGen/X86/avx-intrinsics-fast-isel.ll3778
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll392
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll4341
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86_64.ll1
-rw-r--r--test/CodeGen/X86/avx-isa-check.ll114
-rw-r--r--test/CodeGen/X86/avx-select.ll29
-rw-r--r--test/CodeGen/X86/avx-shift.ll9
-rwxr-xr-xtest/CodeGen/X86/avx-shuffle-x86_32.ll2
-rw-r--r--test/CodeGen/X86/avx-splat.ll14
-rwxr-xr-xtest/CodeGen/X86/avx-trunc.ll37
-rw-r--r--test/CodeGen/X86/avx-vbroadcast.ll20
-rw-r--r--test/CodeGen/X86/avx-vbroadcastf128.ll111
-rw-r--r--test/CodeGen/X86/avx-vextractf128.ll91
-rw-r--r--test/CodeGen/X86/avx-vperm2x128.ll433
-rw-r--r--test/CodeGen/X86/avx-vzeroupper.ll5
-rwxr-xr-xtest/CodeGen/X86/avx2-conversions.ll6
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-fast-isel.ll3388
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll361
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86.ll1342
-rw-r--r--test/CodeGen/X86/avx2-logic.ll29
-rw-r--r--test/CodeGen/X86/avx2-nontemporal.ll65
-rw-r--r--test/CodeGen/X86/avx2-phaddsub.ll51
-rw-r--r--test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll110
-rw-r--r--test/CodeGen/X86/avx2-pmovxrm.ll201
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll13
-rw-r--r--test/CodeGen/X86/avx2-vbroadcasti128.ll129
-rw-r--r--test/CodeGen/X86/avx2-vector-shifts.ll445
-rwxr-xr-xtest/CodeGen/X86/avx2-vperm.ll27
-rw-r--r--test/CodeGen/X86/avx512-any_extend_load.ll70
-rw-r--r--test/CodeGen/X86/avx512-arith.ll142
-rw-r--r--test/CodeGen/X86/avx512-bugfix-23634.ll27
-rw-r--r--test/CodeGen/X86/avx512-bugfix-26264.ll47
-rw-r--r--test/CodeGen/X86/avx512-build-vector.ll7
-rw-r--r--test/CodeGen/X86/avx512-calling-conv.ll103
-rw-r--r--test/CodeGen/X86/avx512-cmp.ll130
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll665
-rw-r--r--test/CodeGen/X86/avx512-ext.ll713
-rw-r--r--test/CodeGen/X86/avx512-extract-subvector.ll290
-rw-r--r--test/CodeGen/X86/avx512-fma-intrinsics.ll307
-rw-r--r--test/CodeGen/X86/avx512-fma.ll35
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll28
-rw-r--r--test/CodeGen/X86/avx512-inc-dec.ll2
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll521
-rw-r--r--test/CodeGen/X86/avx512-intel-ocl.ll13
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-fast-isel.ll1134
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-upgrade.ll1089
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll2817
-rw-r--r--test/CodeGen/X86/avx512-logic.ll66
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll2318
-rw-r--r--test/CodeGen/X86/avx512-mask-spills.ll126
-rw-r--r--test/CodeGen/X86/avx512-mov.ll376
-rw-r--r--test/CodeGen/X86/avx512-nontemporal.ll16
-rw-r--r--test/CodeGen/X86/avx512-scalarIntrinsics.ll66
-rw-r--r--test/CodeGen/X86/avx512-select.ll138
-rw-r--r--test/CodeGen/X86/avx512-skx-insert-subvec.ll73
-rw-r--r--test/CodeGen/X86/avx512-trunc.ll15
-rw-r--r--test/CodeGen/X86/avx512-unsafe-fp-math.ll107
-rw-r--r--test/CodeGen/X86/avx512-vbroadcast.ll48
-rw-r--r--test/CodeGen/X86/avx512-vec-cmp.ll1073
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll413
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll538
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll905
-rw-r--r--test/CodeGen/X86/avx512bw-mask-op.ll107
-rw-r--r--test/CodeGen/X86/avx512bw-mov.ll185
-rw-r--r--test/CodeGen/X86/avx512bw-vec-cmp.ll113
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll244
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll629
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics.ll4769
-rw-r--r--test/CodeGen/X86/avx512bwvl-mov.ll129
-rw-r--r--test/CodeGen/X86/avx512bwvl-vec-cmp.ll225
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics.ll61
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics.ll277
-rw-r--r--test/CodeGen/X86/avx512dq-mask-op.ll55
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics.ll1530
-rw-r--r--test/CodeGen/X86/avx512ifma-intrinsics.ll105
-rw-r--r--test/CodeGen/X86/avx512ifmavl-intrinsics.ll226
-rw-r--r--test/CodeGen/X86/avx512vbmi-intrinsics.ll95
-rw-r--r--test/CodeGen/X86/avx512vbmivl-intrinsics.ll195
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll1391
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll2536
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll6995
-rw-r--r--test/CodeGen/X86/avx512vl-logic.ll48
-rw-r--r--test/CodeGen/X86/avx512vl-mov.ll517
-rw-r--r--test/CodeGen/X86/avx512vl-vbroadcast.ll175
-rw-r--r--test/CodeGen/X86/avx512vl-vec-cmp.ll301
-rw-r--r--test/CodeGen/X86/base-pointer-and-cmpxchg.ll51
-rw-r--r--test/CodeGen/X86/bit-piece-comment.ll5
-rw-r--r--test/CodeGen/X86/bitreverse.ll382
-rw-r--r--test/CodeGen/X86/block-placement.ll273
-rw-r--r--test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll165
-rw-r--r--test/CodeGen/X86/bmi-intrinsics-fast-isel.ll326
-rw-r--r--test/CodeGen/X86/bmi.ll555
-rw-r--r--test/CodeGen/X86/bool-zext.ll37
-rw-r--r--test/CodeGen/X86/br-fold.ll6
-rw-r--r--test/CodeGen/X86/break-false-dep.ll4
-rw-r--r--test/CodeGen/X86/bss_pagealigned.ll2
-rw-r--r--test/CodeGen/X86/bswap-vector.ll151
-rw-r--r--test/CodeGen/X86/bt.ll1225
-rw-r--r--test/CodeGen/X86/buildvec-insertvec.ll2
-rw-r--r--test/CodeGen/X86/byval2.ll4
-rw-r--r--test/CodeGen/X86/call-push.ll2
-rw-r--r--test/CodeGen/X86/catchpad-dynamic-alloca.ll65
-rw-r--r--test/CodeGen/X86/catchpad-lifetime.ll8
-rw-r--r--test/CodeGen/X86/catchret-regmask.ll73
-rw-r--r--test/CodeGen/X86/cfstring.ll2
-rw-r--r--test/CodeGen/X86/cleanuppad-inalloca.ll4
-rw-r--r--test/CodeGen/X86/cleanuppad-realign.ll4
-rw-r--r--test/CodeGen/X86/clear_upper_vector_element_bits.ll683
-rw-r--r--test/CodeGen/X86/clz.ll755
-rw-r--r--test/CodeGen/X86/cmov-into-branch.ll132
-rw-r--r--test/CodeGen/X86/cmov.ll4
-rw-r--r--test/CodeGen/X86/cmovcmov.ll6
-rw-r--r--test/CodeGen/X86/cmp.ll31
-rw-r--r--test/CodeGen/X86/cmpxchg-clobber-flags.ll12
-rw-r--r--test/CodeGen/X86/cmpxchg-i1.ll6
-rw-r--r--test/CodeGen/X86/cmpxchg-i128-i1.ll4
-rw-r--r--test/CodeGen/X86/coalescer-commute3.ll2
-rw-r--r--test/CodeGen/X86/code_placement_align_all.ll6
-rw-r--r--test/CodeGen/X86/code_placement_cold_loop_blocks.ll2
-rw-r--r--test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll2
-rw-r--r--test/CodeGen/X86/code_placement_loop_rotation.ll2
-rw-r--r--test/CodeGen/X86/code_placement_loop_rotation2.ll2
-rw-r--r--test/CodeGen/X86/code_placement_loop_rotation3.ll42
-rw-r--r--test/CodeGen/X86/code_placement_outline_optional_branches.ll2
-rw-r--r--test/CodeGen/X86/combine-multiplies.ll10
-rw-r--r--test/CodeGen/X86/combine-or.ll59
-rw-r--r--test/CodeGen/X86/combine-testm-and.ll57
-rw-r--r--test/CodeGen/X86/commute-blend-avx2.ll67
-rw-r--r--test/CodeGen/X86/commute-blend-sse41.ll27
-rw-r--r--test/CodeGen/X86/commute-fcmp.ll693
-rw-r--r--test/CodeGen/X86/constructor.ll12
-rw-r--r--test/CodeGen/X86/crash-lre-eliminate-dead-def.ll268
-rw-r--r--test/CodeGen/X86/ctpop-combine.ll38
-rw-r--r--test/CodeGen/X86/cxx_tlscc64.ll94
-rw-r--r--test/CodeGen/X86/dag-optnone.ll15
-rw-r--r--test/CodeGen/X86/darwin-stub.ll12
-rw-r--r--test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll253
-rw-r--r--test/CodeGen/X86/dbg-combine.ll5
-rw-r--r--test/CodeGen/X86/debugloc-argsize.ll5
-rw-r--r--test/CodeGen/X86/deopt-bundles.ll161
-rw-r--r--test/CodeGen/X86/deopt-intrinsic-cconv.ll34
-rw-r--r--test/CodeGen/X86/deopt-intrinsic.ll56
-rw-r--r--test/CodeGen/X86/dllexport-x86_64.ll31
-rw-r--r--test/CodeGen/X86/dllexport.ll18
-rw-r--r--test/CodeGen/X86/dwarf-comp-dir.ll2
-rw-r--r--test/CodeGen/X86/dynamic-alloca-in-entry.ll2
-rw-r--r--test/CodeGen/X86/dynamic-allocas-VLAs.ll12
-rw-r--r--test/CodeGen/X86/eflags-copy-expansion.mir67
-rw-r--r--test/CodeGen/X86/emutls-pic.ll26
-rw-r--r--test/CodeGen/X86/emutls-pie.ll21
-rw-r--r--test/CodeGen/X86/emutls_generic.ll46
-rw-r--r--test/CodeGen/X86/exedepsfix-broadcast.ll98
-rw-r--r--test/CodeGen/X86/expand-vr64-gr64-copy.mir2
-rw-r--r--test/CodeGen/X86/extractelement-index.ll643
-rw-r--r--test/CodeGen/X86/extractelement-load.ll91
-rw-r--r--test/CodeGen/X86/extractps.ll2
-rw-r--r--test/CodeGen/X86/f16c-intrinsics-fast-isel.ll132
-rw-r--r--test/CodeGen/X86/fast-isel-call.ll44
-rw-r--r--test/CodeGen/X86/fast-isel-cmp-branch2.ll5
-rw-r--r--test/CodeGen/X86/fast-isel-cmp-branch3.ll5
-rw-r--r--test/CodeGen/X86/fast-isel-float-half-convertion.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-nontemporal.ll1083
-rw-r--r--test/CodeGen/X86/fast-isel-stackcheck.ll8
-rw-r--r--test/CodeGen/X86/fast-isel-vecload.ll21
-rw-r--r--test/CodeGen/X86/fast-isel-x86-64.ll8
-rw-r--r--test/CodeGen/X86/fast-isel-x86.ll35
-rw-r--r--test/CodeGen/X86/fastmath-float-half-conversion.ll4
-rw-r--r--test/CodeGen/X86/fixup-bw-copy.ll71
-rw-r--r--test/CodeGen/X86/fixup-bw-copy.mir156
-rw-r--r--test/CodeGen/X86/fixup-bw-inst.ll126
-rw-r--r--test/CodeGen/X86/float-conv-elim.ll2
-rw-r--r--test/CodeGen/X86/fma_patterns.ll41
-rw-r--r--test/CodeGen/X86/fold-push.ll2
-rw-r--r--test/CodeGen/X86/fold-tied-op.ll5
-rw-r--r--test/CodeGen/X86/fold-vector-sext-zext.ll153
-rw-r--r--test/CodeGen/X86/force-align-stack-alloca.ll14
-rw-r--r--test/CodeGen/X86/fp-logic.ll48
-rw-r--r--test/CodeGen/X86/fp-une-cmp.ll122
-rw-r--r--test/CodeGen/X86/fp128-cast.ll267
-rw-r--r--test/CodeGen/X86/fp128-compare.ll33
-rw-r--r--test/CodeGen/X86/fp128-select.ll35
-rw-r--r--test/CodeGen/X86/fpstack-debuginstr-kill.ll7
-rw-r--r--test/CodeGen/X86/frame-order.ll122
-rw-r--r--test/CodeGen/X86/ga-offset.ll13
-rw-r--r--test/CodeGen/X86/ga-offset2.ll10
-rw-r--r--test/CodeGen/X86/global-access-pie.ll123
-rw-r--r--test/CodeGen/X86/global-sections.ll35
-rw-r--r--test/CodeGen/X86/h-registers-3.ll28
-rw-r--r--test/CodeGen/X86/haddsub-2.ll1017
-rw-r--r--test/CodeGen/X86/haddsub-undef.ll321
-rw-r--r--test/CodeGen/X86/haddsub.ll307
-rw-r--r--test/CodeGen/X86/half.ll71
-rw-r--r--test/CodeGen/X86/hipe-cc.ll20
-rw-r--r--test/CodeGen/X86/hipe-cc64.ll21
-rw-r--r--test/CodeGen/X86/hipe-prologue.ll13
-rw-r--r--test/CodeGen/X86/hoist-invariant-load.ll35
-rw-r--r--test/CodeGen/X86/hoist-spill-lpad.ll62
-rw-r--r--test/CodeGen/X86/hoist-spill.ll121
-rw-r--r--test/CodeGen/X86/i16lshr8pat.ll32
-rw-r--r--test/CodeGen/X86/i386-setjmp-pic.ll23
-rw-r--r--test/CodeGen/X86/i386-shrink-wrapping.ll8
-rw-r--r--test/CodeGen/X86/i386-tlscall-fastregalloc.ll11
-rw-r--r--test/CodeGen/X86/i686-win-shrink-wrapping.ll44
-rw-r--r--test/CodeGen/X86/ifunc-asm.ll15
-rw-r--r--test/CodeGen/X86/implicit-null-check.ll42
-rw-r--r--test/CodeGen/X86/implicit-null-checks.mir266
-rw-r--r--test/CodeGen/X86/inalloca-ctor.ll4
-rw-r--r--test/CodeGen/X86/inalloca-invoke.ll3
-rw-r--r--test/CodeGen/X86/inalloca-stdcall.ll4
-rw-r--r--test/CodeGen/X86/inalloca.ll12
-rw-r--r--test/CodeGen/X86/indirect-hidden.ll4
-rw-r--r--test/CodeGen/X86/insertelement-zero.ll264
-rw-r--r--test/CodeGen/X86/insertps-combine.ll159
-rw-r--r--test/CodeGen/X86/interval-update-remat.ll161
-rw-r--r--test/CodeGen/X86/ipra-inline-asm.ll20
-rw-r--r--test/CodeGen/X86/ipra-local-linkage.ll30
-rw-r--r--test/CodeGen/X86/ipra-reg-usage.ll12
-rw-r--r--test/CodeGen/X86/ipra-transform.ll32
-rw-r--r--test/CodeGen/X86/lakemont.ll9
-rw-r--r--test/CodeGen/X86/lea-opt-memop-check-1.ll99
-rw-r--r--test/CodeGen/X86/lea-opt-memop-check-2.ll21
-rw-r--r--test/CodeGen/X86/lea-opt.ll73
-rw-r--r--test/CodeGen/X86/libcall-sret.ll25
-rw-r--r--test/CodeGen/X86/licm-dominance.ll61
-rw-r--r--test/CodeGen/X86/licm-symbol.ll2
-rw-r--r--test/CodeGen/X86/loc-remat.ll55
-rw-r--r--test/CodeGen/X86/local_stack_symbol_ordering.ll184
-rw-r--r--test/CodeGen/X86/localescape.ll34
-rw-r--r--test/CodeGen/X86/lock-inst-encoding.ll44
-rw-r--r--test/CodeGen/X86/loop-blocks.ll28
-rw-r--r--test/CodeGen/X86/lsr-static-addr.ll2
-rw-r--r--test/CodeGen/X86/lzcnt-tzcnt.ll209
-rw-r--r--test/CodeGen/X86/machine-combiner-int.ll37
-rw-r--r--test/CodeGen/X86/machine-copy-prop.mir227
-rw-r--r--test/CodeGen/X86/machine-cp.ll2
-rw-r--r--test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll83
-rw-r--r--test/CodeGen/X86/machine-trace-metrics-crash.ll4
-rw-r--r--test/CodeGen/X86/masked_gather_scatter.ll382
-rw-r--r--test/CodeGen/X86/masked_memop.ll10394
-rw-r--r--test/CodeGen/X86/materialize-one.ll100
-rw-r--r--test/CodeGen/X86/materialize.ll216
-rw-r--r--test/CodeGen/X86/mbp-false-cfg-break.ll39
-rw-r--r--test/CodeGen/X86/mcinst-lowering.ll15
-rw-r--r--test/CodeGen/X86/mcu-abi.ll59
-rw-r--r--test/CodeGen/X86/memcmp.ll18
-rw-r--r--test/CodeGen/X86/memcpy-from-string.ll24
-rw-r--r--test/CodeGen/X86/memset-2.ll52
-rw-r--r--test/CodeGen/X86/memset-nonzero.ll470
-rw-r--r--test/CodeGen/X86/memset64-on-x86-32.ll58
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-128.ll783
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-256.ll756
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-512.ll718
-rw-r--r--test/CodeGen/X86/merge-sp-update-lea.ll32
-rw-r--r--test/CodeGen/X86/merge-store-partially-alias-loads.ll4
-rw-r--r--test/CodeGen/X86/mfence.ll35
-rw-r--r--test/CodeGen/X86/mingw-alloca.ll4
-rw-r--r--test/CodeGen/X86/misched-aa-colored.ll1
-rw-r--r--test/CodeGen/X86/misched-code-difference-with-debug.ll15
-rw-r--r--test/CodeGen/X86/misched-ilp.ll4
-rw-r--r--test/CodeGen/X86/mmx-bitcast-fold.ll12
-rw-r--r--test/CodeGen/X86/movgs.ll9
-rw-r--r--test/CodeGen/X86/movmsk.ll75
-rw-r--r--test/CodeGen/X86/movpc32-check.ll5
-rw-r--r--test/CodeGen/X86/movtopush.ll112
-rw-r--r--test/CodeGen/X86/movtopush64.ll193
-rw-r--r--test/CodeGen/X86/mul-i256.ll27
-rw-r--r--test/CodeGen/X86/mul128.ll13
-rw-r--r--test/CodeGen/X86/mul64.ll25
-rw-r--r--test/CodeGen/X86/musttail-varargs.ll11
-rw-r--r--test/CodeGen/X86/mwaitx.ll38
-rw-r--r--test/CodeGen/X86/negate-add-zero.ll2
-rw-r--r--test/CodeGen/X86/negative-offset.ll18
-rw-r--r--test/CodeGen/X86/new-remat.ll70
-rw-r--r--test/CodeGen/X86/no-prolog-kill.ll21
-rw-r--r--test/CodeGen/X86/no-sse2-avg.ll32
-rw-r--r--test/CodeGen/X86/nontemporal-2.ll1122
-rw-r--r--test/CodeGen/X86/nontemporal-loads.ll1638
-rw-r--r--test/CodeGen/X86/nontemporal.ll137
-rw-r--r--test/CodeGen/X86/noreturn-call.ll48
-rw-r--r--test/CodeGen/X86/null-streamer.ll5
-rw-r--r--test/CodeGen/X86/opt-ext-uses.ll4
-rw-r--r--test/CodeGen/X86/or-lea.ll13
-rw-r--r--test/CodeGen/X86/osx-private-labels.ll24
-rw-r--r--test/CodeGen/X86/patchable-prologue.ll67
-rw-r--r--test/CodeGen/X86/patchpoint-verifiable.mir2
-rw-r--r--test/CodeGen/X86/peephole-na-phys-copy-folding.ll17
-rw-r--r--test/CodeGen/X86/phaddsub.ll173
-rw-r--r--test/CodeGen/X86/phi-immediate-factoring.ll3
-rw-r--r--test/CodeGen/X86/phys-reg-local-regalloc.ll6
-rw-r--r--test/CodeGen/X86/phys_subreg_coalesce-2.ll2
-rw-r--r--test/CodeGen/X86/pic.ll2
-rw-r--r--test/CodeGen/X86/pic_jumptable.ll2
-rw-r--r--test/CodeGen/X86/pie.ll45
-rw-r--r--test/CodeGen/X86/pku.ll2
-rw-r--r--test/CodeGen/X86/pmul.ll978
-rw-r--r--test/CodeGen/X86/pop-stack-cleanup.ll13
-rw-r--r--test/CodeGen/X86/popcnt.ll243
-rw-r--r--test/CodeGen/X86/post-ra-sched.ll40
-rw-r--r--test/CodeGen/X86/postra-licm.ll4
-rw-r--r--test/CodeGen/X86/powi.ll5
-rw-r--r--test/CodeGen/X86/pr15267.ll16
-rw-r--r--test/CodeGen/X86/pr16360.ll17
-rw-r--r--test/CodeGen/X86/pr17764.ll14
-rw-r--r--test/CodeGen/X86/pr23664.ll2
-rw-r--r--test/CodeGen/X86/pr2585.ll32
-rw-r--r--test/CodeGen/X86/pr26350.ll21
-rw-r--r--test/CodeGen/X86/pr2659.ll2
-rw-r--r--test/CodeGen/X86/pr26652.ll9
-rw-r--r--test/CodeGen/X86/pr26757.ll34
-rw-r--r--test/CodeGen/X86/pr26835.ll10
-rw-r--r--test/CodeGen/X86/pr26870.ll37
-rw-r--r--test/CodeGen/X86/pr27071.ll29
-rw-r--r--test/CodeGen/X86/pr27501.ll67
-rw-r--r--test/CodeGen/X86/pr27591.ll51
-rw-r--r--test/CodeGen/X86/pr27681.mir87
-rw-r--r--test/CodeGen/X86/pr28173.ll41
-rw-r--r--test/CodeGen/X86/pr28444.ll27
-rw-r--r--test/CodeGen/X86/pr28472.ll11
-rw-r--r--test/CodeGen/X86/pr28489.ll15
-rw-r--r--test/CodeGen/X86/pr28515.ll16
-rw-r--r--test/CodeGen/X86/pr28560.ll13
-rw-r--r--test/CodeGen/X86/pr5145.ll16
-rw-r--r--test/CodeGen/X86/promote-i16.ll16
-rw-r--r--test/CodeGen/X86/ps4-noreturn.ll38
-rw-r--r--test/CodeGen/X86/pshufb-mask-comments.ll66
-rw-r--r--test/CodeGen/X86/psubus.ll95
-rw-r--r--test/CodeGen/X86/push-cfi-debug.ll7
-rw-r--r--test/CodeGen/X86/push-cfi.ll2
-rw-r--r--test/CodeGen/X86/ragreedy-hoist-spill.ll10
-rw-r--r--test/CodeGen/X86/reduce-trunc-shl.ll28
-rw-r--r--test/CodeGen/X86/regalloc-reconcile-broken-hints.ll2
-rw-r--r--test/CodeGen/X86/rem.ll89
-rw-r--r--test/CodeGen/X86/rem_crash.ll3
-rw-r--r--test/CodeGen/X86/return-ext.ll138
-rw-r--r--test/CodeGen/X86/rtm.ll18
-rw-r--r--test/CodeGen/X86/sad.ll1001
-rw-r--r--test/CodeGen/X86/safestack_ssp.ll27
-rw-r--r--test/CodeGen/X86/segmented-stacks.ll34
-rw-r--r--test/CodeGen/X86/seh-catch-all-win32.ll8
-rw-r--r--test/CodeGen/X86/seh-safe-div-win32.ll4
-rw-r--r--test/CodeGen/X86/seh-safe-div.ll8
-rw-r--r--test/CodeGen/X86/seh-stack-realign.ll12
-rw-r--r--test/CodeGen/X86/setcc-lowering.ll4
-rw-r--r--test/CodeGen/X86/setcc-narrowing.ll4
-rw-r--r--test/CodeGen/X86/setcc.ll26
-rw-r--r--test/CodeGen/X86/sext-ret-val.ll12
-rw-r--r--test/CodeGen/X86/sext-setcc-self.ll87
-rw-r--r--test/CodeGen/X86/sext-trunc.ll11
-rw-r--r--test/CodeGen/X86/shift-pcmp.ll45
-rw-r--r--test/CodeGen/X86/shrink-wrap-chkstk.ll8
-rw-r--r--test/CodeGen/X86/shrink_vmul.ll865
-rw-r--r--test/CodeGen/X86/sibcall-5.ll10
-rw-r--r--test/CodeGen/X86/sibcall-byval.ll6
-rw-r--r--test/CodeGen/X86/sincos-opt.ll30
-rw-r--r--test/CodeGen/X86/sink-blockfreq.ll4
-rw-r--r--test/CodeGen/X86/sink-cheap-instructions.ll2
-rw-r--r--test/CodeGen/X86/sjlj-eh.ll72
-rw-r--r--test/CodeGen/X86/slow-unaligned-mem.ll18
-rw-r--r--test/CodeGen/X86/sqrt-fastmath-mir.ll52
-rw-r--r--test/CodeGen/X86/sqrt-fastmath.ll29
-rw-r--r--test/CodeGen/X86/sse-intel-ocl.ll2
-rw-r--r--test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll35
-rw-r--r--test/CodeGen/X86/sse-intrinsics-fast-isel.ll2303
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll27
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86.ll449
-rw-r--r--test/CodeGen/X86/sse1.ll15
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll76
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-fast-isel.ll3849
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll178
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86.ll937
-rw-r--r--test/CodeGen/X86/sse2.ll6
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub-2.ll2
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub.ll7
-rw-r--r--test/CodeGen/X86/sse3-intrinsics-fast-isel.ll13
-rw-r--r--test/CodeGen/X86/sse3.ll6
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-fast-isel.ll1008
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll228
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-x86.ll355
-rw-r--r--test/CodeGen/X86/sse41-pmovxrm.ll (renamed from test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll)37
-rw-r--r--test/CodeGen/X86/sse41.ll53
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll26
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-fast-isel.ll401
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-x86.ll197
-rw-r--r--test/CodeGen/X86/sse42.ll53
-rw-r--r--test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll20
-rw-r--r--test/CodeGen/X86/sse4a-upgrade.ll39
-rw-r--r--test/CodeGen/X86/sse4a.ll77
-rw-r--r--test/CodeGen/X86/sse_partial_update.ll84
-rw-r--r--test/CodeGen/X86/ssp-data-layout.ll2
-rw-r--r--test/CodeGen/X86/ssp-guard-spill.ll54
-rw-r--r--test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll28
-rw-r--r--test/CodeGen/X86/stack-align.ll28
-rw-r--r--test/CodeGen/X86/stack-align2.ll5
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx1.ll59
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx512vl.ll137
-rw-r--r--test/CodeGen/X86/stack-folding-fp-sse42.ll18
-rw-r--r--test/CodeGen/X86/stack-folding-int-avx2.ll53
-rw-r--r--test/CodeGen/X86/stack-folding-xop.ll40
-rw-r--r--test/CodeGen/X86/stack-protector-dbginfo.ll11
-rw-r--r--test/CodeGen/X86/stack-protector-msvc.ll40
-rw-r--r--test/CodeGen/X86/stack-protector-target.ll27
-rw-r--r--test/CodeGen/X86/stack-protector-weight.ll40
-rw-r--r--test/CodeGen/X86/stack-protector.ll462
-rw-r--r--test/CodeGen/X86/stack_guard_remat.ll2
-rw-r--r--test/CodeGen/X86/stackguard-internal.ll15
-rw-r--r--test/CodeGen/X86/stackmap-frame-setup.ll4
-rw-r--r--test/CodeGen/X86/stackmap-large-constants.ll2
-rw-r--r--test/CodeGen/X86/stackmap-liveness.ll16
-rw-r--r--test/CodeGen/X86/statepoint-allocas.ll4
-rw-r--r--test/CodeGen/X86/statepoint-invoke.ll4
-rw-r--r--test/CodeGen/X86/statepoint-stack-usage.ll2
-rw-r--r--test/CodeGen/X86/statepoint-stackmap-format.ll10
-rw-r--r--test/CodeGen/X86/statepoint-uniqueing.ll31
-rw-r--r--test/CodeGen/X86/statepoint-vector-bad-spill.ll39
-rw-r--r--test/CodeGen/X86/statepoint-vector.ll2
-rw-r--r--test/CodeGen/X86/stdarg.ll2
-rw-r--r--test/CodeGen/X86/store-narrow.ll12
-rw-r--r--test/CodeGen/X86/store-zero-and-minus-one.ll88
-rw-r--r--test/CodeGen/X86/swift-return.ll206
-rw-r--r--test/CodeGen/X86/swifterror.ll359
-rw-r--r--test/CodeGen/X86/swiftself.ll62
-rw-r--r--test/CodeGen/X86/switch-bt.ll2
-rw-r--r--test/CodeGen/X86/switch-density.ll81
-rw-r--r--test/CodeGen/X86/switch-edge-weight.ll12
-rw-r--r--test/CodeGen/X86/switch-jump-table.ll2
-rw-r--r--test/CodeGen/X86/switch.ll77
-rw-r--r--test/CodeGen/X86/tail-call-attrs.ll4
-rw-r--r--test/CodeGen/X86/tail-call-casts.ll27
-rw-r--r--test/CodeGen/X86/tail-call-parameter-attrs-mismatch.ll40
-rw-r--r--test/CodeGen/X86/tail-merge-unreachable.ll34
-rw-r--r--test/CodeGen/X86/tail-opts.ll2
-rw-r--r--test/CodeGen/X86/tailcall-stackalign.ll2
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll133
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-fast-isel.ll216
-rw-r--r--test/CodeGen/X86/tls-android.ll8
-rw-r--r--test/CodeGen/X86/tls-pie.ll8
-rw-r--r--test/CodeGen/X86/tls-windows-itanium.ll30
-rw-r--r--test/CodeGen/X86/tls.ll15
-rw-r--r--test/CodeGen/X86/trunc-to-bool.ll4
-rw-r--r--test/CodeGen/X86/twoaddr-coalesce.ll2
-rw-r--r--test/CodeGen/X86/uint_to_fp-2.ll13
-rw-r--r--test/CodeGen/X86/uint_to_fp.ll33
-rw-r--r--test/CodeGen/X86/umul-with-overflow.ll3
-rw-r--r--test/CodeGen/X86/unaligned-load.ll4
-rw-r--r--test/CodeGen/X86/unaligned-spill-folding.ll2
-rw-r--r--test/CodeGen/X86/unknown-location.ll5
-rw-r--r--test/CodeGen/X86/unreachableblockelim.ll21
-rw-r--r--test/CodeGen/X86/unused_stackslots.ll246
-rw-r--r--test/CodeGen/X86/update-terminator.mir57
-rw-r--r--test/CodeGen/X86/urem-i8-constant.ll21
-rw-r--r--test/CodeGen/X86/urem-power-of-two.ll82
-rw-r--r--test/CodeGen/X86/utf16-cfstrings.ll2
-rw-r--r--test/CodeGen/X86/v4f32-immediate.ll15
-rw-r--r--test/CodeGen/X86/v8i1-masks.ll70
-rw-r--r--test/CodeGen/X86/vararg-callee-cleanup.ll2
-rw-r--r--test/CodeGen/X86/vec-sign.ll30
-rw-r--r--test/CodeGen/X86/vec_compare-sse4.ll81
-rw-r--r--test/CodeGen/X86/vec_ctbits.ll10
-rw-r--r--test/CodeGen/X86/vec_ext_inreg.ll74
-rw-r--r--test/CodeGen/X86/vec_extract-avx.ll177
-rw-r--r--test/CodeGen/X86/vec_extract-mmx.ll147
-rw-r--r--test/CodeGen/X86/vec_extract-sse4.ll115
-rw-r--r--test/CodeGen/X86/vec_extract.ll142
-rw-r--r--test/CodeGen/X86/vec_fabs.ll91
-rw-r--r--test/CodeGen/X86/vec_floor.ll361
-rw-r--r--test/CodeGen/X86/vec_fneg.ll99
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll1
-rw-r--r--test/CodeGen/X86/vec_fpext.ll152
-rw-r--r--test/CodeGen/X86/vec_fptrunc.ll168
-rw-r--r--test/CodeGen/X86/vec_i64.ll43
-rw-r--r--test/CodeGen/X86/vec_ins_extract-1.ll87
-rw-r--r--test/CodeGen/X86/vec_ins_extract.ll3
-rw-r--r--test/CodeGen/X86/vec_insert-2.ll60
-rw-r--r--test/CodeGen/X86/vec_insert-3.ll23
-rw-r--r--test/CodeGen/X86/vec_insert-4.ll43
-rw-r--r--test/CodeGen/X86/vec_insert-5.ll166
-rw-r--r--test/CodeGen/X86/vec_insert-7.ll47
-rw-r--r--test/CodeGen/X86/vec_insert-8.ll57
-rw-r--r--test/CodeGen/X86/vec_insert-9.ll22
-rw-r--r--test/CodeGen/X86/vec_insert-mmx.ll90
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll1697
-rw-r--r--test/CodeGen/X86/vec_loadsingles.ll85
-rw-r--r--test/CodeGen/X86/vec_logical.ll99
-rw-r--r--test/CodeGen/X86/vec_partial.ll21
-rw-r--r--test/CodeGen/X86/vec_sdiv_to_shift.ll285
-rw-r--r--test/CodeGen/X86/vec_set-2.ll34
-rw-r--r--test/CodeGen/X86/vec_set-3.ll17
-rw-r--r--test/CodeGen/X86/vec_set-4.ll46
-rw-r--r--test/CodeGen/X86/vec_set-6.ll20
-rw-r--r--test/CodeGen/X86/vec_set-7.ll20
-rw-r--r--test/CodeGen/X86/vec_set-8.ll19
-rw-r--r--test/CodeGen/X86/vec_set-A.ll13
-rw-r--r--test/CodeGen/X86/vec_set-B.ll32
-rw-r--r--test/CodeGen/X86/vec_set-C.ll19
-rw-r--r--test/CodeGen/X86/vec_set-D.ll15
-rw-r--r--test/CodeGen/X86/vec_set-F.ll34
-rw-r--r--test/CodeGen/X86/vec_set-H.ll30
-rw-r--r--test/CodeGen/X86/vec_set.ll43
-rw-r--r--test/CodeGen/X86/vec_setcc.ll242
-rw-r--r--test/CodeGen/X86/vec_shift.ll41
-rw-r--r--test/CodeGen/X86/vec_shift2.ll34
-rw-r--r--test/CodeGen/X86/vec_shift3.ll41
-rw-r--r--test/CodeGen/X86/vec_shift4.ll64
-rw-r--r--test/CodeGen/X86/vec_shift5.ll217
-rw-r--r--test/CodeGen/X86/vec_shift6.ll225
-rw-r--r--test/CodeGen/X86/vec_shift7.ll23
-rw-r--r--test/CodeGen/X86/vec_ss_load_fold.ll109
-rw-r--r--test/CodeGen/X86/vec_uint_to_fp-fastmath.ll4
-rw-r--r--test/CodeGen/X86/vector-bitreverse.ll3772
-rw-r--r--test/CodeGen/X86/vector-blend.ll285
-rw-r--r--test/CodeGen/X86/vector-compare-combines.ll47
-rw-r--r--test/CodeGen/X86/vector-compare-results.ll6625
-rw-r--r--test/CodeGen/X86/vector-gep.ll9
-rw-r--r--test/CodeGen/X86/vector-half-conversions.ll3922
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-128.ll622
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-256.ll545
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-512.ll2392
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-128.ll592
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-256.ll551
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-512.ll2100
-rw-r--r--test/CodeGen/X86/vector-idiv.ll1297
-rw-r--r--test/CodeGen/X86/vector-lzcnt-128.ll1216
-rw-r--r--test/CodeGen/X86/vector-lzcnt-256.ll1378
-rw-r--r--test/CodeGen/X86/vector-lzcnt-512.ll171
-rw-r--r--test/CodeGen/X86/vector-pcmp.ll495
-rw-r--r--test/CodeGen/X86/vector-popcnt-512.ll288
-rw-r--r--test/CodeGen/X86/vector-rem.ll118
-rw-r--r--test/CodeGen/X86/vector-rotate-128.ll97
-rw-r--r--test/CodeGen/X86/vector-rotate-256.ll46
-rw-r--r--test/CodeGen/X86/vector-sext.ll834
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-128.ll53
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-256.ll44
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-512.ll48
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-128.ll71
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-256.ll44
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-512.ll30
-rw-r--r--test/CodeGen/X86/vector-shift-shl-128.ll32
-rw-r--r--test/CodeGen/X86/vector-shift-shl-256.ll11
-rw-r--r--test/CodeGen/X86/vector-shift-shl-512.ll10
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v16.ll81
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v2.ll142
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v4.ll211
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v8.ll82
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v16.ll338
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v32.ll180
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v4.ll105
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v8.ll250
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v16.ll125
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v32.ll74
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v64.ll88
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v8.ll364
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx.ll242
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx2.ll324
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll515
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-ssse3.ll267
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-xop.ll133
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining.ll99
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse1.ll40
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse41.ll59
-rw-r--r--test/CodeGen/X86/vector-shuffle-v1.ll158
-rw-r--r--test/CodeGen/X86/vector-shuffle-variable-128.ll1321
-rw-r--r--test/CodeGen/X86/vector-shuffle-variable-256.ll720
-rw-r--r--test/CodeGen/X86/vector-trunc-math.ll5315
-rw-r--r--test/CodeGen/X86/vector-trunc.ll44
-rw-r--r--test/CodeGen/X86/vector-tzcnt-128.ll818
-rw-r--r--test/CodeGen/X86/vector-tzcnt-256.ll446
-rw-r--r--test/CodeGen/X86/vector-tzcnt-512.ll693
-rw-r--r--test/CodeGen/X86/vector-zext.ll326
-rw-r--r--test/CodeGen/X86/viabs.ll703
-rw-r--r--test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll4
-rw-r--r--test/CodeGen/X86/vselect-avx.ll102
-rw-r--r--test/CodeGen/X86/vselect-minmax.ll1
-rw-r--r--test/CodeGen/X86/vzero-excess.ll94
-rw-r--r--test/CodeGen/X86/warn-stack.ll2
-rw-r--r--test/CodeGen/X86/weak_def_can_be_hidden.ll8
-rw-r--r--test/CodeGen/X86/widen_bitops-0.ll307
-rw-r--r--test/CodeGen/X86/widen_bitops-1.ll235
-rw-r--r--test/CodeGen/X86/widen_compare-1.ll21
-rw-r--r--test/CodeGen/X86/widen_conv-1.ll95
-rw-r--r--test/CodeGen/X86/widen_conv-2.ll25
-rw-r--r--test/CodeGen/X86/widen_conv-3.ll147
-rw-r--r--test/CodeGen/X86/widen_conv-4.ll173
-rw-r--r--test/CodeGen/X86/widen_load-1.ll4
-rw-r--r--test/CodeGen/X86/widen_load-2.ll221
-rw-r--r--test/CodeGen/X86/win-alloca-expander.ll154
-rw-r--r--test/CodeGen/X86/win-catchpad-csrs.ll4
-rw-r--r--test/CodeGen/X86/win-catchpad-varargs.ll4
-rw-r--r--test/CodeGen/X86/win-catchpad.ll68
-rw-r--r--test/CodeGen/X86/win-cleanuppad.ll8
-rw-r--r--test/CodeGen/X86/win32-eh-states.ll16
-rw-r--r--test/CodeGen/X86/win32-eh.ll50
-rw-r--r--test/CodeGen/X86/win32-seh-catchpad-realign.ll2
-rw-r--r--test/CodeGen/X86/win32-seh-catchpad.ll8
-rw-r--r--test/CodeGen/X86/win32-seh-nested-finally.ll18
-rw-r--r--test/CodeGen/X86/win32_sret.ll47
-rw-r--r--test/CodeGen/X86/win64_eh.ll4
-rw-r--r--test/CodeGen/X86/win_cst_pool.ll26
-rw-r--r--test/CodeGen/X86/x86-16.ll20
-rw-r--r--test/CodeGen/X86/x86-32-intrcc.ll18
-rw-r--r--test/CodeGen/X86/x86-32-vector-calling-conv.ll24
-rw-r--r--test/CodeGen/X86/x86-64-flags-intrinsics.ll2
-rw-r--r--test/CodeGen/X86/x86-64-intrcc.ll21
-rw-r--r--test/CodeGen/X86/x86-64-pic.ll8
-rw-r--r--test/CodeGen/X86/x86-64-plt-relative-reloc.ll19
-rw-r--r--test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll6
-rw-r--r--test/CodeGen/X86/x86-big-ret.ll22
-rw-r--r--test/CodeGen/X86/x86-flags-intrinsics.ll2
-rw-r--r--test/CodeGen/X86/x86-interrupt_cc.ll33
-rw-r--r--test/CodeGen/X86/x86-interrupt_cld.ll17
-rw-r--r--test/CodeGen/X86/x86-interrupt_vzeroupper.ll19
-rw-r--r--test/CodeGen/X86/x86-plt-relative-reloc.ll16
-rw-r--r--test/CodeGen/X86/x86-shrink-wrap-unwind.ll4
-rw-r--r--test/CodeGen/X86/x86-shrink-wrapping.ll2
-rw-r--r--test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll29
-rw-r--r--test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll12
-rw-r--r--test/CodeGen/X86/x87.ll55
-rw-r--r--test/CodeGen/X86/xaluo.ll26
-rw-r--r--test/CodeGen/X86/xmulo.ll18
-rw-r--r--test/CodeGen/X86/xop-intrinsics-fast-isel.ll1111
-rw-r--r--test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll727
-rw-r--r--test/CodeGen/X86/xop-intrinsics-x86_64.ll950
-rw-r--r--test/CodeGen/X86/xop-mask-comments.ll188
-rw-r--r--test/CodeGen/X86/xray-attribute-instrumentation.ll13
-rw-r--r--test/CodeGen/X86/xray-selective-instrumentation-miss.ll9
-rw-r--r--test/CodeGen/X86/xray-selective-instrumentation.ll9
-rw-r--r--test/CodeGen/X86/zext-fold.ll7
-rw-r--r--test/CodeGen/XCore/align.ll4
-rw-r--r--test/CodeGen/XCore/dwarf_debug.ll5
-rw-r--r--test/CodeGen/XCore/epilogue_prologue.ll4
-rw-r--r--test/CodeGen/XCore/linkage.ll5
-rw-r--r--test/CodeGen/XCore/scavenging.ll2
-rw-r--r--test/CodeGen/XCore/threads.ll4
2443 files changed, 204034 insertions, 45557 deletions
diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll
index 94fd386e0eaf..38d30dba4b8c 100644
--- a/test/CodeGen/AArch64/128bit_load_store.ll
+++ b/test/CodeGen/AArch64/128bit_load_store.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
define void @test_store_f128(fp128* %ptr, fp128 %val) #0 {
; CHECK-LABEL: test_store_f128
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
new file mode 100644
index 000000000000..7d416d9b0add
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -0,0 +1,63 @@
+; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+; REQUIRES: global-isel
+; This file checks that the translation from llvm IR to generic MachineInstr
+; is correct.
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-apple-ios"
+
+; Tests for add.
+; CHECK: name: addi64
+; CHECK: [[ARG1:%[0-9]+]](64) = COPY %x0
+; CHECK-NEXT: [[ARG2:%[0-9]+]](64) = COPY %x1
+; CHECK-NEXT: [[RES:%[0-9]+]](64) = G_ADD i64 [[ARG1]], [[ARG2]]
+; CHECK-NEXT: %x0 = COPY [[RES]]
+; CHECK-NEXT: RET_ReallyLR implicit %x0
+define i64 @addi64(i64 %arg1, i64 %arg2) {
+ %res = add i64 %arg1, %arg2
+ ret i64 %res
+}
+
+; Tests for br.
+; CHECK: name: uncondbr
+; CHECK: body:
+;
+; Entry basic block.
+; CHECK: {{[0-9a-zA-Z._-]+}}:
+;
+; Make sure we have one successor and only one.
+; CHECK-NEXT: successors: %[[END:[0-9a-zA-Z._-]+]]({{0x[a-f0-9]+ / 0x[a-f0-9]+}} = 100.00%)
+;
+; Check that we emit the correct branch.
+; CHECK: G_BR label %[[END]]
+;
+; Check that end contains the return instruction.
+; CHECK: [[END]]:
+; CHECK-NEXT: RET_ReallyLR
+define void @uncondbr() {
+ br label %end
+end:
+ ret void
+}
+
+; Tests for or.
+; CHECK: name: ori64
+; CHECK: [[ARG1:%[0-9]+]](64) = COPY %x0
+; CHECK-NEXT: [[ARG2:%[0-9]+]](64) = COPY %x1
+; CHECK-NEXT: [[RES:%[0-9]+]](64) = G_OR i64 [[ARG1]], [[ARG2]]
+; CHECK-NEXT: %x0 = COPY [[RES]]
+; CHECK-NEXT: RET_ReallyLR implicit %x0
+define i64 @ori64(i64 %arg1, i64 %arg2) {
+ %res = or i64 %arg1, %arg2
+ ret i64 %res
+}
+
+; CHECK: name: ori32
+; CHECK: [[ARG1:%[0-9]+]](32) = COPY %w0
+; CHECK-NEXT: [[ARG2:%[0-9]+]](32) = COPY %w1
+; CHECK-NEXT: [[RES:%[0-9]+]](32) = G_OR i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT: %w0 = COPY [[RES]]
+; CHECK-NEXT: RET_ReallyLR implicit %w0
+define i32 @ori32(i32 %arg1, i32 %arg2) {
+ %res = or i32 %arg1, %arg2
+ ret i32 %res
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
new file mode 100644
index 000000000000..f5d85e189d75
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
@@ -0,0 +1,329 @@
+# RUN: llc -O0 -run-pass=regbankselect -global-isel %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+# RUN: llc -O0 -run-pass=regbankselect -global-isel %s -regbankselect-greedy -o - 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY
+# REQUIRES: global-isel
+
+--- |
+ ; ModuleID = 'generic-virtual-registers-type-error.mir'
+ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+ target triple = "aarch64-apple-ios"
+ define void @defaultMapping() {
+ entry:
+ ret void
+ }
+ define void @defaultMappingVector() {
+ entry:
+ ret void
+ }
+ define void @defaultMapping1Repair() {
+ entry:
+ ret void
+ }
+ define void @defaultMapping2Repairs() {
+ entry:
+ ret void
+ }
+ define void @defaultMappingDefRepair() {
+ entry:
+ ret void
+ }
+ define void @phiPropagation(i32* %src, i32* %dst, i1 %cond) {
+ entry:
+ %srcVal = load i32, i32* %src
+ br i1 %cond, label %end, label %then
+ then:
+ %res = add i32 %srcVal, 36
+ br label %end
+ end:
+ %toStore = phi i32 [ %srcVal, %entry ], [ %res, %then ]
+ store i32 %toStore, i32* %dst
+ ret void
+ }
+ define void @defaultMappingUseRepairPhysReg() {
+ entry:
+ ret void
+ }
+ define void @defaultMappingDefRepairPhysReg() {
+ entry:
+ ret void
+ }
+ define void @greedyMappingOr() {
+ entry:
+ ret void
+ }
+ define void @greedyMappingOrWithConstraints() {
+ entry:
+ ret void
+ }
+...
+
+---
+# Check that we assign a relevant register bank for %0.
+# Based on the type i32, this should be gpr.
+name: defaultMapping
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+registers:
+ - { id: 0, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %x0
+ ; CHECK: %0(32) = G_ADD i32 %x0
+ %0(32) = G_ADD i32 %x0, %x0
+...
+
+---
+# Check that we assign a relevant register bank for %0.
+# Based on the type <2 x i32>, this should be fpr.
+# FPR is used for both floating point and vector registers.
+name: defaultMappingVector
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: fpr }
+registers:
+ - { id: 0, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %d0
+ ; CHECK: %0(32) = G_ADD <2 x i32> %d0
+ %0(32) = G_ADD <2 x i32> %d0, %d0
+...
+
+---
+# Check that we repair the assignment for %0.
+# Indeed, based on the source of the copy, it should live
+# in FPR, but at the use, it should be GPR.
+name: defaultMapping1Repair
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: fpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %s0, %x0
+ ; CHECK: %0(32) = COPY %s0
+ ; CHECK-NEXT: %2(32) = COPY %0
+ ; CHECK-NEXT: %1(32) = G_ADD i32 %2, %x0
+ %0(32) = COPY %s0
+ %1(32) = G_ADD i32 %0, %x0
+...
+
+# Check that we repair the assignment for %0 differently for both uses.
+name: defaultMapping2Repairs
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: fpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+# CHECK-NEXT: - { id: 3, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %s0, %x0
+ ; CHECK: %0(32) = COPY %s0
+ ; CHECK-NEXT: %2(32) = COPY %0
+ ; CHECK-NEXT: %3(32) = COPY %0
+ ; CHECK-NEXT: %1(32) = G_ADD i32 %2, %3
+ %0(32) = COPY %s0
+ %1(32) = G_ADD i32 %0, %0
+...
+
+---
+# Check that we repair the definition of %1.
+# %1 is forced into FPR, but its definition actually
+# requires that it lives in GPR. Make sure regbankselect
+# fixes that.
+name: defaultMappingDefRepair
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: fpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: fpr }
+body: |
+ bb.0.entry:
+ liveins: %w0
+ ; CHECK: %0(32) = COPY %w0
+ ; CHECK-NEXT: %2(32) = G_ADD i32 %0, %w0
+ ; CHECK-NEXT: %1(32) = COPY %2
+ %0(32) = COPY %w0
+ %1(32) = G_ADD i32 %0, %w0
+...
+
+---
+# Check that we are able to propagate register banks from phis.
+name: phiPropagation
+isSSA: true
+tracksRegLiveness: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32 }
+# CHECK-NEXT: - { id: 1, class: gpr64sp }
+# CHECK-NEXT: - { id: 2, class: gpr32 }
+# CHECK-NEXT: - { id: 3, class: gpr }
+# CHECK-NEXT: - { id: 4, class: gpr }
+registers:
+ - { id: 0, class: gpr32 }
+ - { id: 1, class: gpr64sp }
+ - { id: 2, class: gpr32 }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+body: |
+ bb.0.entry:
+ successors: %bb.2.end, %bb.1.then
+ liveins: %x0, %x1, %w2
+
+ %0 = LDRWui killed %x0, 0 :: (load 4 from %ir.src)
+ %1 = COPY %x1
+ %2 = COPY %w2
+ TBNZW killed %2, 0, %bb.2.end
+
+ bb.1.then:
+ successors: %bb.2.end
+ %3(32) = G_ADD i32 %0, %0
+
+ bb.2.end:
+ %4(32) = PHI %0, %bb.0.entry, %3, %bb.1.then
+ STRWui killed %4, killed %1, 0 :: (store 4 into %ir.dst)
+ RET_ReallyLR
+...
+
+---
+# Make sure we can repair physical register uses as well.
+name: defaultMappingUseRepairPhysReg
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %w0, %s0
+ ; CHECK: %0(32) = COPY %w0
+ ; CHECK-NEXT: %2(32) = COPY %s0
+ ; CHECK-NEXT: %1(32) = G_ADD i32 %0, %2
+ %0(32) = COPY %w0
+ %1(32) = G_ADD i32 %0, %s0
+...
+
+---
+# Make sure we can repair physical register defs.
+name: defaultMappingDefRepairPhysReg
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+registers:
+ - { id: 0, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %w0
+ ; CHECK: %0(32) = COPY %w0
+ ; CHECK-NEXT: %1(32) = G_ADD i32 %0, %0
+ ; CHECK-NEXT: %s0 = COPY %1
+ %0(32) = COPY %w0
+ %s0 = G_ADD i32 %0, %0
+...
+
+---
+# Check that the greedy mode is able to switch the
+# G_OR instruction from fpr to gpr.
+name: greedyMappingOr
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+
+# Fast mode maps the vector instruction on FPR.
+# FAST-NEXT: - { id: 2, class: fpr }
+# Fast mode needs two extra copies.
+# FAST-NEXT: - { id: 3, class: fpr }
+# FAST-NEXT: - { id: 4, class: fpr }
+
+# Greedy mode coalesces the computation on the GPR register
+# because it is the cheapest.
+# GREEDY-NEXT: - { id: 2, class: gpr }
+
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %x0, %x1
+ ; CHECK: %0(64) = COPY %x0
+ ; CHECK-NEXT: %1(64) = COPY %x1
+
+
+ ; Fast mode tries to reuse the source of the copy for the destination.
+ ; Now, the default mapping says that %0 and %1 need to be in FPR.
+  ; The repairing code inserts two copies to materialize that.
+ ; FAST-NEXT: %3(64) = COPY %0
+ ; FAST-NEXT: %4(64) = COPY %1
+ ; The mapping of G_OR is on FPR.
+ ; FAST-NEXT: %2(64) = G_OR <2 x i32> %3, %4
+
+  ; Greedy mode remaps the instruction onto the GPR bank.
+ ; GREEDY-NEXT: %2(64) = G_OR <2 x i32> %0, %1
+ %0(64) = COPY %x0
+ %1(64) = COPY %x1
+ %2(64) = G_OR <2 x i32> %0, %1
+...
+
+---
+# Check that the greedy mode is able to switch the
+# G_OR instruction from fpr to gpr, while still honoring
+# %2 constraint.
+name: greedyMappingOrWithConstraints
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: fpr }
+
+# Fast mode maps the vector instruction on FPR.
+# Fast mode needs two extra copies.
+# FAST-NEXT: - { id: 3, class: fpr }
+# FAST-NEXT: - { id: 4, class: fpr }
+
+# Greedy mode coalesces the computation on the GPR register because it
+# is the cheapest, but will need one extra copy to materialize %2 into an FPR.
+# GREEDY-NEXT: - { id: 3, class: gpr }
+
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: fpr }
+body: |
+ bb.0.entry:
+ liveins: %x0, %x1
+ ; CHECK: %0(64) = COPY %x0
+ ; CHECK-NEXT: %1(64) = COPY %x1
+
+
+ ; Fast mode tries to reuse the source of the copy for the destination.
+ ; Now, the default mapping says that %0 and %1 need to be in FPR.
+  ; The repairing code inserts two copies to materialize that.
+ ; FAST-NEXT: %3(64) = COPY %0
+ ; FAST-NEXT: %4(64) = COPY %1
+ ; The mapping of G_OR is on FPR.
+ ; FAST-NEXT: %2(64) = G_OR <2 x i32> %3, %4
+
+  ; Greedy mode remaps the instruction onto the GPR bank.
+ ; GREEDY-NEXT: %3(64) = G_OR <2 x i32> %0, %1
+  ; We need to keep %2 in FPR because we do not know anything about it.
+ ; GREEDY-NEXT: %2(64) = COPY %3
+ %0(64) = COPY %x0
+ %1(64) = COPY %x1
+ %2(64) = G_OR <2 x i32> %0, %1
+...
diff --git a/test/CodeGen/AArch64/a57-csel.ll b/test/CodeGen/AArch64/a57-csel.ll
index f5496f777765..3c99a90fe28a 100644
--- a/test/CodeGen/AArch64/a57-csel.ll
+++ b/test/CodeGen/AArch64/a57-csel.ll
@@ -1,8 +1,9 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mcpu=cortex-a57 -aarch64-enable-early-ifcvt=false | FileCheck %s
-; Check that the select is expanded into a branch sequence.
+; Check that the select isn't expanded into a branch sequence
+; when the icmp's first operand %x0 comes from a load.
define i64 @f(i64 %a, i64 %b, i64* %c, i64 %d, i64 %e) {
- ; CHECK: cbz
+ ; CHECK: csel
%x0 = load i64, i64* %c
%x1 = icmp eq i64 %x0, 0
%x2 = select i1 %x1, i64 %a, i64 %b
diff --git a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
index 5eb455f3a22c..d12c4c6f9fae 100644
--- a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
+++ b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
@@ -44,11 +44,10 @@ attributes #1 = { nounwind readnone }
!llvm.module.flags = !{!36, !37}
!llvm.ident = !{!38}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0 ", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "test.c", directory: "")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "", line: 140, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 141, file: !1, scope: !1, type: !6, variables: !12)
+!4 = distinct !DISubprogram(name: "", line: 140, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 141, file: !1, scope: !1, type: !6, variables: !12)
!6 = !DISubroutineType(types: !7)
!7 = !{null, !8}
!8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !9)
diff --git a/test/CodeGen/AArch64/aarch64-DAGCombine-findBetterNeighborChains-crash.ll b/test/CodeGen/AArch64/aarch64-DAGCombine-findBetterNeighborChains-crash.ll
new file mode 100644
index 000000000000..73200b581585
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-DAGCombine-findBetterNeighborChains-crash.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=arm64
+; Make sure we are not crashing on this test.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @extern(i8*)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #0
+
+; Function Attrs: nounwind
+define void @func(float* noalias %arg, i32* noalias %arg1, i8* noalias %arg2, i8* noalias %arg3) #1 {
+bb:
+ %tmp = getelementptr inbounds i8, i8* %arg2, i64 88
+ tail call void @llvm.memset.p0i8.i64(i8* noalias %arg2, i8 0, i64 40, i32 8, i1 false)
+ store i8 0, i8* %arg3
+ store i8 2, i8* %arg2
+ store float 0.000000e+00, float* %arg
+ %tmp4 = bitcast i8* %tmp to <4 x float>*
+ store volatile <4 x float> zeroinitializer, <4 x float>* %tmp4
+ store i32 5, i32* %arg1
+ tail call void @extern(i8* %tmp)
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @func2(float* noalias %arg, i32* noalias %arg1, i8* noalias %arg2, i8* noalias %arg3) #1 {
+bb:
+ %tmp = getelementptr inbounds i8, i8* %arg2, i64 88
+ tail call void @llvm.memset.p0i8.i64(i8* noalias %arg2, i8 0, i64 40, i32 8, i1 false)
+ store i8 0, i8* %arg3
+ store i8 2, i8* %arg2
+ store float 0.000000e+00, float* %arg
+ %tmp4 = bitcast i8* %tmp to <4 x float>*
+ store <4 x float> zeroinitializer, <4 x float>* %tmp4
+ store i32 5, i32* %arg1
+ tail call void @extern(i8* %tmp)
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind "target-cpu"="cortex-a53" }
diff --git a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
index b0e9d4aa7703..29b71e042611 100644
--- a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -1,7 +1,13 @@
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-ODD
-; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
+
+; The following tests use the balance-fp-ops feature, and should be independent of
+; the target cpu.
+
+; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN --check-prefix CHECK-BALFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD --check-prefix CHECK-BALFP
; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
; our test strategy is to:
@@ -75,7 +81,7 @@ entry:
; CHECK: fmsub [[x]]
; CHECK: fmadd [[y]]
; CHECK: fmadd [[x]]
-; CHECK-A57: stp [[x]], [[y]]
+; CHECK-BALFP: stp [[x]], [[y]]
; CHECK-A53-DAG: str [[x]]
; CHECK-A53-DAG: str [[y]]
@@ -170,7 +176,7 @@ declare void @g(...) #1
; CHECK: fmsub [[x]]
; CHECK: fmadd [[y]]
; CHECK: fmadd [[x]]
-; CHECK-A57: stp [[x]], [[y]]
+; CHECK-BALFP: stp [[x]], [[y]]
; CHECK-A53-DAG: str [[x]]
; CHECK-A53-DAG: str [[y]]
diff --git a/test/CodeGen/AArch64/aarch64-be-bv.ll b/test/CodeGen/AArch64/aarch64-be-bv.ll
index fb41156c09df..163a86b9ae4c 100644
--- a/test/CodeGen/AArch64/aarch64-be-bv.ll
+++ b/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -5,7 +5,7 @@
; CHECK-LABEL: movi_modimm_t1:
define i16 @movi_modimm_t1() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #1
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -17,7 +17,7 @@ define i16 @movi_modimm_t1() nounwind {
; CHECK-LABEL: movi_modimm_t2:
define i16 @movi_modimm_t2() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #1, lsl #8
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -29,7 +29,7 @@ define i16 @movi_modimm_t2() nounwind {
; CHECK-LABEL: movi_modimm_t3:
define i16 @movi_modimm_t3() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #1, lsl #16
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -41,7 +41,7 @@ define i16 @movi_modimm_t3() nounwind {
; CHECK-LABEL: movi_modimm_t4:
define i16 @movi_modimm_t4() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #1, lsl #24
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -53,7 +53,7 @@ define i16 @movi_modimm_t4() nounwind {
; CHECK-LABEL: movi_modimm_t5:
define i16 @movi_modimm_t5() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].8h, #0x1
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].8h, #1
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -65,7 +65,7 @@ define i16 @movi_modimm_t5() nounwind {
; CHECK-LABEL: movi_modimm_t6:
define i16 @movi_modimm_t6() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].8h, #1, lsl #8
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -77,7 +77,7 @@ define i16 @movi_modimm_t6() nounwind {
; CHECK-LABEL: movi_modimm_t7:
define i16 @movi_modimm_t7() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, msl #8
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -89,7 +89,7 @@ define i16 @movi_modimm_t7() nounwind {
; CHECK-LABEL: movi_modimm_t8:
define i16 @movi_modimm_t8() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #0x1, msl #16
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].4s, #1, msl #16
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -101,7 +101,7 @@ define i16 @movi_modimm_t8() nounwind {
; CHECK-LABEL: movi_modimm_t9:
define i16 @movi_modimm_t9() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: movi v[[REG2:[0-9]+]].16b, #0x1
+ ; CHECK-NEXT: movi v[[REG2:[0-9]+]].16b, #1
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -149,7 +149,7 @@ define i16 @fmov_modimm_t12() nounwind {
; CHECK-LABEL: mvni_modimm_t1:
define i16 @mvni_modimm_t1() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #1
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -161,7 +161,7 @@ define i16 @mvni_modimm_t1() nounwind {
; CHECK-LABEL: mvni_modimm_t2:
define i16 @mvni_modimm_t2() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #1, lsl #8
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -173,7 +173,7 @@ define i16 @mvni_modimm_t2() nounwind {
; CHECK-LABEL: mvni_modimm_t3:
define i16 @mvni_modimm_t3() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #1, lsl #16
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -185,7 +185,7 @@ define i16 @mvni_modimm_t3() nounwind {
; CHECK-LABEL: mvni_modimm_t4:
define i16 @mvni_modimm_t4() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #1, lsl #24
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -197,7 +197,7 @@ define i16 @mvni_modimm_t4() nounwind {
; CHECK-LABEL: mvni_modimm_t5:
define i16 @mvni_modimm_t5() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].8h, #0x1
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].8h, #1
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -209,7 +209,7 @@ define i16 @mvni_modimm_t5() nounwind {
; CHECK-LABEL: mvni_modimm_t6:
define i16 @mvni_modimm_t6() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].8h, #1, lsl #8
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -221,7 +221,7 @@ define i16 @mvni_modimm_t6() nounwind {
; CHECK-LABEL: mvni_modimm_t7:
define i16 @mvni_modimm_t7() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, msl #8
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -233,7 +233,7 @@ define i16 @mvni_modimm_t7() nounwind {
; CHECK-LABEL: mvni_modimm_t8:
define i16 @mvni_modimm_t8() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #0x1, msl #16
+ ; CHECK-NEXT: mvni v[[REG2:[0-9]+]].4s, #1, msl #16
; CHECK-NEXT: add v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
@@ -245,7 +245,7 @@ define i16 @mvni_modimm_t8() nounwind {
; CHECK-LABEL: bic_modimm_t1:
define i16 @bic_modimm_t1() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #1
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = and <8 x i16> %in, <i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535>
@@ -256,7 +256,7 @@ define i16 @bic_modimm_t1() nounwind {
; CHECK-LABEL: bic_modimm_t2:
define i16 @bic_modimm_t2() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #1, lsl #8
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = and <8 x i16> %in, <i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535>
@@ -267,7 +267,7 @@ define i16 @bic_modimm_t2() nounwind {
; CHECK-LABEL: bic_modimm_t3:
define i16 @bic_modimm_t3() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #1, lsl #16
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = and <8 x i16> %in, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
@@ -278,7 +278,7 @@ define i16 @bic_modimm_t3() nounwind {
; CHECK-LABEL: bic_modimm_t4:
define i16 @bic_modimm_t4() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].4s, #1, lsl #24
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = and <8 x i16> %in, <i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279>
@@ -289,7 +289,7 @@ define i16 @bic_modimm_t4() nounwind {
; CHECK-LABEL: bic_modimm_t5:
define i16 @bic_modimm_t5() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: bic v[[REG2:[0-9]+]].8h, #0x1
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].8h, #1
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = and <8 x i16> %in, <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>
@@ -300,7 +300,7 @@ define i16 @bic_modimm_t5() nounwind {
; CHECK-LABEL: bic_modimm_t6:
define i16 @bic_modimm_t6() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: bic v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+ ; CHECK-NEXT: bic v[[REG2:[0-9]+]].8h, #1, lsl #8
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = and <8 x i16> %in, <i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279>
@@ -311,7 +311,7 @@ define i16 @bic_modimm_t6() nounwind {
; CHECK-LABEL: orr_modimm_t1:
define i16 @orr_modimm_t1() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #1
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = or <8 x i16> %in, <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0>
@@ -322,7 +322,7 @@ define i16 @orr_modimm_t1() nounwind {
; CHECK-LABEL: orr_modimm_t2:
define i16 @orr_modimm_t2() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #1, lsl #8
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = or <8 x i16> %in, <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0>
@@ -333,7 +333,7 @@ define i16 @orr_modimm_t2() nounwind {
; CHECK-LABEL: orr_modimm_t3:
define i16 @orr_modimm_t3() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #1, lsl #16
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = or <8 x i16> %in, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>
@@ -344,7 +344,7 @@ define i16 @orr_modimm_t3() nounwind {
; CHECK-LABEL: orr_modimm_t4:
define i16 @orr_modimm_t4() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].4s, #1, lsl #24
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = or <8 x i16> %in, <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256>
@@ -355,7 +355,7 @@ define i16 @orr_modimm_t4() nounwind {
; CHECK-LABEL: orr_modimm_t5:
define i16 @orr_modimm_t5() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: orr v[[REG2:[0-9]+]].8h, #0x1
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].8h, #1
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = or <8 x i16> %in, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -366,7 +366,7 @@ define i16 @orr_modimm_t5() nounwind {
; CHECK-LABEL: orr_modimm_t6:
define i16 @orr_modimm_t6() nounwind {
; CHECK: ld1 { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
- ; CHECK-NEXT: orr v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+ ; CHECK-NEXT: orr v[[REG2:[0-9]+]].8h, #1, lsl #8
; CHECK-NEXT: umov w{{[0-9]+}}, v[[REG1]].h[0]
%in = load <8 x i16>, <8 x i16>* @vec_v8i16
%rv = or <8 x i16> %in, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
@@ -385,37 +385,37 @@ declare i64 @f_v2i64(<2 x i64> %arg)
; CHECK-LABEL: modimm_t1_call:
define void @modimm_t1_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 8, i8 0, i8 0, i8 0, i8 8, i8 0, i8 0, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #7
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 7, i16 0, i16 7, i16 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #6
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 6, i32 6>)
- ; CHECK: movi v{{[0-9]+}}.2s, #0x5
+ ; CHECK: movi v{{[0-9]+}}.2s, #5
; CHECK-NEXT: bl f_v1i64
call i64 @f_v1i64(<1 x i64> <i64 21474836485>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #5
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #4
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 4, i16 0, i16 4, i16 0, i16 4, i16 0, i16 4, i16 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #3
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
call i32 @f_v4i32(<4 x i32> <i32 3, i32 3, i32 3, i32 3>)
- ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #2
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v2i64
call i64 @f_v2i64(<2 x i64> <i64 8589934594, i64 8589934594>)
@@ -425,37 +425,37 @@ define void @modimm_t1_call() {
; CHECK-LABEL: modimm_t2_call:
define void @modimm_t2_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #8, lsl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 0, i8 8, i8 0, i8 0, i8 0, i8 8, i8 0, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #7, lsl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 1792, i16 0, i16 1792, i16 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #6, lsl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 1536, i32 1536>)
- ; CHECK: movi v{{[0-9]+}}.2s, #0x5, lsl #8
+ ; CHECK: movi v{{[0-9]+}}.2s, #5, lsl #8
; CHECK-NEXT: bl f_v1i64
call i64 @f_v1i64(<1 x i64> <i64 5497558140160>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #5, lsl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #4, lsl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #3, lsl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
call i32 @f_v4i32(<4 x i32> <i32 768, i32 768, i32 768, i32 768>)
- ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, lsl #8
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #2, lsl #8
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v2i64
call i64 @f_v2i64(<2 x i64> <i64 2199023256064, i64 2199023256064>)
@@ -465,37 +465,37 @@ define void @modimm_t2_call() {
; CHECK-LABEL: modimm_t3_call:
define void @modimm_t3_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, lsl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #8, lsl #16
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 8, i8 0, i8 0, i8 0, i8 8, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, lsl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #7, lsl #16
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 0, i16 7, i16 0, i16 7>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, lsl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #6, lsl #16
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 393216, i32 393216>)
- ; CHECK: movi v{{[0-9]+}}.2s, #0x5, lsl #16
+ ; CHECK: movi v{{[0-9]+}}.2s, #5, lsl #16
; CHECK-NEXT: bl f_v1i64
call i64 @f_v1i64(<1 x i64> <i64 1407374883880960>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, lsl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #5, lsl #16
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, lsl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #4, lsl #16
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 0, i16 4, i16 0, i16 4, i16 0, i16 4, i16 0, i16 4>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, lsl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #3, lsl #16
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
call i32 @f_v4i32(<4 x i32> <i32 196608, i32 196608, i32 196608, i32 196608>)
- ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, lsl #16
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #2, lsl #16
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v2i64
call i64 @f_v2i64(<2 x i64> <i64 562949953552384, i64 562949953552384>)
@@ -505,37 +505,37 @@ define void @modimm_t3_call() {
; CHECK-LABEL: modimm_t4_call:
define void @modimm_t4_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, lsl #24
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #8, lsl #24
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 0, i8 8, i8 0, i8 0, i8 0, i8 8>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, lsl #24
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #7, lsl #24
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 0, i16 1792, i16 0, i16 1792>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, lsl #24
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #6, lsl #24
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 100663296, i32 100663296>)
- ; CHECK: movi v{{[0-9]+}}.2s, #0x5, lsl #24
+ ; CHECK: movi v{{[0-9]+}}.2s, #5, lsl #24
; CHECK-NEXT: bl f_v1i64
call i64 @f_v1i64(<1 x i64> <i64 360287970273525760>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, lsl #24
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #5, lsl #24
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, lsl #24
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #4, lsl #24
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, lsl #24
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #3, lsl #24
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
call i32 @f_v4i32(<4 x i32> <i32 50331648, i32 50331648, i32 50331648, i32 50331648>)
- ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, lsl #24
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #2, lsl #24
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v2i64
call i64 @f_v2i64(<2 x i64> <i64 144115188109410304, i64 144115188109410304>)
@@ -545,37 +545,37 @@ define void @modimm_t4_call() {
; CHECK-LABEL: modimm_t5_call:
define void @modimm_t5_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x8
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 8, i8 0, i8 8, i8 0, i8 8, i8 0, i8 8, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x7
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #7
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 7, i16 7, i16 7, i16 7>)
- ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x6
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #6
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 393222, i32 393222>)
- ; CHECK: movi v{{[0-9]+}}.4h, #0x5
+ ; CHECK: movi v{{[0-9]+}}.4h, #5
; CHECK-NEXT: bl f_v1i64
call i64 @f_v1i64(<1 x i64> <i64 1407396358717445>)
- ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x5
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #5
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x4
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #4
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>)
- ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x3
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #3
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
call i32 @f_v4i32(<4 x i32> <i32 196611, i32 196611, i32 196611, i32 196611>)
- ; CHECK: movi v[[REG:[0-9]+]].8h, #0x2
+ ; CHECK: movi v[[REG:[0-9]+]].8h, #2
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v2i64
call i64 @f_v2i64(<2 x i64> <i64 562958543486978, i64 562958543486978>)
@@ -585,37 +585,37 @@ define void @modimm_t5_call() {
; CHECK-LABEL: modimm_t6_call:
define void @modimm_t6_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x8, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #8, lsl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 0, i8 8, i8 0, i8 8, i8 0, i8 8, i8 0, i8 8>)
- ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x7, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #7, lsl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 1792, i16 1792, i16 1792, i16 1792>)
- ; CHECK: movi v[[REG1:[0-9]+]].4h, #0x6, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4h, #6, lsl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 100664832, i32 100664832>)
- ; CHECK: movi v{{[0-9]+}}.4h, #0x5, lsl #8
+ ; CHECK: movi v{{[0-9]+}}.4h, #5, lsl #8
; CHECK-NEXT: bl f_v1i64
call i64 @f_v1i64(<1 x i64> <i64 360293467831665920>)
- ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x5, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #5, lsl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5>)
- ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x4, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #4, lsl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024>)
- ; CHECK: movi v[[REG1:[0-9]+]].8h, #0x3, lsl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].8h, #3, lsl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
call i32 @f_v4i32(<4 x i32> <i32 50332416, i32 50332416, i32 50332416, i32 50332416>)
- ; CHECK: movi v[[REG:[0-9]+]].8h, #0x2, lsl #8
+ ; CHECK: movi v[[REG:[0-9]+]].8h, #2, lsl #8
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v2i64
call i64 @f_v2i64(<2 x i64> <i64 144117387132666368, i64 144117387132666368>)
@@ -625,37 +625,37 @@ define void @modimm_t6_call() {
; CHECK-LABEL: modimm_t7_call:
define void @modimm_t7_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, msl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #8, msl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 255, i8 8, i8 0, i8 0, i8 255, i8 8, i8 0, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, msl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #7, msl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 2047, i16 0, i16 2047, i16 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, msl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #6, msl #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 1791, i32 1791>)
- ; CHECK: movi v{{[0-9]+}}.2s, #0x5, msl #8
+ ; CHECK: movi v{{[0-9]+}}.2s, #5, msl #8
; CHECK-NEXT: bl f_v1i64
call i64 @f_v1i64(<1 x i64> <i64 6592774800895>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, msl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #5, msl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, msl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #4, msl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 1279, i16 0, i16 1279, i16 0, i16 1279, i16 0, i16 1279, i16 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, msl #8
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #3, msl #8
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
call i32 @f_v4i32(<4 x i32> <i32 1023, i32 1023, i32 1023, i32 1023>)
- ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, msl #8
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #2, msl #8
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v2i64
call i64 @f_v2i64(<2 x i64> <i64 3294239916799, i64 3294239916799>)
@@ -665,37 +665,37 @@ define void @modimm_t7_call() {
; CHECK-LABEL: modimm_t8_call:
define void @modimm_t8_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x8, msl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #8, msl #16
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 255, i8 255, i8 8, i8 0, i8 255, i8 255, i8 8, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x7, msl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #7, msl #16
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 65535, i16 7, i16 65535, i16 7>)
- ; CHECK: movi v[[REG1:[0-9]+]].2s, #0x6, msl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].2s, #6, msl #16
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 458751, i32 458751>)
- ; CHECK: movi v{{[0-9]+}}.2s, #0x5, msl #16
+ ; CHECK: movi v{{[0-9]+}}.2s, #5, msl #16
; CHECK-NEXT: bl f_v1i64
call i64 @f_v1i64(<1 x i64> <i64 1688845565689855>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x5, msl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #5, msl #16
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x4, msl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #4, msl #16
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 65535, i16 4, i16 65535, i16 4, i16 65535, i16 4, i16 65535, i16 4>)
- ; CHECK: movi v[[REG1:[0-9]+]].4s, #0x3, msl #16
+ ; CHECK: movi v[[REG1:[0-9]+]].4s, #3, msl #16
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
call i32 @f_v4i32(<4 x i32> <i32 262143, i32 262143, i32 262143, i32 262143>)
- ; CHECK: movi v[[REG:[0-9]+]].4s, #0x2, msl #16
+ ; CHECK: movi v[[REG:[0-9]+]].4s, #2, msl #16
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v2i64
call i64 @f_v2i64(<2 x i64> <i64 844420635361279, i64 844420635361279>)
@@ -705,29 +705,29 @@ define void @modimm_t8_call() {
; CHECK-LABEL: modimm_t9_call:
define void @modimm_t9_call() {
- ; CHECK: movi v[[REG1:[0-9]+]].8b, #0x8
+ ; CHECK: movi v[[REG1:[0-9]+]].8b, #8
; CHECK-NEXT: rev64 v{{[0-9]+}}.8b, v[[REG1]].8b
; CHECK-NEXT: bl f_v8i8
call i8 @f_v8i8(<8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
- ; CHECK: movi v[[REG1:[0-9]+]].8b, #0x7
+ ; CHECK: movi v[[REG1:[0-9]+]].8b, #7
; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
; CHECK-NEXT: bl f_v4i16
call i16 @f_v4i16(<4 x i16> <i16 1799, i16 1799, i16 1799, i16 1799>)
- ; CHECK: movi v[[REG1:[0-9]+]].8b, #0x6
+ ; CHECK: movi v[[REG1:[0-9]+]].8b, #6
; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
; CHECK-NEXT: bl f_v2i32
call i32 @f_v2i32(<2 x i32> <i32 101058054, i32 101058054>)
- ; CHECK: movi v[[REG1:[0-9]+]].16b, #0x5
+ ; CHECK: movi v[[REG1:[0-9]+]].16b, #5
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].16b, v[[REG1]].16b
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v16i8
call i8 @f_v16i8(<16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>)
- ; CHECK: movi v[[REG1:[0-9]+]].16b, #0x4
+ ; CHECK: movi v[[REG1:[0-9]+]].16b, #4
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].8h, v[[REG1]].8h
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v8i16
call i16 @f_v8i16(<8 x i16> <i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028>)
- ; CHECK: movi v[[REG1:[0-9]+]].16b, #0x3
+ ; CHECK: movi v[[REG1:[0-9]+]].16b, #3
; CHECK-NEXT: rev64 v[[REG2:[0-9]+]].4s, v[[REG1]].4s
; CHECK-NEXT: ext v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
; CHECK-NEXT: bl f_v4i32
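The aarch64-be-bv.ll hunks above all track a single printing change: vector movi immediates are now emitted in decimal (#8) rather than hex (#0x8). A minimal sketch of the updated check pattern, using a hypothetical standalone test rather than anything from this commit:

; Illustrative sketch, not part of this commit: a constant splat selects a
; byte-wide movi, whose immediate now prints in decimal.
; RUN: llc -mtriple=aarch64--linux-gnu < %s | FileCheck %s
; CHECK-LABEL: splat_v8i8:
; CHECK: movi v{{[0-9]+}}.8b, #8
define <8 x i8> @splat_v8i8() {
  ret <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
}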
diff --git a/test/CodeGen/AArch64/aarch64-deferred-spilling.ll b/test/CodeGen/AArch64/aarch64-deferred-spilling.ll
deleted file mode 100644
index 7accdced7d44..000000000000
--- a/test/CodeGen/AArch64/aarch64-deferred-spilling.ll
+++ /dev/null
@@ -1,514 +0,0 @@
-;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED
-;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR
-
-; Check that we do not end up with useless spill code.
-;
-; Move to the basic block we are interested in.
-;
-; CHECK: // %if.then.120
-;
-; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill
-; Check that w21 wouldn't need to be spilled since it is never reused.
-; REGULAR-NOT: {{[wx]}}21{{,?}}
-;
-; Check that w22 is used to carry a value through the call.
-; DEFERRED-NOT: str {{[wx]}}22,
-; DEFERRED: mov {{[wx]}}22,
-; DEFERRED-NOT: str {{[wx]}}22,
-;
-; CHECK: bl fprintf
-;
-; DEFERRED-NOT: ldr {{[wx]}}22,
-; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22
-; DEFERRED-NOT: ldr {{[wx]}}22,
-;
-; REGULAR-NOT: {{[wx]}}21{{,?}}
-; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload
-;
-; End of the basic block we are interested in.
-; CHECK: b
-; CHECK: {{[^:]+}}: // %sw.bb.123
-
-%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
-%struct.__sbuf = type { i8*, i64 }
-%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* }
-%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* }
-
-@__sF = external global [0 x %struct.__sFILE], align 8
-@.str = private unnamed_addr constant [20 x i8] c"\0A [%d: stuff+mf \00", align 1
-
-declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...)
-
-declare void @bar(i32)
-
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
-
-define i32 @foo(%struct.DState* %s) {
-entry:
- %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1
- %tmp = load i32, i32* %state, align 4
- %cmp = icmp eq i32 %tmp, 10
- %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40
- br i1 %cmp, label %if.end.thread, label %if.end
-
-if.end.thread: ; preds = %entry
- %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
- %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
- %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
- %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
- %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
- %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
- %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
- %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
- %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
- %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
- %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
- %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
- %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
- %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
- %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
- %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
- %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
- %save_zj = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
- %tmp1 = bitcast i32* %save_i to i8*
- call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false)
- br label %sw.default
-
-if.end: ; preds = %entry
- %.pre = load i32, i32* %save_i, align 4
- %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
- %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4
- %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
- %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4
- %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
- %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4
- %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
- %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4
- %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
- %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4
- %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
- %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4
- %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
- %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4
- %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
- %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4
- %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
- %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4
- %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
- %.pre415 = load i32, i32* %save_nblockMAX12.phi.trans.insert, align 4
- %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
- %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4
- %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
- %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4
- %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
- %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4
- %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
- %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4
- %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
- %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4
- %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
- %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4
- %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
- %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4
- %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
- %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4
- switch i32 %tmp, label %sw.default [
- i32 13, label %sw.bb
- i32 14, label %if.end.sw.bb.65_crit_edge
- i32 25, label %if.end.sw.bb.123_crit_edge
- ]
-
-if.end.sw.bb.123_crit_edge: ; preds = %if.end
- %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
- br label %sw.bb.123
-
-if.end.sw.bb.65_crit_edge: ; preds = %if.end
- %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
- %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4
- br label %sw.bb.65
-
-sw.bb: ; preds = %if.end
- %sunkaddr = ptrtoint %struct.DState* %s to i64
- %sunkaddr485 = add i64 %sunkaddr, 8
- %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32*
- store i32 13, i32* %sunkaddr486, align 4
- %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
- %tmp2 = load i32, i32* %bsLive, align 4
- %cmp28.400 = icmp sgt i32 %tmp2, 7
- br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph
-
-sw.bb.if.then.29_crit_edge: ; preds = %sw.bb
- %sunkaddr487 = ptrtoint %struct.DState* %s to i64
- %sunkaddr488 = add i64 %sunkaddr487, 32
- %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32*
- %.pre425 = load i32, i32* %sunkaddr489, align 4
- br label %if.then.29
-
-if.end.33.lr.ph: ; preds = %sw.bb
- %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream**
- %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8
- %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1
- %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4
- %tmp4 = add i32 %.pre430, -1
- br label %if.end.33
-
-if.then.29: ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge
- %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ]
- %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ]
- %sub = add nsw i32 %.lcssa393, -8
- %shr = lshr i32 %tmp5, %sub
- %and = and i32 %shr, 255
- %sunkaddr491 = ptrtoint %struct.DState* %s to i64
- %sunkaddr492 = add i64 %sunkaddr491, 36
- %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32*
- store i32 %sub, i32* %sunkaddr493, align 4
- %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9
- store i32 %and, i32* %blockSize100k, align 4
- %and.off = add nsw i32 %and, -49
- %tmp6 = icmp ugt i32 %and.off, 8
- br i1 %tmp6, label %save_state_and_return, label %if.end.62
-
-if.end.33: ; preds = %while.body.backedge, %if.end.33.lr.ph
- %lsr.iv482 = phi i32 [ %tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ]
- %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ]
- %cmp35 = icmp eq i32 %lsr.iv482, -1
- br i1 %cmp35, label %save_state_and_return, label %if.end.37
-
-if.end.37: ; preds = %if.end.33
- %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8**
- %sunkaddr494 = ptrtoint %struct.DState* %s to i64
- %sunkaddr495 = add i64 %sunkaddr494, 32
- %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32*
- %tmp9 = load i32, i32* %sunkaddr496, align 4
- %shl = shl i32 %tmp9, 8
- %tmp10 = load i8*, i8** %tmp8, align 8
- %tmp11 = load i8, i8* %tmp10, align 1
- %conv = zext i8 %tmp11 to i32
- %or = or i32 %conv, %shl
- store i32 %or, i32* %sunkaddr496, align 4
- %add = add nsw i32 %tmp7, 8
- %sunkaddr497 = ptrtoint %struct.DState* %s to i64
- %sunkaddr498 = add i64 %sunkaddr497, 36
- %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32*
- store i32 %add, i32* %sunkaddr499, align 4
- %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1
- store i8* %incdec.ptr, i8** %tmp8, align 8
- %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64
- %sunkaddr501 = add i64 %sunkaddr500, 8
- %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32*
- store i32 %lsr.iv482, i32* %sunkaddr502, align 4
- %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64
- %sunkaddr504 = add i64 %sunkaddr503, 12
- %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32*
- %tmp12 = load i32, i32* %sunkaddr505, align 4
- %inc = add i32 %tmp12, 1
- store i32 %inc, i32* %sunkaddr505, align 4
- %cmp49 = icmp eq i32 %inc, 0
- br i1 %cmp49, label %if.then.51, label %while.body.backedge
-
-if.then.51: ; preds = %if.end.37
- %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64
- %sunkaddr507 = add i64 %sunkaddr506, 16
- %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32*
- %tmp13 = load i32, i32* %sunkaddr508, align 4
- %inc53 = add i32 %tmp13, 1
- store i32 %inc53, i32* %sunkaddr508, align 4
- br label %while.body.backedge
-
-while.body.backedge: ; preds = %if.then.51, %if.end.37
- %lsr.iv.next483 = add i32 %lsr.iv482, -1
- %cmp28 = icmp sgt i32 %add, 7
- br i1 %cmp28, label %if.then.29, label %if.end.33
-
-if.end.62: ; preds = %if.then.29
- %sub64 = add nsw i32 %and, -48
- %sunkaddr509 = ptrtoint %struct.DState* %s to i64
- %sunkaddr510 = add i64 %sunkaddr509, 40
- %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32*
- store i32 %sub64, i32* %sunkaddr511, align 4
- br label %sw.bb.65
-
-sw.bb.65: ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge
- %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ]
- %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ]
- %sunkaddr512 = ptrtoint %struct.DState* %s to i64
- %sunkaddr513 = add i64 %sunkaddr512, 8
- %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32*
- store i32 14, i32* %sunkaddr514, align 4
- %cmp70.397 = icmp sgt i32 %tmp14, 7
- br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph
-
-if.end.82.lr.ph: ; preds = %sw.bb.65
- %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream**
- %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8
- %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1
- %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4
- %tmp16 = add i32 %.pre431, -1
- br label %if.end.82
-
-if.then.72: ; preds = %while.body.68.backedge, %sw.bb.65
- %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ]
- %sub76 = add nsw i32 %.lcssa390, -8
- %sunkaddr516 = ptrtoint %struct.DState* %s to i64
- %sunkaddr517 = add i64 %sunkaddr516, 36
- %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32*
- store i32 %sub76, i32* %sunkaddr518, align 4
- %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11
- %tmp17 = load i32, i32* %currBlockNo, align 4
- %inc117 = add nsw i32 %tmp17, 1
- store i32 %inc117, i32* %currBlockNo, align 4
- %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12
- %tmp18 = load i32, i32* %verbosity, align 4
- %cmp118 = icmp sgt i32 %tmp18, 1
- br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0
-
-if.end.82: ; preds = %while.body.68.backedge, %if.end.82.lr.ph
- %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ]
- %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ]
- %cmp85 = icmp eq i32 %lsr.iv480, -1
- br i1 %cmp85, label %save_state_and_return, label %if.end.88
-
-if.end.88: ; preds = %if.end.82
- %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8**
- %sunkaddr519 = ptrtoint %struct.DState* %s to i64
- %sunkaddr520 = add i64 %sunkaddr519, 32
- %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32*
- %tmp21 = load i32, i32* %sunkaddr521, align 4
- %shl90 = shl i32 %tmp21, 8
- %tmp22 = load i8*, i8** %tmp20, align 8
- %tmp23 = load i8, i8* %tmp22, align 1
- %conv93 = zext i8 %tmp23 to i32
- %or94 = or i32 %conv93, %shl90
- store i32 %or94, i32* %sunkaddr521, align 4
- %add97 = add nsw i32 %tmp19, 8
- %sunkaddr522 = ptrtoint %struct.DState* %s to i64
- %sunkaddr523 = add i64 %sunkaddr522, 36
- %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32*
- store i32 %add97, i32* %sunkaddr524, align 4
- %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1
- store i8* %incdec.ptr100, i8** %tmp20, align 8
- %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64
- %sunkaddr526 = add i64 %sunkaddr525, 8
- %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32*
- store i32 %lsr.iv480, i32* %sunkaddr527, align 4
- %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64
- %sunkaddr529 = add i64 %sunkaddr528, 12
- %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32*
- %tmp24 = load i32, i32* %sunkaddr530, align 4
- %inc106 = add i32 %tmp24, 1
- store i32 %inc106, i32* %sunkaddr530, align 4
- %cmp109 = icmp eq i32 %inc106, 0
- br i1 %cmp109, label %if.then.111, label %while.body.68.backedge
-
-if.then.111: ; preds = %if.end.88
- %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64
- %sunkaddr532 = add i64 %sunkaddr531, 16
- %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32*
- %tmp25 = load i32, i32* %sunkaddr533, align 4
- %inc114 = add i32 %tmp25, 1
- store i32 %inc114, i32* %sunkaddr533, align 4
- br label %while.body.68.backedge
-
-while.body.68.backedge: ; preds = %if.then.111, %if.end.88
- %lsr.iv.next481 = add i32 %lsr.iv480, -1
- %cmp70 = icmp sgt i32 %add97, 7
- br i1 %cmp70, label %if.then.72, label %if.end.82
-
-if.then.120: ; preds = %if.then.72
- %call = tail call i32 (%struct.__sFILE*, i8*, ...) @fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117)
- br label %sw.bb.123
-
-sw.bb.123: ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge
- %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ]
- %sunkaddr534 = ptrtoint %struct.DState* %s to i64
- %sunkaddr535 = add i64 %sunkaddr534, 8
- %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32*
- store i32 25, i32* %sunkaddr536, align 4
- %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4
- %cmp128.395 = icmp sgt i32 %tmp26, 7
- br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph
-
-sw.bb.123.if.then.130_crit_edge: ; preds = %sw.bb.123
- %sunkaddr537 = ptrtoint %struct.DState* %s to i64
- %sunkaddr538 = add i64 %sunkaddr537, 32
- %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32*
- %.pre429 = load i32, i32* %sunkaddr539, align 4
- br label %if.then.130
-
-if.end.140.lr.ph: ; preds = %sw.bb.123
- %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream**
- %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8
- %avail_in142.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1
- %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4
- %tmp28 = add i32 %.pre432, -1
- br label %if.end.140
-
-if.then.130: ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge
- %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ]
- %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ]
- %sub134 = add nsw i32 %.lcssa, -8
- %shr135 = lshr i32 %tmp29, %sub134
- store i32 %sub134, i32* %bsLive127.pre-phi, align 4
- %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13
- %tmp30 = load i32, i32* %origPtr, align 4
- %shl175 = shl i32 %tmp30, 8
- %conv176 = and i32 %shr135, 255
- %or177 = or i32 %shl175, %conv176
- store i32 %or177, i32* %origPtr, align 4
- %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27
- %tmp31 = load i32, i32* %nInUse, align 4
- %add179 = add nsw i32 %tmp31, 2
- br label %save_state_and_return
-
-if.end.140: ; preds = %while.body.126.backedge, %if.end.140.lr.ph
- %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ]
- %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ]
- %cmp143 = icmp eq i32 %lsr.iv, -1
- br i1 %cmp143, label %save_state_and_return, label %if.end.146
-
-if.end.146: ; preds = %if.end.140
- %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8**
- %sunkaddr541 = ptrtoint %struct.DState* %s to i64
- %sunkaddr542 = add i64 %sunkaddr541, 32
- %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32*
- %tmp34 = load i32, i32* %sunkaddr543, align 4
- %shl148 = shl i32 %tmp34, 8
- %tmp35 = load i8*, i8** %tmp33, align 8
- %tmp36 = load i8, i8* %tmp35, align 1
- %conv151 = zext i8 %tmp36 to i32
- %or152 = or i32 %conv151, %shl148
- store i32 %or152, i32* %sunkaddr543, align 4
- %add155 = add nsw i32 %tmp32, 8
- store i32 %add155, i32* %bsLive127.pre-phi, align 4
- %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1
- store i8* %incdec.ptr158, i8** %tmp33, align 8
- %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64
- %sunkaddr545 = add i64 %sunkaddr544, 8
- %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32*
- store i32 %lsr.iv, i32* %sunkaddr546, align 4
- %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64
- %sunkaddr548 = add i64 %sunkaddr547, 12
- %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32*
- %tmp37 = load i32, i32* %sunkaddr549, align 4
- %inc164 = add i32 %tmp37, 1
- store i32 %inc164, i32* %sunkaddr549, align 4
- %cmp167 = icmp eq i32 %inc164, 0
- br i1 %cmp167, label %if.then.169, label %while.body.126.backedge
-
-if.then.169: ; preds = %if.end.146
- %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64
- %sunkaddr551 = add i64 %sunkaddr550, 16
- %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32*
- %tmp38 = load i32, i32* %sunkaddr552, align 4
- %inc172 = add i32 %tmp38, 1
- store i32 %inc172, i32* %sunkaddr552, align 4
- br label %while.body.126.backedge
-
-while.body.126.backedge: ; preds = %if.then.169, %if.end.146
- %lsr.iv.next = add i32 %lsr.iv, -1
- %cmp128 = icmp sgt i32 %add155, 7
- br i1 %cmp128, label %if.then.130, label %if.end.140
-
-sw.default: ; preds = %if.end, %if.end.thread
- %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ]
- %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ]
- %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ]
- %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ]
- %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ]
- %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ]
- %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ]
- %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ]
- %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ]
- %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ]
- %tmp49 = phi i32 [ 0, %if.end.thread ], [ %.pre415, %if.end ]
- %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ]
- %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ]
- %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ]
- %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ]
- %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ]
- %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ]
- %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ]
- %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ]
- %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ]
- %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ]
- %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ]
- %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ]
- %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ]
- %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ]
- %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ]
- %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ]
- %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ]
- %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ]
- %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ %save_nblock13.phi.trans.insert, %if.end ]
- %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ]
- %save_N15.pre-phi445 = phi i32* [ %save_N, %if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ]
- %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ]
- %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ]
- %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ]
- %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ]
- %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ]
- tail call void @bar(i32 4001)
- br label %save_state_and_return
-
-save_state_and_return: ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29
- %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ]
- %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ]
- %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ]
- %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ]
- %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ]
- %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ]
- %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 ]
- %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ]
- %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ]
- %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ]
- %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ]
- %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ]
- %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ]
- %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ]
- %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, %if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ]
- %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ]
- %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ]
- %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ %save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ]
- %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ]
- %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ]
- %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ]
- %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ]
- %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ]
- %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ %save_groupNo9.phi.trans.insert, %if.end.33 ]
- %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ]
- %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ]
- %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ]
- %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ]
- %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ]
- %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, %if.end.33 ]
- %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ]
- %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ]
- %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ]
- %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ]
- %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ]
- %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ]
- %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ]
- %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ]
- store i32 %tmp58, i32* %save_i, align 4
- store i32 %tmp59, i32* %save_j3.pre-phi468, align 4
- store i32 %tmp60, i32* %save_t4.pre-phi466, align 4
- store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4
- store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4
- store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4
- store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4
- store i32 %tmp64, i32* %save_groupNo9.pre-phi456, align 4
- store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4
- store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4
- store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4
- store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4
- store i32 %tmp68, i32* %save_es14.pre-phi446, align 4
- store i32 %tmp69, i32* %save_N15.pre-phi444, align 4
- store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4
- store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4
- store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4
- store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4
- store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4
- ret i32 %retVal.0
-}
-
-!0 = !{!"branch_weights", i32 10, i32 1}
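For reference, the deleted test ran llc twice over the same IR and stacked FileCheck prefixes so that shared checks (CHECK) and mode-specific checks (DEFERRED, REGULAR) could live in one file. A minimal sketch of that RUN/prefix layout, reduced from the lines removed above (the label and sample checks are illustrative):

; Reduced from the removed RUN lines; the IR body is omitted here.
; RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED
; RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR
; CHECK-LABEL: foo:
; DEFERRED-NOT: str w22,
; REGULAR: str w21,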
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index 90093f94d0ad..708ae083eb86 100644
--- a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -disable-post-ra < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios -disable-fp-elim -disable-post-ra < %s | FileCheck %s --check-prefix=CHECK-MACHO
; This test aims to check basic correctness of frame layout &
; frame access code. There are 8 functions in this test file,
@@ -97,27 +98,47 @@ entry:
; CHECK-LABEL: novla_nodynamicrealign_call
; CHECK: .cfi_startproc
; Check that used callee-saved registers are saved
-; CHECK: stp x20, x19, [sp, #-32]!
-; Check that the frame pointer is created:
-; CHECK: stp x29, x30, [sp, #16]
-; CHECK: add x29, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK: stp x19, x30, [sp, #16]
; Check correctness of cfi pseudo-instructions
-; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_def_cfa_offset 32
; CHECK: .cfi_offset w30, -8
-; CHECK: .cfi_offset w29, -16
-; CHECK: .cfi_offset w19, -24
-; CHECK: .cfi_offset w20, -32
-; Check correct access to arguments passed on the stack, through frame pointer
-; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
-; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: .cfi_offset w19, -16
+; Check correct access to arguments passed on the stack, through stack pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [sp, #56]
+; CHECK: ldr w[[IARG:[0-9]+]], [sp, #40]
; Check correct access to local variable on the stack, through stack pointer
; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12]
; Check epilogue:
-; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ldp x19, x30, [sp, #16]
; CHECK: ret
; CHECK: .cfi_endproc
+; CHECK-MACHO-LABEL: _novla_nodynamicrealign_call:
+; CHECK-MACHO: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK-MACHO: sub sp, sp, #48
+; CHECK-MACHO: stp x20, x19, [sp, #16]
+; Check that the frame pointer is created:
+; CHECK-MACHO: stp x29, x30, [sp, #32]
+; CHECK-MACHO: add x29, sp, #32
+; Check correctness of cfi pseudo-instructions
+; CHECK-MACHO: .cfi_def_cfa w29, 16
+; CHECK-MACHO: .cfi_offset w30, -8
+; CHECK-MACHO: .cfi_offset w29, -16
+; CHECK-MACHO: .cfi_offset w19, -24
+; CHECK-MACHO: .cfi_offset w20, -32
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK-MACHO: ldr d[[DARG:[0-9]+]], [x29, #32]
+; CHECK-MACHO: ldr w[[IARG:[0-9]+]], [x29, #20]
+; Check correct access to local variable on the stack, through stack pointer
+; CHECK-MACHO: ldr w[[ILOC:[0-9]+]], [sp, #12]
+; Check epilogue:
+; CHECK-MACHO: ldp x29, x30, [sp, #32]
+; CHECK-MACHO: ldp x20, x19, [sp, #16]
+; CHECK-MACHO: ret
+; CHECK-MACHO: .cfi_endproc
+
declare i32 @g() #0
@@ -159,7 +180,7 @@ entry:
; CHECK-LABEL: novla_dynamicrealign_call
; CHECK: .cfi_startproc
; Check that used callee-saved registers are saved
-; CHECK: stp x20, x19, [sp, #-32]!
+; CHECK: str x19, [sp, #-32]!
; Check that the frame pointer is created:
; CHECK: stp x29, x30, [sp, #16]
; CHECK: add x29, sp, #16
@@ -170,8 +191,7 @@ entry:
; CHECK: .cfi_def_cfa w29, 16
; CHECK: .cfi_offset w30, -8
; CHECK: .cfi_offset w29, -16
-; CHECK: .cfi_offset w19, -24
-; CHECK: .cfi_offset w20, -32
+; CHECK: .cfi_offset w19, -32
; Check correct access to arguments passed on the stack, through frame pointer
; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
@@ -181,10 +201,39 @@ entry:
; Check that stack pointer gets restored from frame pointer.
; CHECK: sub sp, x29, #16 // =16
; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ldr x19, [sp], #32
; CHECK: ret
; CHECK: .cfi_endproc
+; CHECK-MACHO-LABEL: _novla_dynamicrealign_call:
+; CHECK-MACHO: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK-MACHO: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK-MACHO: stp x29, x30, [sp, #16]
+; CHECK-MACHO: add x29, sp, #16
+; Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK-MACHO: sub x9, sp, #96
+; CHECK-MACHO: and sp, x9, #0xffffffffffffff80
+; Check correctness of cfi pseudo-instructions
+; CHECK-MACHO: .cfi_def_cfa w29, 16
+; CHECK-MACHO: .cfi_offset w30, -8
+; CHECK-MACHO: .cfi_offset w29, -16
+; CHECK-MACHO: .cfi_offset w19, -24
+; CHECK-MACHO: .cfi_offset w20, -32
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK-MACHO: ldr d[[DARG:[0-9]+]], [x29, #32]
+; CHECK-MACHO: ldr w[[IARG:[0-9]+]], [x29, #20]
+; Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK-MACHO: ldr w[[ILOC:[0-9]+]], [sp]
+; Check epilogue:
+; Check that stack pointer gets restored from frame pointer.
+; CHECK-MACHO: sub sp, x29, #16
+; CHECK-MACHO: ldp x29, x30, [sp, #16]
+; CHECK-MACHO: ldp x20, x19, [sp], #32
+; CHECK-MACHO: ret
+; CHECK-MACHO: .cfi_endproc
+
; Function Attrs: nounwind
define i32 @novla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
@@ -336,7 +385,7 @@ entry:
; CHECK-LABEL: vla_dynamicrealign_call
; CHECK: .cfi_startproc
; Check that used callee-saved registers are saved
-; CHECK: stp x22, x21, [sp, #-48]!
+; CHECK: str x21, [sp, #-48]!
; CHECK: stp x20, x19, [sp, #16]
; Check that the frame pointer is created:
; CHECK: stp x29, x30, [sp, #32]
@@ -354,8 +403,7 @@ entry:
; CHECK: .cfi_offset w29, -16
; CHECK: .cfi_offset w19, -24
; CHECK: .cfi_offset w20, -32
-; CHECK: .cfi_offset w21, -40
-; CHECK: .cfi_offset w22, -48
+; CHECK: .cfi_offset w21, -48
; Check correct access to arguments passed on the stack, through frame pointer
; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
@@ -376,10 +424,57 @@ entry:
; CHECK: sub sp, x29, #32
; CHECK: ldp x29, x30, [sp, #32]
; CHECK: ldp x20, x19, [sp, #16]
-; CHECK: ldp x22, x21, [sp], #48
+; CHECK: ldr x21, [sp], #48
; CHECK: ret
; CHECK: .cfi_endproc
+; CHECK-MACHO-LABEL: _vla_dynamicrealign_call:
+; CHECK-MACHO: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK-MACHO: stp x22, x21, [sp, #-48]!
+; CHECK-MACHO: stp x20, x19, [sp, #16]
+; Check that the frame pointer is created:
+; CHECK-MACHO: stp x29, x30, [sp, #32]
+; CHECK-MACHO: add x29, sp, #32
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK-MACHO: sub x9, sp, #80
+; CHECK-MACHO: and sp, x9, #0xffffffffffffff80
+; CHECK-MACHO: mov x19, sp
+; Check correctness of cfi pseudo-instructions
+; CHECK-MACHO: .cfi_def_cfa w29, 16
+; CHECK-MACHO: .cfi_offset w30, -8
+; CHECK-MACHO: .cfi_offset w29, -16
+; CHECK-MACHO: .cfi_offset w19, -24
+; CHECK-MACHO: .cfi_offset w20, -32
+; CHECK-MACHO: .cfi_offset w21, -40
+; CHECK-MACHO: .cfi_offset w22, -48
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK-MACHO: ldr w[[IARG:[0-9]+]], [x29, #20]
+; CHECK-MACHO: ldr d[[DARG:[0-9]+]], [x29, #32]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK-MACHO: mov w9, w0
+; CHECK-MACHO: mov x10, sp
+; CHECK-MACHO: lsl x9, x9, #2
+; CHECK-MACHO: add x9, x9, #15
+; CHECK-MACHO: and x9, x9, #0x7fffffff0
+; CHECK-MACHO: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK-MACHO: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK-MACHO: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK-MACHO: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that stack pointer gets restored from frame pointer.
+; CHECK-MACHO: sub sp, x29, #32
+; CHECK-MACHO: ldp x29, x30, [sp, #32]
+; CHECK-MACHO: ldp x20, x19, [sp, #16]
+; CHECK-MACHO: ldp x22, x21, [sp], #48
+; CHECK-MACHO: ret
+; CHECK-MACHO: .cfi_endproc
+
; Function Attrs: nounwind
define i32 @vla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
@@ -398,7 +493,7 @@ entry:
; CHECK-LABEL: vla_dynamicrealign_nocall
; Check that used callee-saved registers are saved
-; CHECK: stp x20, x19, [sp, #-32]!
+; CHECK: str x19, [sp, #-32]!
; Check that the frame pointer is created:
; CHECK: stp x29, x30, [sp, #16]
; CHECK: add x29, sp, #16
@@ -428,9 +523,44 @@ entry:
; Check that stack pointer gets restored from frame pointer.
; CHECK: sub sp, x29, #16
; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ldr x19, [sp], #32
; CHECK: ret
+; CHECK-MACHO-LABEL: _vla_dynamicrealign_nocall:
+; Check that used callee-saved registers are saved
+; CHECK-MACHO: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK-MACHO: stp x29, x30, [sp, #16]
+; CHECK-MACHO: add x29, sp, #16
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK-MACHO: sub x9, sp, #96
+; CHECK-MACHO: and sp, x9, #0xffffffffffffff80
+; CHECK-MACHO: mov x19, sp
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK-MACHO: ldr w[[IARG:[0-9]+]], [x29, #20]
+; CHECK-MACHO: ldr d[[DARG:[0-9]+]], [x29, #32]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK-MACHO: mov w9, w0
+; CHECK-MACHO: mov x10, sp
+; CHECK-MACHO: lsl x9, x9, #2
+; CHECK-MACHO: add x9, x9, #15
+; CHECK-MACHO: and x9, x9, #0x7fffffff0
+; CHECK-MACHO: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK-MACHO: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK-MACHO: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK-MACHO: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that stack pointer gets restored from frame pointer.
+; CHECK-MACHO: sub sp, x29, #16
+; CHECK-MACHO: ldp x29, x30, [sp, #16]
+; CHECK-MACHO: ldp x20, x19, [sp], #32
+; CHECK-MACHO: ret
+
; Function Attrs: nounwind
define i32 @vla_dynamicrealign_nocall_large_align(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
@@ -449,7 +579,7 @@ entry:
; CHECK-LABEL: vla_dynamicrealign_nocall_large_align
; Check that used callee-saved registers are saved
-; CHECK: stp x20, x19, [sp, #-32]!
+; CHECK: stp x28, x19, [sp, #-32]!
; Check that the frame pointer is created:
; CHECK: stp x29, x30, [sp, #16]
; CHECK: add x29, sp, #16
@@ -479,9 +609,44 @@ entry:
; Check that stack pointer gets restored from frame pointer.
; CHECK: sub sp, x29, #16
; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ldp x28, x19, [sp], #32
; CHECK: ret
+; CHECK-MACHO-LABEL: _vla_dynamicrealign_nocall_large_align:
+; Check that used callee-saved registers are saved
+; CHECK-MACHO: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK-MACHO: stp x29, x30, [sp, #16]
+; CHECK-MACHO: add x29, sp, #16
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK-MACHO: sub x9, sp, #7, lsl #12
+; CHECK-MACHO: and sp, x9, #0xffffffffffff8000
+; CHECK-MACHO: mov x19, sp
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK-MACHO: ldr w[[IARG:[0-9]+]], [x29, #20]
+; CHECK-MACHO: ldr d[[DARG:[0-9]+]], [x29, #32]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK-MACHO: mov w9, w0
+; CHECK-MACHO: mov x10, sp
+; CHECK-MACHO: lsl x9, x9, #2
+; CHECK-MACHO: add x9, x9, #15
+; CHECK-MACHO: and x9, x9, #0x7fffffff0
+; CHECK-MACHO: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK-MACHO: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK-MACHO: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK-MACHO: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that stack pointer gets restored from frame pointer.
+; CHECK-MACHO: sub sp, x29, #16
+; CHECK-MACHO: ldp x29, x30, [sp, #16]
+; CHECK-MACHO: ldp x20, x19, [sp], #32
+; CHECK-MACHO: ret
+
define void @realign_conditional(i1 %b) {
entry:
@@ -509,7 +674,7 @@ bb1:
define void @realign_conditional2(i1 %b) {
entry:
- %tmp = alloca i8, i32 4
+ %tmp = alloca i8, i32 16
br i1 %b, label %bb0, label %bb1
bb0:
@@ -522,18 +687,18 @@ bb1:
; CHECK-LABEL: realign_conditional2
; Extra realignment in the prologue (performance issue).
+; CHECK: tbz {{.*}} .[[LABEL:.*]]
; CHECK: sub x9, sp, #32 // =32
; CHECK: and sp, x9, #0xffffffffffffffe0
; CHECK: mov x19, sp
-; CHECK: tbz {{.*}} .[[LABEL:.*]]
; Stack is realigned in a non-entry BB.
; CHECK: sub [[REG:x[01-9]+]], sp, #64
; CHECK: and sp, [[REG]], #0xffffffffffffffe0
; CHECK: .[[LABEL]]:
; CHECK: ret
-attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!1 = !{!2, !2, i64 0}
!2 = !{!"int", !3, i64 0}
diff --git a/test/CodeGen/AArch64/aarch64-gep-opt.ll b/test/CodeGen/AArch64/aarch64-gep-opt.ll
index 93e2ff14ac71..cae00a9b1cb3 100644
--- a/test/CodeGen/AArch64/aarch64-gep-opt.ll
+++ b/test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -1,6 +1,9 @@
; RUN: llc -O3 -aarch64-gep-opt=true -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -aarch64-gep-opt=true -mattr=-use-aa -print-after=codegenprepare < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
+; RUN: llc -O3 -aarch64-gep-opt=true -mattr=+use-aa -print-after=codegenprepare < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
; RUN: llc -O3 -aarch64-gep-opt=true -print-after=codegenprepare -mcpu=cyclone < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
; RUN: llc -O3 -aarch64-gep-opt=true -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
+
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-linux-gnueabi"
diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll
new file mode 100644
index 000000000000..8628c4288c69
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll
@@ -0,0 +1,86 @@
+; RUN: opt < %s -mtriple=aarch64 -interleaved-access -S | FileCheck %s
+
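+; Check that the interleaved-access pass rewrites extractelement users of the
+; wide load in terms of the ld2 results, and that it bails out (no ld2 call)
+; when the extract is not dominated by the shuffle, uses a lane not covered by
+; the de-interleaving shuffle mask, or uses an undef or variable index.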
+; CHECK-LABEL: @extract_user_basic(
+; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 1
+define void @extract_user_basic(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %E = extractelement <8 x i32> %L, i32 2
+ br label %if.merge
+
+if.merge:
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_multi(
+; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 1
+define void @extract_user_multi(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %E1 = extractelement <8 x i32> %L, i32 0
+ br label %if.merge
+
+if.merge:
+ %E2 = extractelement <8 x i32> %L, i32 2
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_multi_no_dom(
+; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %E1 = extractelement <8 x i32> %L, i32 0
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E2 = extractelement <8 x i32> %L, i32 2
+ br label %if.merge
+
+if.merge:
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_wrong_const_index(
+; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+define void @extract_user_wrong_const_index(<8 x i32>* %A) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 1
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_undef_index(
+; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+define void @extract_user_undef_index(<8 x i32>* %A) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 undef
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_var_index(
+; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+define void @extract_user_var_index(<8 x i32>* %A, i32 %I) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 %I
+ ret void
+}
diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
index 1bc2a3ccb1ca..845050156baa 100644
--- a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
+++ b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
@@ -268,3 +268,15 @@ define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
store <3 x float> %tmp1, <3 x float>* %p, align 16
ret void
}
+
+; NEON-LABEL: load_factor2_with_extract_user:
+; NEON: ld2 { v0.4s, v1.4s }, [x0]
+; NEON: mov w0, v0.s[1]
+; NONEON-LABEL: load_factor2_with_extract_user:
+; NONEON-NOT: ld2
+define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
+ %1 = load <8 x i32>, <8 x i32>* %a, align 8
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = extractelement <8 x i32> %1, i32 2
+ ret i32 %3
+}
diff --git a/test/CodeGen/AArch64/aarch64-smull.ll b/test/CodeGen/AArch64/aarch64-smull.ll
index ec0e2de92d0d..1c8d13a00b2a 100644
--- a/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/test/CodeGen/AArch64/aarch64-smull.ll
@@ -234,7 +234,7 @@ define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
-; CHECK: movz
+; CHECK: mov
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = sext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
@@ -268,7 +268,7 @@ define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
-; CHECK: movz
+; CHECK: mov
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = zext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
diff --git a/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/test/CodeGen/AArch64/aarch64-stp-cluster.ll
new file mode 100644
index 000000000000..5cab38eafb52
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -0,0 +1,149 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -aarch64-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
+
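+; Check that the misched load/store clustering mutation pairs up adjacent
+; stores (scaled and unscaled, integer and FP) so they can later be merged
+; into stp, and that volatile stores are never clustered.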
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_i64_scale:BB#0
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(4): STRXui %vreg1, %vreg0, 1
+; CHECK:SU(3): STRXui %vreg1, %vreg0, 2
+; CHECK:SU(2): STRXui %vreg1, %vreg0, 3
+; CHECK:SU(5): STRXui %vreg1, %vreg0, 4
+define i64 @stp_i64_scale(i64* nocapture %P, i64 %v) {
+entry:
+ %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
+ store i64 %v, i64* %arrayidx
+ %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
+ store i64 %v, i64* %arrayidx1
+ %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx2
+ %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
+ store i64 %v, i64* %arrayidx3
+ ret i64 %v
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_i32_scale:BB#0
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(4): STRWui %vreg1, %vreg0, 1
+; CHECK:SU(3): STRWui %vreg1, %vreg0, 2
+; CHECK:SU(2): STRWui %vreg1, %vreg0, 3
+; CHECK:SU(5): STRWui %vreg1, %vreg0, 4
+define i32 @stp_i32_scale(i32* nocapture %P, i32 %v) {
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %P, i32 3
+ store i32 %v, i32* %arrayidx
+ %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 2
+ store i32 %v, i32* %arrayidx1
+ %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 1
+ store i32 %v, i32* %arrayidx2
+ %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 4
+ store i32 %v, i32* %arrayidx3
+ ret i32 %v
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_i64_unscale:BB#0 entry
+; CHECK:Cluster ld/st SU(5) - SU(2)
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:SU(5): STURXi %vreg1, %vreg0, -32
+; CHECK:SU(2): STURXi %vreg1, %vreg0, -24
+; CHECK:SU(4): STURXi %vreg1, %vreg0, -16
+; CHECK:SU(3): STURXi %vreg1, %vreg0, -8
+define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
+entry:
+ %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
+ store i64 %v, i64* %arrayidx
+ %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx1
+ %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2
+ store i64 %v, i64* %arrayidx2
+ %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4
+ store i64 %v, i64* %arrayidx3
+ ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_i32_unscale:BB#0 entry
+; CHECK:Cluster ld/st SU(5) - SU(2)
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:SU(5): STURWi %vreg1, %vreg0, -16
+; CHECK:SU(2): STURWi %vreg1, %vreg0, -12
+; CHECK:SU(4): STURWi %vreg1, %vreg0, -8
+; CHECK:SU(3): STURWi %vreg1, %vreg0, -4
+define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
+ store i32 %v, i32* %arrayidx
+ %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1
+ store i32 %v, i32* %arrayidx1
+ %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2
+ store i32 %v, i32* %arrayidx2
+ %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4
+ store i32 %v, i32* %arrayidx3
+ ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_double:BB#0
+; CHECK:Cluster ld/st SU(3) - SU(4)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(3): STRDui %vreg1, %vreg0, 1
+; CHECK:SU(4): STRDui %vreg1, %vreg0, 2
+; CHECK:SU(2): STRDui %vreg1, %vreg0, 3
+; CHECK:SU(5): STRDui %vreg1, %vreg0, 4
+define void @stp_double(double* nocapture %P, double %v) {
+entry:
+ %arrayidx = getelementptr inbounds double, double* %P, i64 3
+ store double %v, double* %arrayidx
+ %arrayidx1 = getelementptr inbounds double, double* %P, i64 1
+ store double %v, double* %arrayidx1
+ %arrayidx2 = getelementptr inbounds double, double* %P, i64 2
+ store double %v, double* %arrayidx2
+ %arrayidx3 = getelementptr inbounds double, double* %P, i64 4
+ store double %v, double* %arrayidx3
+ ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_float:BB#0
+; CHECK:Cluster ld/st SU(3) - SU(4)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(3): STRSui %vreg1, %vreg0, 1
+; CHECK:SU(4): STRSui %vreg1, %vreg0, 2
+; CHECK:SU(2): STRSui %vreg1, %vreg0, 3
+; CHECK:SU(5): STRSui %vreg1, %vreg0, 4
+define void @stp_float(float* nocapture %P, float %v) {
+entry:
+ %arrayidx = getelementptr inbounds float, float* %P, i64 3
+ store float %v, float* %arrayidx
+ %arrayidx1 = getelementptr inbounds float, float* %P, i64 1
+ store float %v, float* %arrayidx1
+ %arrayidx2 = getelementptr inbounds float, float* %P, i64 2
+ store float %v, float* %arrayidx2
+ %arrayidx3 = getelementptr inbounds float, float* %P, i64 4
+ store float %v, float* %arrayidx3
+ ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_volatile:BB#0
+; CHECK-NOT: Cluster ld/st
+; CHECK:SU(2): STRXui %vreg1, %vreg0, 3; mem:Volatile
+; CHECK:SU(3): STRXui %vreg1, %vreg0, 2; mem:Volatile
+; CHECK:SU(4): STRXui %vreg1, %vreg0, 1; mem:Volatile
+; CHECK:SU(5): STRXui %vreg1, %vreg0, 4; mem:Volatile
+define i64 @stp_volatile(i64* nocapture %P, i64 %v) {
+entry:
+ %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
+ store volatile i64 %v, i64* %arrayidx
+ %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
+ store volatile i64 %v, i64* %arrayidx1
+ %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
+ store volatile i64 %v, i64* %arrayidx2
+ %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
+ store volatile i64 %v, i64* %arrayidx3
+ ret i64 %v
+}
+
diff --git a/test/CodeGen/AArch64/aarch64-tbz.ll b/test/CodeGen/AArch64/aarch64-tbz.ll
new file mode 100644
index 000000000000..f4ebcc70674b
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-tbz.ll
@@ -0,0 +1,98 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnueabi < %s | FileCheck %s
+
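+; Check that a single-bit test of the form (X & (1 << N)) == 0 feeding a
+; branch is lowered to tbz/tbnz on that bit instead of an explicit and
+; followed by cbz, using the W variant when the bit number is below 32 and
+; the X variant otherwise.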
+; CHECK-LABEL: test1
+; CHECK: tbz {{w[0-9]}}, #3, {{.LBB0_3}}
+; CHECK: tbz w[[REG1:[0-9]+]], #2, {{.LBB0_3}}
+; CHECK-NOT: and [[REG2:x[0-9]+]], x[[REG1]], #0x4
+; CHECK-NOT: cbz [[REG2]], {{.LBB0_3}}
+
+; CHECK: b
+define void @test1(i64 %A, i64 %B) {
+entry:
+ %and = and i64 %A, 4
+ %notlhs = icmp eq i64 %and, 0
+ %and.1 = and i64 %B, 8
+ %0 = icmp eq i64 %and.1, 0
+ %1 = or i1 %0, %notlhs
+ br i1 %1, label %if.end3, label %if.then2
+
+if.then2: ; preds = %entry
+ tail call void @foo(i64 %A, i64 %B)
+ br label %if.end3
+
+if.end3: ; preds = %if.then2, %entry
+ ret void
+}
+
+; CHECK-LABEL: test2
+; CHECK: cbz {{x[0-9]}}, {{.LBB1_3}}
+; CHECK: tbz w[[REG1:[0-9]+]], #3, {{.LBB1_3}}
+; CHECK-NOT: and [[REG2:x[0-9]+]], x[[REG1]], #0x08
+; CHECK-NOT: cbz [[REG2]], {{.LBB1_3}}
+
+define void @test2(i64 %A, i64* readonly %B) #0 {
+entry:
+ %tobool = icmp eq i64* %B, null
+ %and = and i64 %A, 8
+ %tobool1 = icmp eq i64 %and, 0
+ %or.cond = or i1 %tobool, %tobool1
+ br i1 %or.cond, label %if.end3, label %if.then2
+
+if.then2: ; preds = %entry
+ %0 = load i64, i64* %B, align 4
+ tail call void @foo(i64 %A, i64 %0)
+ br label %if.end3
+
+if.end3: ; preds = %entry, %if.then2
+ ret void
+}
+
+; Make sure we use the W variant when log2(mask) is < 32.
+; CHECK-LABEL: test3
+; CHECK: tbz {{w[0-9]}}, #3, {{.LBB2_3}}
+; CHECK: tbz w[[REG1:[0-9]+]], #28, {{.LBB2_3}}
+; CHECK-NOT: and [[REG2:x[0-9]+]], x[[REG1]]
+; CHECK-NOT: cbz [[REG2]], {{.LBB2_3}}
+define void @test3(i64 %A, i64 %B) {
+entry:
+ %shift = shl i64 1, 28
+ %and = and i64 %A, %shift
+ %notlhs = icmp eq i64 %and, 0
+ %and.1 = and i64 %B, 8
+ %0 = icmp eq i64 %and.1, 0
+ %1 = or i1 %0, %notlhs
+ br i1 %1, label %if.then2, label %if.end3
+
+if.then2: ; preds = %entry
+ tail call void @foo(i64 %A, i64 %B)
+ br label %if.end3
+
+if.end3: ; preds = %if.then2, %entry
+ ret void
+}
+
+; CHECK-LABEL: test4
+; CHECK: tbz {{w[0-9]}}, #3, {{.LBB3_3}}
+; CHECK: tbz [[REG1:x[0-9]+]], #35, {{.LBB3_3}}
+; CHECK-NOT: and [[REG2:x[0-9]+]], x[[REG1]]
+; CHECK-NOT: cbz [[REG2]], {{.LBB3_3}}
+define void @test4(i64 %A, i64 %B) {
+entry:
+ %shift = shl i64 1, 35
+ %and = and i64 %A, %shift
+ %notlhs = icmp eq i64 %and, 0
+ %and.1 = and i64 %B, 8
+ %0 = icmp eq i64 %and.1, 0
+ %1 = or i1 %0, %notlhs
+ br i1 %1, label %if.then2, label %if.end3
+
+if.then2: ; preds = %entry
+ tail call void @foo(i64 %A, i64 %B)
+ br label %if.end3
+
+if.end3: ; preds = %if.then2, %entry
+ ret void
+}
+
+
+declare void @foo(i64, i64)
diff --git a/test/CodeGen/AArch64/aarch64-tryBitfieldInsertOpFromOr-crash.ll b/test/CodeGen/AArch64/aarch64-tryBitfieldInsertOpFromOr-crash.ll
new file mode 100644
index 000000000000..3c986ba2e513
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-tryBitfieldInsertOpFromOr-crash.ll
@@ -0,0 +1,36 @@
+; RUN: llc <%s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
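+; Regression test: per the file name, this input used to crash AArch64 ISel in
+; tryBitfieldInsertOpFromOr. There is no FileCheck invocation; llc just has to
+; compile it without crashing.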
+; Function Attrs: noreturn nounwind
+define void @foo(i32* %d) {
+entry:
+ %0 = ptrtoint i32* %d to i64
+ %1 = and i64 %0, -36028797018963969
+ %2 = inttoptr i64 %1 to i32*
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i64 1
+ %arrayidx6 = getelementptr inbounds i32, i32* %2, i64 2
+ %arrayidx7 = getelementptr inbounds i32, i32* %2, i64 3
+ br label %for.cond
+
+for.cond: ; preds = %for.cond, %entry
+ %B.0 = phi i32* [ %d, %entry ], [ %12, %for.cond ]
+ %3 = ptrtoint i32* %B.0 to i64
+ %4 = and i64 %3, -36028797018963969
+ %5 = inttoptr i64 %4 to i32*
+ %6 = load i32, i32* %5, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 1
+ %7 = load i32, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %5, i64 2
+ %8 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 3
+ %9 = load i32, i32* %arrayidx3, align 4
+ store i32 %6, i32* %2, align 4
+ store i32 %7, i32* %arrayidx5, align 4
+ store i32 %8, i32* %arrayidx6, align 4
+ store i32 %9, i32* %arrayidx7, align 4
+ %10 = ptrtoint i32* %arrayidx1 to i64
+ %11 = or i64 %10, 36028797018963968
+ %12 = inttoptr i64 %11 to i32*
+ br label %for.cond
+}
diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll
index d6350a6db0ee..c0235cd5d9ef 100644
--- a/test/CodeGen/AArch64/addsub.ll
+++ b/test/CodeGen/AArch64/addsub.ll
@@ -5,6 +5,7 @@
; loads and stores.
@var_i32 = global i32 42
+@var2_i32 = global i32 43
@var_i64 = global i64 0
; Add pure 12-bit immediates:
@@ -106,6 +107,7 @@ define void @sub_med() {
define void @testing() {
; CHECK-LABEL: testing:
%val = load i32, i32* @var_i32
+ %val2 = load i32, i32* @var2_i32
; CHECK: cmp {{w[0-9]+}}, #4095
; CHECK: b.ne [[RET:.?LBB[0-9]+_[0-9]+]]
@@ -117,7 +119,7 @@ test2:
; CHECK: b.lo [[RET]]
%newval2 = add i32 %val, 1
store i32 %newval2, i32* @var_i32
- %cmp_pos_big = icmp ult i32 %val, 14610432
+ %cmp_pos_big = icmp ult i32 %val2, 14610432
br i1 %cmp_pos_big, label %ret, label %test3
test3:
@@ -133,7 +135,7 @@ test4:
; CHECK: b.gt [[RET]]
%newval4 = add i32 %val, 3
store i32 %newval4, i32* @var_i32
- %cmp_pos_sgt = icmp sgt i32 %val, 321
+ %cmp_pos_sgt = icmp sgt i32 %val2, 321
br i1 %cmp_pos_sgt, label %ret, label %test5
test5:
@@ -141,7 +143,7 @@ test5:
; CHECK: b.gt [[RET]]
%newval5 = add i32 %val, 4
store i32 %newval5, i32* @var_i32
- %cmp_neg_uge = icmp sgt i32 %val, -444
+ %cmp_neg_uge = icmp sgt i32 %val2, -444
br i1 %cmp_neg_uge, label %ret, label %test6
test6:
diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll
index 45754377b2d9..ab7a631dc248 100644
--- a/test/CodeGen/AArch64/alloca.ll
+++ b/test/CodeGen/AArch64/alloca.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -disable-post-ra -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -disable-post-ra -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios -disable-post-ra -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK-MACHO
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s
declare void @use_addr(i8*)
@@ -113,14 +114,21 @@ define void @test_variadic_alloca(i64 %n, ...) {
define void @test_alloca_large_frame(i64 %n) {
; CHECK-LABEL: test_alloca_large_frame:
+; CHECK-MACHO-LABEL: test_alloca_large_frame:
-; CHECK: stp x20, x19, [sp, #-32]!
+; CHECK: stp x28, x19, [sp, #-32]!
; CHECK: stp x29, x30, [sp, #16]
; CHECK: add x29, sp, #16
; CHECK: sub sp, sp, #1953, lsl #12
; CHECK: sub sp, sp, #512
+; CHECK-MACHO: stp x20, x19, [sp, #-32]!
+; CHECK-MACHO: stp x29, x30, [sp, #16]
+; CHECK-MACHO: add x29, sp, #16
+; CHECK-MACHO: sub sp, sp, #1953, lsl #12
+; CHECK-MACHO: sub sp, sp, #512
+
%addr1 = alloca i8, i64 %n
%addr2 = alloca i64, i64 1000000
@@ -130,7 +138,11 @@ define void @test_alloca_large_frame(i64 %n) {
; CHECK: sub sp, x29, #16
; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ldp x28, x19, [sp], #32
+
+; CHECK-MACHO: sub sp, x29, #16
+; CHECK-MACHO: ldp x29, x30, [sp, #16]
+; CHECK-MACHO: ldp x20, x19, [sp], #32
}
declare i8* @llvm.stacksave()
diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
index a66ea0df2e98..caafde0a1bb2 100644
--- a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
@@ -18,22 +18,21 @@ if.else295: ; preds = %entry
declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-!llvm.dbg.gv = !{!0}
-!llvm.dbg.sp = !{!1, !7, !10, !11, !12}
+!llvm.dbg.cu = !{!0}
!0 = !DIGlobalVariable(name: "vsplive", line: 617, isLocal: true, isDefinition: true, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "drt_vsprintf", line: 616, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
+!1 = distinct !DISubprogram(name: "drt_vsprintf", line: 616, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, file: !20, scope: !2, type: !4)
!2 = !DIFile(filename: "print.i", directory: "/Volumes/Ebi/echeng/radars/r9146594")
-!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", isOptimized: true, emissionKind: 0, file: !20, enums: !21, retainedTypes: !21)
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", isOptimized: true, emissionKind: FullDebug, file: !20, enums: !21, retainedTypes: !21, globals: !{!0})
!4 = !DISubroutineType(types: !5)
!5 = !{!6}
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!7 = distinct !DISubprogram(name: "putc_mem", line: 30, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8)
+!7 = distinct !DISubprogram(name: "putc_mem", line: 30, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, file: !20, scope: !2, type: !8)
!8 = !DISubroutineType(types: !9)
!9 = !{null}
-!10 = distinct !DISubprogram(name: "print_double", line: 203, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
-!11 = distinct !DISubprogram(name: "print_number", line: 75, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
-!12 = distinct !DISubprogram(name: "get_flags", line: 508, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8)
+!10 = distinct !DISubprogram(name: "print_double", line: 203, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, file: !20, scope: !2, type: !4)
+!11 = distinct !DISubprogram(name: "print_number", line: 75, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, file: !20, scope: !2, type: !4)
+!12 = distinct !DISubprogram(name: "get_flags", line: 508, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, file: !20, scope: !2, type: !8)
!13 = !DILocation(line: 653, column: 5, scope: !14)
!14 = distinct !DILexicalBlock(line: 652, column: 35, file: !20, scope: !15)
!15 = distinct !DILexicalBlock(line: 616, column: 1, file: !20, scope: !1)
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 8784abdadfab..6d8c639adb95 100644
--- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NOOPT
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPT
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-NOOPT
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-OPT
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NOOPT
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPT
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-NOOPT
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-OPT
define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: bar:
diff --git a/test/CodeGen/AArch64/arm64-aapcs-be.ll b/test/CodeGen/AArch64/arm64-aapcs-be.ll
index e77952e4b8a1..74b6ae16142e 100644
--- a/test/CodeGen/AArch64/arm64-aapcs-be.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -32,7 +32,8 @@ define float @test_block_addr([8 x float], [1 x float] %in) {
define void @test_block_addr_callee() {
; CHECK-LABEL: test_block_addr_callee:
-; CHECK: str {{[a-z0-9]+}}, [sp, #-16]!
+; CHECK: sub sp, sp, #32
+; CHECK: str {{[a-z0-9]+}}, [sp, #16]
; CHECK: bl test_block_addr
%val = insertvalue [1 x float] undef, float 0.0, 0
call float @test_block_addr([8 x float] undef, [1 x float] %val)
diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
index 03414b56144c..c92703651385 100644
--- a/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -125,7 +125,7 @@ entry:
define void @bar(i32 %x, <4 x i32> %y) nounwind {
entry:
; CHECK-LABEL: bar:
-; CHECK: str {{q[0-9]+}}, [sp, #16]
+; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #16]
; CHECK: str {{x[0-9]+}}, [sp]
%x.addr = alloca i32, align 4
%y.addr = alloca <4 x i32>, align 16
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index 36a682242aaa..fb52b1d99fc9 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -130,7 +130,7 @@ entry:
; CHECK-LABEL: test3
; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
; FAST-LABEL: test3
-; FAST: sub sp, sp, #32
+; FAST: sub sp, sp, #48
; FAST: mov x[[ADDR:[0-9]+]], sp
; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
%0 = load <2 x i32>, <2 x i32>* %in, align 8
diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
index dc9884f12f57..e76adb4abc02 100644
--- a/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
-; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -O0 -disable-fp-elim | FileCheck -check-prefix=FAST %s
target triple = "arm64-apple-darwin"
; rdar://12648441
@@ -74,7 +74,7 @@ define i32 @caller38_stack() #1 {
entry:
; CHECK-LABEL: caller38_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
-; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: mov w[[C:[0-9]+]], #9
; CHECK: str w[[C]], [sp]
%0 = load i64, i64* bitcast (%struct.s38* @g38 to i64*), align 4
%1 = load i64, i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
@@ -128,7 +128,7 @@ entry:
; CHECK-LABEL: caller39_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: mov w[[C:[0-9]+]], #9
; CHECK: str w[[C]], [sp]
%0 = load i128, i128* bitcast (%struct.s39* @g39 to i128*), align 16
%1 = load i128, i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
@@ -184,7 +184,7 @@ entry:
; CHECK-LABEL: caller40_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
-; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: mov w[[C:[0-9]+]], #9
; CHECK: str w[[C]], [sp]
%0 = load [2 x i64], [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
%1 = load [2 x i64], [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4
@@ -238,7 +238,7 @@ entry:
; CHECK-LABEL: caller41_stack
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: mov w[[C:[0-9]+]], #9
; CHECK: str w[[C]], [sp]
%0 = load i128, i128* bitcast (%struct.s41* @g41 to i128*), align 16
%1 = load i128, i128* bitcast (%struct.s41* @g41_2 to i128*), align 16
@@ -291,7 +291,7 @@ entry:
; Space for s2 is allocated at sp
; FAST-LABEL: caller42
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
; Space for s1 is allocated at fp-24 = sp+72
; Space for s2 is allocated at sp+48
; FAST: sub x[[A:[0-9]+]], x29, #24
@@ -317,8 +317,8 @@ declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
define i32 @caller42_stack() #3 {
entry:
; CHECK-LABEL: caller42_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
; CHECK: stur {{x[0-9]+}}, [x29, #-16]
; CHECK: stur {{q[0-9]+}}, [x29, #-32]
; CHECK: str {{x[0-9]+}}, [sp, #48]
@@ -330,7 +330,7 @@ entry:
; CHECK: sub x[[A:[0-9]+]], x29, #32
; Address of s1 is passed on stack at sp+8
; CHECK: str x[[A]], [sp, #8]
-; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: mov w[[C:[0-9]+]], #9
; CHECK: str w[[C]], [sp]
; FAST-LABEL: caller42_stack
@@ -399,7 +399,7 @@ entry:
; Space for s2 is allocated at sp
; FAST-LABEL: caller43
-; FAST: mov x29, sp
+; FAST: add x29, sp, #64
; Space for s1 is allocated at sp+32
; Space for s2 is allocated at sp
; FAST: add x1, sp, #32
@@ -429,8 +429,8 @@ declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
define i32 @caller43_stack() #3 {
entry:
; CHECK-LABEL: caller43_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
; CHECK: stur {{q[0-9]+}}, [x29, #-16]
; CHECK: stur {{q[0-9]+}}, [x29, #-32]
; CHECK: str {{q[0-9]+}}, [sp, #48]
@@ -442,11 +442,11 @@ entry:
; CHECK: sub x[[A:[0-9]+]], x29, #32
; Address of s1 is passed on stack at sp+8
; CHECK: str x[[A]], [sp, #8]
-; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: mov w[[C:[0-9]+]], #9
; CHECK: str w[[C]], [sp]
; FAST-LABEL: caller43_stack
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
; Space for s1 is allocated at fp-32 = sp+64
; Space for s2 is allocated at sp+32
; FAST: sub x[[A:[0-9]+]], x29, #32
@@ -508,7 +508,7 @@ entry:
; "i64 %0" should be in register x7.
; "i32 8" should be on stack at [sp].
; CHECK: ldr x7, [{{x[0-9]+}}]
-; CHECK: str {{w[0-9]+}}, [sp, #-16]!
+; CHECK: str {{w[0-9]+}}, [sp]
; FAST-LABEL: i64_split
; FAST: ldr x7, [{{x[0-9]+}}]
; FAST: mov x[[R0:[0-9]+]], sp
diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll
index c22d0312b24d..0e651a910d7b 100644
--- a/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -82,7 +82,7 @@ define void @t7(i64 %a) {
define void @t8(i64 %a) {
; CHECK-LABEL: t8:
-; CHECK: movn [[REG:x[0-9]+]], #0x1235
+; CHECK: mov [[REG:x[0-9]+]], #-4662
; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
%1 = sub i64 %a, 4662 ;-4662 is 0xffffffffffffedca
%2 = inttoptr i64 %1 to i64*
@@ -92,7 +92,7 @@ define void @t8(i64 %a) {
define void @t9(i64 %a) {
; CHECK-LABEL: t9:
-; CHECK: movn [[REG:x[0-9]+]], #0x1235, lsl #16
+; CHECK: mov [[REG:x[0-9]+]], #-305463297
; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
%1 = add i64 -305463297, %a ;-305463297 is 0xffffffffedcaffff
%2 = inttoptr i64 %1 to i64*
@@ -102,7 +102,7 @@ define void @t9(i64 %a) {
define void @t10(i64 %a) {
; CHECK-LABEL: t10:
-; CHECK: movz [[REG:x[0-9]+]], #0x123, lsl #48
+; CHECK: mov [[REG:x[0-9]+]], #81909218222800896
; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
%1 = add i64 %a, 81909218222800896 ;0x123000000000000
%2 = inttoptr i64 %1 to i64*
@@ -112,8 +112,8 @@ define void @t10(i64 %a) {
define void @t11(i64 %a) {
; CHECK-LABEL: t11:
-; CHECK: movz w[[NUM:[0-9]+]], #0x123, lsl #16
-; CHECK: movk w[[NUM:[0-9]+]], #0x4567
+; CHECK: mov w[[NUM:[0-9]+]], #19070976
+; CHECK: movk w[[NUM:[0-9]+]], #17767
; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]]
%1 = add i64 %a, 19088743 ;0x1234567
%2 = inttoptr i64 %1 to i64*
diff --git a/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
index bf2d2cfa6066..71bf2039eaa1 100644
--- a/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
+++ b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false -enable-post-misched=false | FileCheck %s
; rdar://12713765
; Make sure we are not creating stack objects that are assumed to be 64-byte
diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll
index 44c24c51f0df..d7188f31c567 100644
--- a/test/CodeGen/AArch64/arm64-atomic-128.ll
+++ b/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -190,7 +190,7 @@ define void @atomic_store_seq_cst(i128 %in, i128* %p) {
; CHECK-LABEL: atomic_store_seq_cst:
; CHECK-NOT: dmb
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxp xzr, xzr, [x2]
+; CHECK: ldaxp xzr, [[IGNORED:x[0-9]+]], [x2]
; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
; CHECK: cbnz [[SUCCESS]], [[LABEL]]
; CHECK-NOT: dmb
@@ -202,7 +202,7 @@ define void @atomic_store_release(i128 %in, i128* %p) {
; CHECK-LABEL: atomic_store_release:
; CHECK-NOT: dmb
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: ldxp xzr, [[IGNORED:x[0-9]+]], [x2]
; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
; CHECK: cbnz [[SUCCESS]], [[LABEL]]
; CHECK-NOT: dmb
@@ -214,7 +214,7 @@ define void @atomic_store_relaxed(i128 %in, i128* %p) {
; CHECK-LABEL: atomic_store_relaxed:
; CHECK-NOT: dmb
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: ldxp xzr, [[IGNORED:x[0-9]+]], [x2]
; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
; CHECK: cbnz [[SUCCESS]], [[LABEL]]
; CHECK-NOT: dmb
diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll
index 5d8d60de5fc5..fef137b1023f 100644
--- a/test/CodeGen/AArch64/arm64-atomic.ll
+++ b/test/CodeGen/AArch64/arm64-atomic.ll
@@ -103,7 +103,7 @@ define i64 @fetch_and_nand_64(i64* %p) #0 {
define i32 @fetch_and_or(i32* %p) #0 {
; CHECK-LABEL: fetch_and_or:
-; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5
+; CHECK: mov [[OLDVAL_REG:w[0-9]+]], #5
; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0]
; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]]
diff --git a/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/test/CodeGen/AArch64/arm64-bitfield-extract.ll
index 5dca92941211..402e16ccdb21 100644
--- a/test/CodeGen/AArch64/arm64-bitfield-extract.ll
+++ b/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -41,7 +41,7 @@ define i32 @bar(i64 %cav1.coerce) nounwind {
define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp {
; CHECK-LABEL: fct1:
-; CHECK: ubfx
+; CHECK: ubfx x{{[0-9]+}}, x{{[0-9]+}}
; CHECK-NOT: and
; CHECK: ret
@@ -348,8 +348,8 @@ entry:
; CHECK-LABEL: fct16:
; CHECK: ldr [[REG1:w[0-9]+]],
; Create the constant
-; CHECK: movz [[REGCST:w[0-9]+]], #0x1a, lsl #16
-; CHECK: movk [[REGCST]], #0x8160
+; CHECK: mov [[REGCST:w[0-9]+]], #1703936
+; CHECK: movk [[REGCST]], #33120
; Do the masking
; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]]
; CHECK-NEXT: bfxil [[REG2]], w1, #16, #3
@@ -377,8 +377,8 @@ entry:
; CHECK-LABEL: fct17:
; CHECK: ldr [[REG1:x[0-9]+]],
; Create the constant
-; CHECK: movz w[[REGCST:[0-9]+]], #0x1a, lsl #16
-; CHECK: movk w[[REGCST]], #0x8160
+; CHECK: mov w[[REGCST:[0-9]+]], #1703936
+; CHECK: movk w[[REGCST]], #33120
; Do the masking
; CHECK: and [[REG2:x[0-9]+]], [[REG1]], x[[REGCST]]
; CHECK-NEXT: bfxil [[REG2]], x1, #16, #3
diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll
index d0f6db080551..1a6c3687dcb0 100644
--- a/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -36,7 +36,7 @@ define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
define <8 x i16> @build_all_zero(<8 x i16> %a) #1 {
; CHECK-LABEL: build_all_zero:
-; CHECK: movz w[[GREG:[0-9]+]], #0xae80
+; CHECK: mov w[[GREG:[0-9]+]], #44672
; CHECK-NEXT: fmov s[[FREG:[0-9]+]], w[[GREG]]
; CHECK-NEXT: mul.8h v0, v0, v[[FREG]]
%b = add <8 x i16> %a, <i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
@@ -56,4 +56,4 @@ define <8 x i16> @concat_2_build_vector(<4 x i16> %in0) {
%vshl_n2 = shl <4 x i16> %vshl_n, <i16 9, i16 9, i16 9, i16 9>
%shuffle.i = shufflevector <4 x i16> %vshl_n2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle.i
-} \ No newline at end of file
+}
diff --git a/test/CodeGen/AArch64/arm64-builtins-linux.ll b/test/CodeGen/AArch64/arm64-builtins-linux.ll
index 34fa1b471561..6caf3a2a18ef 100644
--- a/test/CodeGen/AArch64/arm64-builtins-linux.ll
+++ b/test/CodeGen/AArch64/arm64-builtins-linux.ll
@@ -1,11 +1,11 @@
; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
; Function Attrs: nounwind readnone
-declare i8* @llvm.aarch64.thread.pointer() #1
+declare i8* @llvm.thread.pointer() #1
define i8* @thread_pointer() {
; CHECK: thread_pointer:
; CHECK: mrs {{x[0-9]+}}, TPIDR_EL0
- %1 = tail call i8* @llvm.aarch64.thread.pointer()
+ %1 = tail call i8* @llvm.thread.pointer()
ret i8* %1
}
diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll
index 72d3b8331162..748bbcca079f 100644
--- a/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -317,24 +317,6 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
ret i64 %sel
}
-; CHECK-LABEL: select_complicated
-define i16 @select_complicated(double %v1, double %v2, i16 %a, i16 %b) {
-; CHECK: ldr [[REG:d[0-9]+]],
-; CHECK: fcmp d0, d2
-; CHECK-NEXT: fmov d2, #13.00000000
-; CHECK-NEXT: fccmp d1, d2, #4, ne
-; CHECK-NEXT: fccmp d0, d1, #1, ne
-; CHECK-NEXT: fccmp d0, d1, #4, vc
-; CEHCK-NEXT: csel w0, w0, w1, eq
- %1 = fcmp one double %v1, %v2
- %2 = fcmp oeq double %v2, 13.0
- %3 = fcmp oeq double %v1, 42.0
- %or0 = or i1 %2, %3
- %or1 = or i1 %1, %or0
- %sel = select i1 %or1, i16 %a, i16 %b
- ret i16 %sel
-}
-
; CHECK-LABEL: gccbug
define i64 @gccbug(i64 %x0, i64 %x1) {
; CHECK: cmp x0, #2
@@ -443,3 +425,234 @@ define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
store volatile i32 %ext, i32* @g
ret i64 %sel
}
+
+; The following is not possible to implement with a single cmp;ccmp;csel
+; sequence.
+; CHECK-LABEL: select_noccmp3
+define i32 @select_noccmp3(i32 %v0, i32 %v1, i32 %v2) {
+ %c0 = icmp slt i32 %v0, 0
+ %c1 = icmp sgt i32 %v0, 13
+ %c2 = icmp slt i32 %v0, 22
+ %c3 = icmp sgt i32 %v0, 44
+ %c4 = icmp eq i32 %v0, 99
+ %c5 = icmp eq i32 %v0, 77
+ %or0 = or i1 %c0, %c1
+ %or1 = or i1 %c2, %c3
+ %and0 = and i1 %or0, %or1
+ %or2 = or i1 %c4, %c5
+ %and1 = and i1 %and0, %or2
+ %sel = select i1 %and1, i32 %v1, i32 %v2
+ ret i32 %sel
+}
+
+; Test the IR CCs that expand to two cond codes.
+
+; CHECK-LABEL: select_and_olt_one:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d2, d3, #4, mi
+; CHECK-NEXT: fccmp d2, d3, #1, ne
+; CHECK-NEXT: csel w0, w0, w1, vc
+; CHECK-NEXT: ret
+define i32 @select_and_olt_one(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt double %v0, %v1
+ %c1 = fcmp one double %v2, %v3
+ %cr = and i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_and_one_olt:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d0, d1, #1, ne
+; CHECK-NEXT: fccmp d2, d3, #0, vc
+; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: ret
+define i32 @select_and_one_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp one double %v0, %v1
+ %c1 = fcmp olt double %v2, %v3
+ %cr = and i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_and_olt_ueq:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d2, d3, #0, mi
+; CHECK-NEXT: fccmp d2, d3, #8, le
+; CHECK-NEXT: csel w0, w0, w1, pl
+; CHECK-NEXT: ret
+define i32 @select_and_olt_ueq(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt double %v0, %v1
+ %c1 = fcmp ueq double %v2, %v3
+ %cr = and i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_and_ueq_olt:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d0, d1, #8, le
+; CHECK-NEXT: fccmp d2, d3, #0, pl
+; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: ret
+define i32 @select_and_ueq_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp ueq double %v0, %v1
+ %c1 = fcmp olt double %v2, %v3
+ %cr = and i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_or_olt_one:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d2, d3, #0, pl
+; CHECK-NEXT: fccmp d2, d3, #8, le
+; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: ret
+define i32 @select_or_olt_one(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt double %v0, %v1
+ %c1 = fcmp one double %v2, %v3
+ %cr = or i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_or_one_olt:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d0, d1, #1, ne
+; CHECK-NEXT: fccmp d2, d3, #8, vs
+; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: ret
+define i32 @select_or_one_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp one double %v0, %v1
+ %c1 = fcmp olt double %v2, %v3
+ %cr = or i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_or_olt_ueq:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d2, d3, #4, pl
+; CHECK-NEXT: fccmp d2, d3, #1, ne
+; CHECK-NEXT: csel w0, w0, w1, vs
+; CHECK-NEXT: ret
+define i32 @select_or_olt_ueq(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt double %v0, %v1
+ %c1 = fcmp ueq double %v2, %v3
+ %cr = or i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_or_ueq_olt:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d0, d1, #8, le
+; CHECK-NEXT: fccmp d2, d3, #8, mi
+; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: ret
+define i32 @select_or_ueq_olt(double %v0, double %v1, double %v2, double %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp ueq double %v0, %v1
+ %c1 = fcmp olt double %v2, %v3
+ %cr = or i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_or_olt_ogt_ueq:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d2, d3, #0, pl
+; CHECK-NEXT: fccmp d4, d5, #4, le
+; CHECK-NEXT: fccmp d4, d5, #1, ne
+; CHECK-NEXT: csel w0, w0, w1, vs
+; CHECK-NEXT: ret
+define i32 @select_or_olt_ogt_ueq(double %v0, double %v1, double %v2, double %v3, double %v4, double %v5, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt double %v0, %v1
+ %c1 = fcmp ogt double %v2, %v3
+ %c2 = fcmp ueq double %v4, %v5
+ %c3 = or i1 %c1, %c0
+ %cr = or i1 %c2, %c3
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: select_or_olt_ueq_ogt:
+; CHECK-LABEL: ; BB#0:
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fccmp d2, d3, #4, pl
+; CHECK-NEXT: fccmp d2, d3, #1, ne
+; CHECK-NEXT: fccmp d4, d5, #0, vc
+; CHECK-NEXT: csel w0, w0, w1, gt
+; CHECK-NEXT: ret
+define i32 @select_or_olt_ueq_ogt(double %v0, double %v1, double %v2, double %v3, double %v4, double %v5, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt double %v0, %v1
+ %c1 = fcmp ueq double %v2, %v3
+ %c2 = fcmp ogt double %v4, %v5
+ %c3 = or i1 %c1, %c0
+ %cr = or i1 %c2, %c3
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; Verify that we correctly promote f16.
+
+; CHECK-LABEL: half_select_and_olt_oge:
+; CHECK-LABEL: ; BB#0:
+; CHECK-DAG: fcvt [[S0:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[S1:s[0-9]+]], h1
+; CHECK-NEXT: fcmp [[S0]], [[S1]]
+; CHECK-DAG: fcvt [[S2:s[0-9]+]], h2
+; CHECK-DAG: fcvt [[S3:s[0-9]+]], h3
+; CHECK-NEXT: fccmp [[S2]], [[S3]], #8, mi
+; CHECK-NEXT: csel w0, w0, w1, ge
+; CHECK-NEXT: ret
+define i32 @half_select_and_olt_oge(half %v0, half %v1, half %v2, half %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt half %v0, %v1
+ %c1 = fcmp oge half %v2, %v3
+ %cr = and i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; CHECK-LABEL: half_select_and_olt_one:
+; CHECK-LABEL: ; BB#0:
+; CHECK-DAG: fcvt [[S0:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[S1:s[0-9]+]], h1
+; CHECK-NEXT: fcmp [[S0]], [[S1]]
+; CHECK-DAG: fcvt [[S2:s[0-9]+]], h2
+; CHECK-DAG: fcvt [[S3:s[0-9]+]], h3
+; CHECK-NEXT: fccmp [[S2]], [[S3]], #4, mi
+; CHECK-NEXT: fccmp [[S2]], [[S3]], #1, ne
+; CHECK-NEXT: csel w0, w0, w1, vc
+; CHECK-NEXT: ret
+define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt half %v0, %v1
+ %c1 = fcmp one half %v2, %v3
+ %cr = and i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+; Also verify that we don't try to generate FCCMPs for f128 and instead fall
+; back to RT calls.
+
+; CHECK-LABEL: f128_select_and_olt_oge:
+; CHECK: bl ___lttf2
+; CHECK: bl ___getf2
+define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 {
+ %c0 = fcmp olt fp128 %v0, %v1
+ %c1 = fcmp oge fp128 %v2, %v3
+ %cr = and i1 %c1, %c0
+ %sel = select i1 %cr, i32 %a, i32 %b
+ ret i32 %sel
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll
index 59147d401a30..3fc0d45f065c 100644
--- a/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -613,6 +613,7 @@ define <1 x i8> @getL() {
; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE
; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF]
+; CHECK-NEXT: ; kill
; Ultimately we should generate str b0, but right now, we match the vector
; variant which does not allow folding the immediate into the store.
; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]]
@@ -654,4 +655,25 @@ define void @uninterestingSub(i8* nocapture %row) #0 {
ret void
}
+@.str.89 = external unnamed_addr constant [12 x i8], align 1
+@.str.90 = external unnamed_addr constant [5 x i8], align 1
+; CHECK-LABEL: test_r274582
+define void @test_r274582() {
+entry:
+ br i1 undef, label %if.then.i, label %if.end.i
+if.then.i:
+ ret void
+if.end.i:
+; CHECK: .loh AdrpAdrp Lloh91, Lloh93
+; CHECK: .loh AdrpLdr Lloh91, Lloh92
+; CHECK: .loh AdrpLdrGot Lloh93, Lloh95
+; CHECK: .loh AdrpLdrGot Lloh94, Lloh96
+ %mul.i.i.i = fmul double undef, 1.000000e-06
+ %add.i.i.i = fadd double undef, %mul.i.i.i
+ %sub.i.i = fsub double %add.i.i.i, undef
+ call void (i8*, ...) @callee(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.89, i64 0, i64 0), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str.90, i64 0, i64 0), double %sub.i.i)
+ unreachable
+}
+declare void @callee(i8* nocapture readonly, ...)
+
attributes #0 = { "target-cpu"="cyclone" }
diff --git a/test/CodeGen/AArch64/arm64-const-addr.ll b/test/CodeGen/AArch64/arm64-const-addr.ll
index ffc153344d3a..e55db2904489 100644
--- a/test/CodeGen/AArch64/arm64-const-addr.ll
+++ b/test/CodeGen/AArch64/arm64-const-addr.ll
@@ -5,8 +5,8 @@
; Test if the constant base address gets only materialized once.
define i32 @test1() nounwind {
; CHECK-LABEL: test1
-; CHECK: movz w8, #0x40f, lsl #16
-; CHECK-NEXT: movk w8, #0xc000
+; CHECK: mov w8, #68091904
+; CHECK-NEXT: movk w8, #49152
; CHECK-NEXT: ldp w9, w10, [x8, #4]
; CHECK: ldr w8, [x8, #12]
%at = inttoptr i64 68141056 to %T*
diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index b8da39910312..ed061122f311 100644
--- a/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -23,8 +23,8 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d
; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d
-; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA0]].4s
-; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA2]].4s
+; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA2]].4s
+; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA0]].4s
; CHECK: xtn v0.8b, v[[TMP1]].8h
%tmp1 = load <8 x double>, <8 x double>* %ptr
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
diff --git a/test/CodeGen/AArch64/arm64-csldst-mmo.ll b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
new file mode 100644
index 000000000000..0b8f7a19b484
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
@@ -0,0 +1,25 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+
+@G = external global [0 x i32], align 4
+
+; Check that MMOs are added to epilogue callee-save restore loads so
+; that the store to G is not considered dependent on the callee-save
+; loads.
+;
+; CHECK: Before post-MI-sched:
+; CHECK-LABEL: # Machine code for function test1:
+; CHECK: SU(2): STRWui %WZR
+; CHECK: SU(3): %X21<def>, %X20<def> = LDPXi %SP
+; CHECK: Predecessors:
+; CHECK-NEXT: out SU(0)
+; CHECK-NEXT: out SU(0)
+; CHECK-NEXT: ch SU(0)
+; CHECK-NEXT: Successors:
+define void @test1() {
+entry:
+ tail call void asm sideeffect "nop", "~{x20},~{x21},~{x22},~{x23}"() nounwind
+ store i32 0, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @G, i64 0, i64 0), align 4
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-detect-vec-redux.ll b/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
new file mode 100644
index 000000000000..68130f1c9f88
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=arm64-darwin-unknown < %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
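+; Compile-only test: there is no FileCheck invocation, so llc just has to
+; handle this NEON intrinsic sequence without crashing or asserting.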
+; Function Attrs: nounwind readnone
+define i32 @dotests_56() #0 {
+entry:
+ %vqshrn_n4 = tail call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> zeroinitializer, i32 19)
+ %shuffle.i109 = shufflevector <2 x i32> %vqshrn_n4, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
+ %neg = xor <4 x i32> %shuffle.i109, <i32 undef, i32 -1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %neg, <4 x i32> undef, <2 x i32> <i32 1, i32 undef>
+ %mul = mul <2 x i32> %shuffle, <i32 add (i32 extractelement (<2 x i32> bitcast (<1 x i64> <i64 -4264345899313889281> to <2 x i32>), i32 0), i32 sub (i32 0, i32 extractelement (<2 x i32> bitcast (<1 x i64> <i64 -9223231295071453185> to <2 x i32>), i32 0))), i32 undef>
+ %shuffle27 = shufflevector <2 x i32> %mul, <2 x i32> undef, <4 x i32> zeroinitializer
+ %0 = bitcast <4 x i32> %shuffle27 to <8 x i16>
+ %shuffle.i108 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ %vqshrn_n38 = tail call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %shuffle.i108, i32 1)
+ %shuffle.i = shufflevector <8 x i8> %vqshrn_n38, <8 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast <16 x i8> %shuffle.i to <2 x i64>
+ %vpaddq_v2.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> undef, <2 x i64> %1) #2
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> undef, <2 x i32> undef) #2
+ %vqdmlal_v3.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %vpaddq_v2.i, <2 x i64> %vqdmlal2.i) #2
+ %vmovn.i = trunc <2 x i64> %vqdmlal_v3.i to <2 x i32>
+ %vqdmulh_v2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %vmovn.i, <2 x i32> zeroinitializer) #2
+ %2 = bitcast <2 x i32> %vqdmulh_v2.i to <1 x i64>
+ %vget_lane = extractelement <1 x i64> %2, i32 0
+ %cmp = icmp ne i64 %vget_lane, -7395147708962464393
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) #1
+
+; Function Attrs: nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) #1
+
+; Function Attrs: nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) #1
+
+attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
index 020c07c739d9..f00efbcea780 100644
--- a/test/CodeGen/AArch64/arm64-extern-weak.ll
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - < %s | FileCheck %s
-; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK-STATIC %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
declare extern_weak i32 @var()
@@ -13,11 +13,6 @@ define i32()* @foo() {
; CHECK: adrp x[[VAR:[0-9]+]], :got:var
; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
-; CHECK-STATIC: .LCPI0_0:
-; CHECK-STATIC-NEXT: .xword var
-; CHECK-STATIC: adrp x[[VAR:[0-9]+]], .LCPI0_0
-; CHECK-STATIC: ldr x0, [x[[VAR]], :lo12:.LCPI0_0]
-
; In the large model, the usual relocations are absolute and can
; materialise 0.
; CHECK-LARGE: movz x0, #:abs_g3:var
@@ -36,11 +31,6 @@ define i32* @bar() {
; CHECK: add x0, [[ARR_VAR]], #20
ret i32* %addr
-; CHECK-STATIC: .LCPI1_0:
-; CHECK-STATIC-NEXT: .xword arr_var
-; CHECK-STATIC: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, :lo12:.LCPI1_0]
-; CHECK-STATIC: add x0, [[BASE]], #20
-
; In the large model, the usual relocations are absolute and can
; materialise 0.
; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
@@ -56,9 +46,6 @@ define i32* @wibble() {
; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
-; CHECK-STATIC: adrp [[BASE:x[0-9]+]], defined_weak_var
-; CHECK-STATIC: add x0, [[BASE]], :lo12:defined_weak_var
-
; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
diff --git a/test/CodeGen/AArch64/arm64-extract.ll b/test/CodeGen/AArch64/arm64-extract.ll
index 01984662d23a..6e07c4ce4ccb 100644
--- a/test/CodeGen/AArch64/arm64-extract.ll
+++ b/test/CodeGen/AArch64/arm64-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc -aarch64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: llc -verify-machineinstrs < %s \
; RUN: -march=arm64 | FileCheck %s
define i64 @ror_i64(i64 %in) {
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
index e4dc948c4603..9dae7a6f5b69 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
@@ -6,10 +6,10 @@
; Load an address with an offset larger than LDR imm can handle
define i32 @foo() nounwind {
entry:
-; CHECK: @foo
+; CHECK-LABEL: @foo
; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
-; CHECK: movz x[[REG2:[0-9]+]], #0x4e20
+; CHECK: mov x[[REG2:[0-9]+]], #20000
; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
; CHECK: ldr w0, [x[[REG3]]]
; CHECK: ret
@@ -19,10 +19,10 @@ entry:
define i64 @foo2() nounwind {
entry:
-; CHECK: @foo2
+; CHECK-LABEL: @foo2
; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE
; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF]
-; CHECK: movz x[[REG2:[0-9]+]], #0x9c40
+; CHECK: mov x[[REG2:[0-9]+]], #40000
; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
; CHECK: ldr x0, [x[[REG3]]]
; CHECK: ret
@@ -36,10 +36,10 @@ entry:
define signext i8 @foo3() nounwind ssp {
entry:
-; CHECK: @foo3
-; CHECK: movz x[[REG:[0-9]+]], #0xb3a, lsl #32
-; CHECK: movk x[[REG]], #0x73ce, lsl #16
-; CHECK: movk x[[REG]], #0x2ff2
+; CHECK-LABEL: @foo3
+; CHECK: mov x[[REG:[0-9]+]], #12343736008704
+; CHECK: movk x[[REG]], #29646, lsl #16
+; CHECK: movk x[[REG]], #12274
%0 = load i8*, i8** @pd2, align 8
%arrayidx = getelementptr inbounds i8, i8* %0, i64 12345678901234
%1 = load i8, i8* %arrayidx, align 1
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
index a506607a0a5d..bdc24aea2144 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -1,5 +1,5 @@
; This test should cause the TargetMaterializeAlloca to be invoked
-; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort=1 -verify-machineinstrs -mtriple=arm64-apple-darwin -disable-fp-elim < %s | FileCheck %s
%struct.S1Ty = type { i64 }
%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
@@ -14,7 +14,7 @@ entry:
define void @main() nounwind {
entry:
; CHECK: main
-; CHECK: mov x29, sp
+; CHECK: add x29, sp, #16
; CHECK: mov [[REG:x[0-9]+]], sp
; CHECK-NEXT: add x0, [[REG]], #8
%E = alloca %struct.S2Ty, align 4
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
index d6957f9191e2..59c4e38e5467 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-call.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
@@ -1,6 +1,6 @@
-; RUN: llc -O0 -fast-isel-abort=2 -code-model=small -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
-; RUN: llc -O0 -fast-isel-abort=2 -code-model=large -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s --check-prefix=LARGE
-; RUN: llc -O0 -fast-isel-abort=2 -code-model=small -verify-machineinstrs -mtriple=aarch64_be-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -O0 -fast-isel-abort=2 -code-model=small -verify-machineinstrs -disable-fp-elim -mtriple=arm64-apple-darwin < %s | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort=2 -code-model=large -verify-machineinstrs -disable-fp-elim -mtriple=arm64-apple-darwin < %s | FileCheck %s --check-prefix=LARGE
+; RUN: llc -O0 -fast-isel-abort=2 -code-model=small -verify-machineinstrs -disable-fp-elim -mtriple=aarch64_be-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-BE
define void @call0() nounwind {
entry:
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
index ab29824ccb60..85d000b8606b 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
@@ -18,8 +18,8 @@ entry:
; CHECK: @Rand
; CHECK: adrp [[REG1:x[0-9]+]], _seed@GOTPAGE
; CHECK: ldr [[REG2:x[0-9]+]], {{\[}}[[REG1]], _seed@GOTPAGEOFF{{\]}}
-; CHECK: movz [[REG3:x[0-9]+]], #0x3619
-; CHECK: movz [[REG4:x[0-9]+]], #0x51d
+; CHECK: mov [[REG3:x[0-9]+]], #13849
+; CHECK: mov [[REG4:x[0-9]+]], #1309
; CHECK: ldr [[REG5:x[0-9]+]], {{\[}}[[REG2]]{{\]}}
; CHECK: mul [[REG6:x[0-9]+]], [[REG5]], [[REG4]]
; CHECK: add [[REG7:x[0-9]+]], [[REG6]], [[REG3]]
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index bb2889eaf4be..a8f30ad4777d 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -8,7 +8,7 @@ define void @t1() {
; ARM64: adrp x8, _message@PAGE
; ARM64: add x0, x8, _message@PAGEOFF
; ARM64: mov w9, wzr
-; ARM64: movz x2, #0x50
+; ARM64: mov x2, #80
; ARM64: uxtb w1, w9
; ARM64: bl _memset
call void @llvm.memset.p0i8.i64(i8* getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i8 0, i64 80, i32 16, i1 false)
@@ -23,7 +23,7 @@ define void @t2() {
; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x8, _message@PAGE
; ARM64: add x1, x8, _message@PAGEOFF
-; ARM64: movz x2, #0x50
+; ARM64: mov x2, #80
; ARM64: bl _memcpy
call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 80, i32 16, i1 false)
ret void
@@ -37,7 +37,7 @@ define void @t3() {
; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
; ARM64: adrp x8, _message@PAGE
; ARM64: add x1, x8, _message@PAGEOFF
-; ARM64: movz x2, #0x14
+; ARM64: mov x2, #20
; ARM64: bl _memmove
call void @llvm.memmove.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 20, i32 16, i1 false)
ret void
@@ -137,7 +137,7 @@ define void @t8() {
define void @test_distant_memcpy(i8* %dst) {
; ARM64-LABEL: test_distant_memcpy:
; ARM64: mov [[ARRAY:x[0-9]+]], sp
-; ARM64: movz [[OFFSET:x[0-9]+]], #0x1f40
+; ARM64: mov [[OFFSET:x[0-9]+]], #8000
; ARM64: add x[[ADDR:[0-9]+]], [[ARRAY]], [[OFFSET]]
; ARM64: ldrb [[BYTE:w[0-9]+]], [x[[ADDR]]]
; ARM64: strb [[BYTE]], [x0]
diff --git a/test/CodeGen/AArch64/arm64-fcopysign.ll b/test/CodeGen/AArch64/arm64-fcopysign.ll
index feffd41f002a..9bcc8eeca219 100644
--- a/test/CodeGen/AArch64/arm64-fcopysign.ll
+++ b/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -5,7 +5,7 @@
define float @test1(float %x, float %y) nounwind {
entry:
; CHECK-LABEL: test1:
-; CHECK: movi.4s v2, #0x80, lsl #24
+; CHECK: movi.4s v2, #128, lsl #24
; CHECK: bit.16b v0, v1, v2
%0 = tail call float @copysignf(float %x, float %y) nounwind readnone
ret float %0
@@ -37,7 +37,7 @@ define float @test4() nounwind {
entry:
; CHECK-LABEL: test4:
; CHECK: fcvt s0, d0
-; CHECK: movi.4s v[[CONST:[0-9]+]], #0x80, lsl #24
+; CHECK: movi.4s v[[CONST:[0-9]+]], #128, lsl #24
; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]]
%0 = tail call double (...) @bar() nounwind
%1 = fptrunc double %0 to float
diff --git a/test/CodeGen/AArch64/arm64-fma-combines.ll b/test/CodeGen/AArch64/arm64-fma-combines.ll
new file mode 100644
index 000000000000..ab875c06cc62
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -0,0 +1,136 @@
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
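+; Check that fast-math fmul/fadd sequences are combined into fused multiply-add
+; (fmla) instructions for the scalar, 2d, 2s and 4s cases below.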
+define void @foo_2d(double* %src) {
+; CHECK-LABEL: %entry
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
+ %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
+ %tmp = bitcast double* %arrayidx1 to <2 x double>*
+ %tmp1 = load double, double* %arrayidx2, align 8
+ %tmp2 = load double, double* %arrayidx1, align 8
+ %fmul = fmul fast double %tmp1, %tmp1
+ %fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B
+ %fadd = fadd fast double %fmul, %fmul2
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
+ %tmp3 = load double, double* %arrayidx3, align 8
+ %add = fadd fast double %tmp3, %tmp3
+ %mul = fmul fast double %add, %fadd
+ %e1 = insertelement <2 x double> undef, double %add, i32 0
+ %e2 = insertelement <2 x double> %e1, double %add, i32 1
+ %add2 = fadd fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00>
+ %e3 = insertelement <2 x double> undef, double %mul, i32 0
+ %e4 = insertelement <2 x double> %e3, double %mul, i32 1
+ %mul2 = fmul fast <2 x double> %add2, <double 3.000000e+00, double -3.000000e+00>
+ %e5 = insertelement <2 x double> undef, double %add, i32 0
+ %e6 = insertelement <2 x double> %e5, double %add, i32 1
+ %add3 = fadd fast <2 x double> %mul2, <double 3.000000e+00, double -3.000000e+00>
+ %mulx = fmul fast <2 x double> %add2, %e2
+ %addx = fadd fast <2 x double> %mulx, %e4
+ %e7 = insertelement <2 x double> undef, double %mul, i32 0
+ %e8 = insertelement <2 x double> %e7, double %mul, i32 1
+ %e9 = fmul fast <2 x double> %addx, %add3
+ store <2 x double> %e9, <2 x double>* %tmp, align 8
+ %e10 = extractelement <2 x double> %add3, i32 0
+ %mul3 = fmul fast double %mul, %e10
+ %add4 = fadd fast double %mul3, %mul
+ store double %add4, double* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+define void @foo_2s(float* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+ %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+ %tmp = bitcast float* %arrayidx1 to <2 x float>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+ %tmp1 = load float, float* %arrayidx3, align 8
+ %add = fadd fast float %tmp1, %tmp1
+ %mul = fmul fast float %add, %add
+ %e1 = insertelement <2 x float> undef, float %add, i32 0
+ %e2 = insertelement <2 x float> %e1, float %add, i32 1
+ %add2 = fadd fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00>
+ %e3 = insertelement <2 x float> undef, float %mul, i32 0
+ %e4 = insertelement <2 x float> %e3, float %mul, i32 1
+ %mul2 = fmul fast <2 x float> %add2, <float 3.000000e+00, float -3.000000e+00>
+ %e5 = insertelement <2 x float> undef, float %add, i32 0
+ %e6 = insertelement <2 x float> %e5, float %add, i32 1
+ %add3 = fadd fast <2 x float> %mul2, <float 3.000000e+00, float -3.000000e+00>
+ %mulx = fmul fast <2 x float> %add2, %e2
+ %addx = fadd fast <2 x float> %mulx, %e4
+ %e7 = insertelement <2 x float> undef, float %mul, i32 0
+ %e8 = insertelement <2 x float> %e7, float %mul, i32 1
+ %e9 = fmul fast <2 x float> %addx, %add3
+ store <2 x float> %e9, <2 x float>* %tmp, align 8
+ %e10 = extractelement <2 x float> %add3, i32 0
+ %mul3 = fmul fast float %mul, %e10
+ %add4 = fadd fast float %mul3, %mul
+ store float %add4, float* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+define void @foo_4s(float* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+ %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+ %tmp = bitcast float* %arrayidx1 to <4 x float>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+ %tmp1 = load float, float* %arrayidx3, align 8
+ %add = fadd fast float %tmp1, %tmp1
+ %mul = fmul fast float %add, %add
+ %e1 = insertelement <4 x float> undef, float %add, i32 0
+ %e2 = insertelement <4 x float> %e1, float %add, i32 1
+ %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %e3 = insertelement <4 x float> undef, float %mul, i32 0
+ %e4 = insertelement <4 x float> %e3, float %mul, i32 1
+ %mul2 = fmul fast <4 x float> %add2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %e5 = insertelement <4 x float> undef, float %add, i32 0
+ %e6 = insertelement <4 x float> %e5, float %add, i32 1
+ %add3 = fadd fast <4 x float> %mul2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %mulx = fmul fast <4 x float> %add2, %e2
+ %addx = fadd fast <4 x float> %mulx, %e4
+ %e7 = insertelement <4 x float> undef, float %mul, i32 0
+ %e8 = insertelement <4 x float> %e7, float %mul, i32 1
+ %e9 = fmul fast <4 x float> %addx, %add3
+ store <4 x float> %e9, <4 x float>* %tmp, align 8
+ %e10 = extractelement <4 x float> %add3, i32 0
+ %mul3 = fmul fast float %mul, %e10
+ store float %mul3, float* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fml-combines.ll b/test/CodeGen/AArch64/arm64-fml-combines.ll
new file mode 100644
index 000000000000..840d1dcbf060
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fml-combines.ll
@@ -0,0 +1,128 @@
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
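+; Check that fast-math fmul/fsub sequences are combined into fused multiply-subtract
+; (fmls) instructions for the scalar, 2d, 2s and 4s cases below.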
+define void @foo_2d(double* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
+ %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
+ %tmp = bitcast double* %arrayidx1 to <2 x double>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
+ %tmp1 = load double, double* %arrayidx3, align 8
+ %add = fadd fast double %tmp1, %tmp1
+ %mul = fmul fast double %add, %add
+ %e1 = insertelement <2 x double> undef, double %add, i32 0
+ %e2 = insertelement <2 x double> %e1, double %add, i32 1
+ %sub2 = fsub fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00>
+ %e3 = insertelement <2 x double> undef, double %mul, i32 0
+ %e4 = insertelement <2 x double> %e3, double %mul, i32 1
+ %mul2 = fmul fast <2 x double> %sub2, <double 3.000000e+00, double -3.000000e+00>
+ %e5 = insertelement <2 x double> undef, double %add, i32 0
+ %e6 = insertelement <2 x double> %e5, double %add, i32 1
+ %sub3 = fsub fast <2 x double> <double 3.000000e+00, double -3.000000e+00>, %mul2
+ %mulx = fmul fast <2 x double> %sub2, %e2
+ %subx = fsub fast <2 x double> %e4, %mulx
+ %e7 = insertelement <2 x double> undef, double %mul, i32 0
+ %e8 = insertelement <2 x double> %e7, double %mul, i32 1
+ %e9 = fmul fast <2 x double> %subx, %sub3
+ store <2 x double> %e9, <2 x double>* %tmp, align 8
+ %e10 = extractelement <2 x double> %sub3, i32 0
+ %mul3 = fmul fast double %mul, %e10
+ %sub4 = fsub fast double %mul, %mul3
+ store double %sub4, double* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+define void @foo_2s(float* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+ %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+ %tmp = bitcast float* %arrayidx1 to <2 x float>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+ %tmp1 = load float, float* %arrayidx3, align 8
+ %add = fadd fast float %tmp1, %tmp1
+ %mul = fmul fast float %add, %add
+ %e1 = insertelement <2 x float> undef, float %add, i32 0
+ %e2 = insertelement <2 x float> %e1, float %add, i32 1
+ %add2 = fsub fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00>
+ %e3 = insertelement <2 x float> undef, float %mul, i32 0
+ %e4 = insertelement <2 x float> %e3, float %mul, i32 1
+ %mul2 = fmul fast <2 x float> %add2, <float 3.000000e+00, float -3.000000e+00>
+ %e5 = insertelement <2 x float> undef, float %add, i32 0
+ %e6 = insertelement <2 x float> %e5, float %add, i32 1
+ %add3 = fsub fast <2 x float> <float 3.000000e+00, float -3.000000e+00>, %mul2
+ %mulx = fmul fast <2 x float> %add2, %e2
+ %addx = fsub fast <2 x float> %e4, %mulx
+ %e7 = insertelement <2 x float> undef, float %mul, i32 0
+ %e8 = insertelement <2 x float> %e7, float %mul, i32 1
+ %e9 = fmul fast <2 x float> %addx, %add3
+ store <2 x float> %e9, <2 x float>* %tmp, align 8
+ %e10 = extractelement <2 x float> %add3, i32 0
+ %mul3 = fmul fast float %mul, %e10
+ %add4 = fsub fast float %mul, %mul3
+ store float %add4, float* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+define void @foo_4s(float* %src) {
+entry:
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+ %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+ %tmp = bitcast float* %arrayidx1 to <4 x float>*
+ br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+ %tmp1 = load float, float* %arrayidx3, align 8
+ %add = fadd fast float %tmp1, %tmp1
+ %mul = fmul fast float %add, %add
+ %e1 = insertelement <4 x float> undef, float %add, i32 0
+ %e2 = insertelement <4 x float> %e1, float %add, i32 1
+ %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %e3 = insertelement <4 x float> undef, float %mul, i32 0
+ %e4 = insertelement <4 x float> %e3, float %mul, i32 1
+ %mul2 = fmul fast <4 x float> %add2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+ %e5 = insertelement <4 x float> undef, float %add, i32 0
+ %e6 = insertelement <4 x float> %e5, float %add, i32 1
+ %add3 = fsub fast <4 x float> <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>, %mul2
+ %mulx = fmul fast <4 x float> %add2, %e2
+ %addx = fsub fast <4 x float> %e4, %mulx
+ %e7 = insertelement <4 x float> undef, float %mul, i32 0
+ %e8 = insertelement <4 x float> %e7, float %mul, i32 1
+ %e9 = fmul fast <4 x float> %addx, %add3
+ store <4 x float> %e9, <4 x float>* %tmp, align 8
+ %e10 = extractelement <4 x float> %add3, i32 0
+ %mul3 = fmul fast float %mul, %e10
+ store float %mul3, float* %arrayidx2, align 8
+ %exitcond = icmp eq i64 %indvars.iv.next, 25
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll
index 097fe2ca6ed9..bcb196e40456 100644
--- a/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/test/CodeGen/AArch64/arm64-fp128.ll
@@ -174,11 +174,11 @@ define i32 @test_br_cc() {
iftrue:
ret i32 42
; CHECK-NEXT: BB#
-; CHECK-NEXT: movz w0, #0x2a
+; CHECK-NEXT: mov w0, #42
; CHECK: ret
iffalse:
ret i32 29
-; CHECK: movz w0, #0x1d
+; CHECK: mov w0, #29
; CHECK: ret
}
diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll
index 895bfe4b3915..caaf8615cd4a 100644
--- a/test/CodeGen/AArch64/arm64-hello.ll
+++ b/test/CodeGen/AArch64/arm64-hello.ll
@@ -1,28 +1,25 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-post-ra -disable-fp-elim | FileCheck %s
; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX
; CHECK-LABEL: main:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: stp x29, x30, [sp, #16]
+; CHECK-NEXT: add x29, sp, #16
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK: adrp x0, L_.str@PAGE
; CHECK: add x0, x0, L_.str@PAGEOFF
; CHECK-NEXT: bl _puts
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ldp x29, x30, [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
; CHECK-LINUX-LABEL: main:
-; CHECK-LINUX: stp x29, x30, [sp, #-16]!
-; CHECK-LINUX-NEXT: mov x29, sp
-; CHECK-LINUX-NEXT: sub sp, sp, #16
-; CHECK-LINUX-NEXT: stur wzr, [x29, #-4]
+; CHECK-LINUX: str x30, [sp, #-16]!
+; CHECK-LINUX-NEXT: str wzr, [sp, #12]
; CHECK-LINUX: adrp x0, .L.str
; CHECK-LINUX: add x0, x0, :lo12:.L.str
; CHECK-LINUX-NEXT: bl puts
-; CHECK-LINUX-NEXT: mov sp, x29
-; CHECK-LINUX-NEXT: ldp x29, x30, [sp], #16
+; CHECK-LINUX-NEXT: ldr x30, [sp], #16
; CHECK-LINUX-NEXT: ret
@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
index ac6e8a7731c6..4d4adb10d556 100644
--- a/test/CodeGen/AArch64/arm64-inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios -aarch64-neon-syntax=apple -no-integrated-as -disable-post-ra | FileCheck %s
; rdar://9167275
@@ -232,3 +232,17 @@ define void @test_zero_reg(i32* %addr) {
ret void
}
+
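+; Check that vector operands constrained to an explicit V register ({v14}) are
+; assigned that register.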
+define <2 x float> @test_vreg_64bit(<2 x float> %in) nounwind {
+ ; CHECK-LABEL: test_vreg_64bit:
+ %1 = tail call <2 x float> asm sideeffect "fadd ${0}.2s, ${1}.2s, ${1}.2s", "={v14},w"(<2 x float> %in) nounwind
+  ; CHECK: fadd v14.2s, v0.2s, v0.2s
+ ret <2 x float> %1
+}
+
+define <4 x float> @test_vreg_128bit(<4 x float> %in) nounwind {
+ ; CHECK-LABEL: test_vreg_128bit:
+ %1 = tail call <4 x float> asm sideeffect "fadd ${0}.4s, ${1}.4s, ${1}.4s", "={v14},w"(<4 x float> %in) nounwind
+  ; CHECK: fadd v14.4s, v0.4s, v0.4s
+ ret <4 x float> %1
+}
diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll
index c65cf95be2e5..dee034483541 100644
--- a/test/CodeGen/AArch64/arm64-join-reserved.ll
+++ b/test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -5,7 +5,7 @@ target triple = "arm64-apple-macosx10"
; A move isn't necessary.
; <rdar://problem/11492712>
; CHECK-LABEL: g:
-; CHECK: str xzr, [sp, #-16]!
+; CHECK: str xzr, [sp]
; CHECK: bl
; CHECK: ret
define void @g() nounwind ssp {
diff --git a/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/test/CodeGen/AArch64/arm64-ldp-cluster.ll
new file mode 100644
index 000000000000..0cfbe5958f4d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -0,0 +1,150 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s
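+; The checks below match the MI scheduler's "Cluster ld/st" debug output, hence
+; the asserts requirement and -debug-only=misched above.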
+
+; Test ldr clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldr_int:BB#0
+; CHECK: Cluster ld/st SU(1) - SU(2)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRWui
+; EXYNOS: ********** MI Scheduling **********
+; EXYNOS-LABEL: ldr_int:BB#0
+; EXYNOS: Cluster ld/st SU(1) - SU(2)
+; EXYNOS: SU(1): %vreg{{[0-9]+}}<def> = LDRWui
+; EXYNOS: SU(2): %vreg{{[0-9]+}}<def> = LDRWui
+define i32 @ldr_int(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 1
+ %tmp1 = load i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 2
+ %tmp2 = load i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
+
+; Test ldpsw clustering
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_sext_int:BB#0
+; CHECK: Cluster ld/st SU(1) - SU(2)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRSWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRSWui
+; EXYNOS: ********** MI Scheduling **********
+; EXYNOS-LABEL: ldp_sext_int:BB#0
+; EXYNOS: Cluster ld/st SU(1) - SU(2)
+; EXYNOS: SU(1): %vreg{{[0-9]+}}<def> = LDRSWui
+; EXYNOS: SU(2): %vreg{{[0-9]+}}<def> = LDRSWui
+define i64 @ldp_sext_int(i32* %p) nounwind {
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = sext i32 %tmp to i64
+ %sexttmp1 = sext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ ret i64 %add
+}
+
+; Test ldur clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldur_int:BB#0
+; CHECK: Cluster ld/st SU(2) - SU(1)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDURWi
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDURWi
+; EXYNOS: ********** MI Scheduling **********
+; EXYNOS-LABEL: ldur_int:BB#0
+; EXYNOS: Cluster ld/st SU(2) - SU(1)
+; EXYNOS: SU(1): %vreg{{[0-9]+}}<def> = LDURWi
+; EXYNOS: SU(2): %vreg{{[0-9]+}}<def> = LDURWi
+define i32 @ldur_int(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 -1
+ %tmp1 = load i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 -2
+ %tmp2 = load i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
+
+; Test sext + zext clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3): %vreg{{[0-9]+}}<def> = LDRSWui
+; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+; EXYNOS: ********** MI Scheduling **********
+; EXYNOS-LABEL: ldp_half_sext_zext_int:BB#0
+; EXYNOS: Cluster ld/st SU(3) - SU(4)
+; EXYNOS: SU(3): %vreg{{[0-9]+}}<def> = LDRSWui
+; EXYNOS: SU(4): %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
+ %tmp0 = load i64, i64* %q, align 4
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = sext i32 %tmp to i64
+ %sexttmp1 = zext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ %add1 = add nsw i64 %add, %tmp0
+ ret i64 %add1
+}
+
+; Test zext + sext clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+; CHECK: SU(4): %vreg{{[0-9]+}}<def> = LDRSWui
+; EXYNOS: ********** MI Scheduling **********
+; EXYNOS-LABEL: ldp_half_zext_sext_int:BB#0
+; EXYNOS: Cluster ld/st SU(3) - SU(4)
+; EXYNOS: SU(3): %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+; EXYNOS: SU(4): %vreg{{[0-9]+}}<def> = LDRSWui
+define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
+ %tmp0 = load i64, i64* %q, align 4
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = zext i32 %tmp to i64
+ %sexttmp1 = sext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ %add1 = add nsw i64 %add, %tmp0
+ ret i64 %add1
+}
+
+; Verify we don't cluster volatile loads.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldr_int_volatile:BB#0
+; CHECK-NOT: Cluster ld/st
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRWui
+; CHECK: SU(2): %vreg{{[0-9]+}}<def> = LDRWui
+; EXYNOS: ********** MI Scheduling **********
+; EXYNOS-LABEL: ldr_int_volatile:BB#0
+; EXYNOS-NOT: Cluster ld/st
+; EXYNOS: SU(1): %vreg{{[0-9]+}}<def> = LDRWui
+; EXYNOS: SU(2): %vreg{{[0-9]+}}<def> = LDRWui
+define i32 @ldr_int_volatile(i32* %a) nounwind {
+ %p1 = getelementptr inbounds i32, i32* %a, i32 1
+ %tmp1 = load volatile i32, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32, i32* %a, i32 2
+ %tmp2 = load volatile i32, i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
+
+; Test ldq clustering (no clustering for Exynos).
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldq_cluster:BB#0
+; CHECK: Cluster ld/st SU(1) - SU(3)
+; CHECK: SU(1): %vreg{{[0-9]+}}<def> = LDRQui
+; CHECK: SU(3): %vreg{{[0-9]+}}<def> = LDRQui
+; EXYNOS: ********** MI Scheduling **********
+; EXYNOS-LABEL: ldq_cluster:BB#0
+; EXYNOS-NOT: Cluster ld/st
+define <2 x i64> @ldq_cluster(i64* %p) {
+ %a1 = bitcast i64* %p to <2 x i64>*
+ %tmp1 = load <2 x i64>, <2 x i64>* %a1, align 8
+ %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2
+ %a2 = bitcast i64* %add.ptr2 to <2 x i64>*
+ %tmp2 = add nsw <2 x i64> %tmp1, %tmp1
+ %tmp3 = load <2 x i64>, <2 x i64>* %a2, align 8
+ %res = mul nsw <2 x i64> %tmp2, %tmp3
+ ret <2 x i64> %res
+}
diff --git a/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll
index 2f91ce252336..23e90100fb94 100644
--- a/test/CodeGen/AArch64/arm64-memcpy-inline.ll
+++ b/test/CodeGen/AArch64/arm64-memcpy-inline.ll
@@ -40,7 +40,7 @@ entry:
define void @t2(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t2:
-; CHECK: movz [[REG3:w[0-9]+]]
+; CHECK: mov [[REG3:w[0-9]+]]
; CHECK: movk [[REG3]],
; CHECK: str [[REG3]], [x0, #32]
; CHECK: ldp [[DEST1:q[0-9]+]], [[DEST2:q[0-9]+]], [x{{[0-9]+}}]
@@ -75,9 +75,9 @@ define void @t5(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t5:
; CHECK: strb wzr, [x0, #6]
-; CHECK: movz [[REG7:w[0-9]+]], #0x5453
+; CHECK: mov [[REG7:w[0-9]+]], #21587
; CHECK: strh [[REG7]], [x0, #4]
-; CHECK: movz [[REG8:w[0-9]+]],
+; CHECK: mov [[REG8:w[0-9]+]],
; CHECK: movk [[REG8]],
; CHECK: str [[REG8]], [x0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index 6db21043f670..8b270abef59a 100644
--- a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -1,5 +1,6 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s
;
; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
; much higher than the ADD instructions in order to hide latency. When not
diff --git a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
index 770521b75280..292fbb744cea 100644
--- a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
+++ b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
@@ -9,6 +9,9 @@
; CHECK: Successors:
; CHECK-NEXT: val SU(5): Latency=4 Reg=%vreg2
; CHECK-NEXT: ch SU(4): Latency=0
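+; Also check that the zero store SU(3) keeps a chain successor edge to the store SU(4).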
+; CHECK: SU(3): STRWui %WZR, %vreg0, 0; mem:ST4[%ptr1] GPR64common:%vreg0
+; CHECK: Successors:
+; CHECK: ch SU(4): Latency=0
; CHECK: SU(4): STRWui %WZR, %vreg1, 0; mem:ST4[%ptr2] GPR64common:%vreg1
; CHECK: SU(5): %W0<def> = COPY %vreg2; GPR32:%vreg2
; CHECK: ** ScheduleDAGMI::schedule picking next node
diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll
new file mode 100644
index 000000000000..d4e8aa1a0a06
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll
@@ -0,0 +1,23 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+
+
+@G1 = common global [100 x i32] zeroinitializer, align 4
+@G2 = common global [100 x i32] zeroinitializer, align 4
+
+; Check that no scheduling dependencies are created between the paired loads and the store during post-RA MI scheduling.
+;
+; CHECK-LABEL: # Machine code for function foo: Properties: <Post SSA
+; CHECK: SU(2): %W{{[0-9]+}}<def>, %W{{[0-9]+}}<def> = LDPWi
+; CHECK: Successors:
+; CHECK-NOT: ch SU(4)
+; CHECK: SU(3)
+; CHECK: SU(4): STRWui %WZR, %X{{[0-9]+}}
+define i32 @foo() {
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G2, i64 0, i64 0), align 4
+ %1 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G2, i64 0, i64 1), align 4
+ store i32 0, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G1, i64 0, i64 0), align 4
+ %add = add nsw i32 %1, %0
+ ret i32 %add
+}
diff --git a/test/CodeGen/AArch64/arm64-movi.ll b/test/CodeGen/AArch64/arm64-movi.ll
index 2cd368d909dc..344e2224ab43 100644
--- a/test/CodeGen/AArch64/arm64-movi.ll
+++ b/test/CodeGen/AArch64/arm64-movi.ll
@@ -7,21 +7,21 @@
; 64-bit immed with 32-bit pattern size, rotated by 0.
define i64 @test64_32_rot0() nounwind {
; CHECK-LABEL: test64_32_rot0:
-; CHECK: orr x0, xzr, #0x700000007
+; CHECK: mov x0, #30064771079
ret i64 30064771079
}
; 64-bit immed with 32-bit pattern size, rotated by 2.
define i64 @test64_32_rot2() nounwind {
; CHECK-LABEL: test64_32_rot2:
-; CHECK: orr x0, xzr, #0xc0000003c0000003
+; CHECK: mov x0, #-4611686002321260541
ret i64 13835058071388291075
}
; 64-bit immed with 4-bit pattern size, rotated by 3.
define i64 @test64_4_rot3() nounwind {
; CHECK-LABEL: test64_4_rot3:
-; CHECK: orr x0, xzr, #0xeeeeeeeeeeeeeeee
+; CHECK: mov x0, #-1229782938247303442
ret i64 17216961135462248174
}
@@ -35,7 +35,7 @@ define i32 @test32_32_rot16() nounwind {
; 32-bit immed with 2-bit pattern size, rotated by 1.
define i32 @test32_2_rot1() nounwind {
; CHECK-LABEL: test32_2_rot1:
-; CHECK: orr w0, wzr, #0xaaaaaaaa
+; CHECK: mov w0, #-1431655766
ret i32 2863311530
}
@@ -45,30 +45,30 @@ define i32 @test32_2_rot1() nounwind {
define i32 @movz() nounwind {
; CHECK-LABEL: movz:
-; CHECK: movz w0, #0x5
+; CHECK: mov w0, #5
ret i32 5
}
define i64 @movz_3movk() nounwind {
; CHECK-LABEL: movz_3movk:
-; CHECK: movz x0, #0x5, lsl #48
-; CHECK-NEXT: movk x0, #0x1234, lsl #32
-; CHECK-NEXT: movk x0, #0xabcd, lsl #16
-; CHECK-NEXT: movk x0, #0x5678
+; CHECK: mov x0, #1407374883553280
+; CHECK-NEXT: movk x0, #4660, lsl #32
+; CHECK-NEXT: movk x0, #43981, lsl #16
+; CHECK-NEXT: movk x0, #22136
ret i64 1427392313513592
}
define i64 @movz_movk_skip1() nounwind {
; CHECK-LABEL: movz_movk_skip1:
-; CHECK: movz x0, #0x5, lsl #32
-; CHECK-NEXT: movk x0, #0x4321, lsl #16
+; CHECK: mov x0, #21474836480
+; CHECK-NEXT: movk x0, #17185, lsl #16
ret i64 22601072640
}
define i64 @movz_skip1_movk() nounwind {
; CHECK-LABEL: movz_skip1_movk:
-; CHECK: movz x0, #0x8654, lsl #32
-; CHECK-NEXT: movk x0, #0x1234
+; CHECK: mov x0, #147695335374848
+; CHECK-NEXT: movk x0, #4660
ret i64 147695335379508
}
@@ -78,14 +78,14 @@ define i64 @movz_skip1_movk() nounwind {
define i64 @movn() nounwind {
; CHECK-LABEL: movn:
-; CHECK: movn x0, #0x29
+; CHECK: mov x0, #-42
ret i64 -42
}
define i64 @movn_skip1_movk() nounwind {
; CHECK-LABEL: movn_skip1_movk:
-; CHECK: movn x0, #0x29, lsl #32
-; CHECK-NEXT: movk x0, #0x1234
+; CHECK: mov x0, #-176093659137
+; CHECK-NEXT: movk x0, #4660
ret i64 -176093720012
}
@@ -96,107 +96,107 @@ define i64 @movn_skip1_movk() nounwind {
define i64 @orr_movk1() nounwind {
; CHECK-LABEL: orr_movk1:
-; CHECK: orr x0, xzr, #0xffff0000ffff0
-; CHECK: movk x0, #0xdead, lsl #16
+; CHECK: mov x0, #72056494543077120
+; CHECK: movk x0, #57005, lsl #16
ret i64 72056498262245120
}
define i64 @orr_movk2() nounwind {
; CHECK-LABEL: orr_movk2:
-; CHECK: orr x0, xzr, #0xffff0000ffff0
-; CHECK: movk x0, #0xdead, lsl #48
+; CHECK: mov x0, #72056494543077120
+; CHECK: movk x0, #57005, lsl #48
ret i64 -2400982650836746496
}
define i64 @orr_movk3() nounwind {
; CHECK-LABEL: orr_movk3:
-; CHECK: orr x0, xzr, #0xffff0000ffff0
-; CHECK: movk x0, #0xdead, lsl #32
+; CHECK: mov x0, #72056494543077120
+; CHECK: movk x0, #57005, lsl #32
ret i64 72020953688702720
}
define i64 @orr_movk4() nounwind {
; CHECK-LABEL: orr_movk4:
-; CHECK: orr x0, xzr, #0xffff0000ffff0
-; CHECK: movk x0, #0xdead
+; CHECK: mov x0, #72056494543077120
+; CHECK: movk x0, #57005
ret i64 72056494543068845
}
; rdar://14987618
define i64 @orr_movk5() nounwind {
; CHECK-LABEL: orr_movk5:
-; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
-; CHECK: movk x0, #0xdead, lsl #16
+; CHECK: mov x0, #-71777214294589696
+; CHECK: movk x0, #57005, lsl #16
ret i64 -71777214836900096
}
define i64 @orr_movk6() nounwind {
; CHECK-LABEL: orr_movk6:
-; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
-; CHECK: movk x0, #0xdead, lsl #16
-; CHECK: movk x0, #0xdead, lsl #48
+; CHECK: mov x0, #-71777214294589696
+; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #57005, lsl #48
ret i64 -2400982647117578496
}
define i64 @orr_movk7() nounwind {
; CHECK-LABEL: orr_movk7:
-; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
-; CHECK: movk x0, #0xdead, lsl #48
+; CHECK: mov x0, #-71777214294589696
+; CHECK: movk x0, #57005, lsl #48
ret i64 -2400982646575268096
}
define i64 @orr_movk8() nounwind {
; CHECK-LABEL: orr_movk8:
-; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
-; CHECK: movk x0, #0xdead
-; CHECK: movk x0, #0xdead, lsl #48
+; CHECK: mov x0, #-71777214294589696
+; CHECK: movk x0, #57005
+; CHECK: movk x0, #57005, lsl #48
ret i64 -2400982646575276371
}
; rdar://14987715
define i64 @orr_movk9() nounwind {
; CHECK-LABEL: orr_movk9:
-; CHECK: orr x0, xzr, #0xffffff000000000
-; CHECK: movk x0, #0xff00
-; CHECK: movk x0, #0xdead, lsl #16
+; CHECK: mov x0, #1152921435887370240
+; CHECK: movk x0, #65280
+; CHECK: movk x0, #57005, lsl #16
ret i64 1152921439623315200
}
define i64 @orr_movk10() nounwind {
; CHECK-LABEL: orr_movk10:
-; CHECK: orr x0, xzr, #0xfffffffffffff00
-; CHECK: movk x0, #0xdead, lsl #16
+; CHECK: mov x0, #1152921504606846720
+; CHECK: movk x0, #57005, lsl #16
ret i64 1152921504047824640
}
define i64 @orr_movk11() nounwind {
; CHECK-LABEL: orr_movk11:
-; CHECK: orr x0, xzr, #0xfff00000000000ff
-; CHECK: movk x0, #0xdead, lsl #16
-; CHECK: movk x0, #0xffff, lsl #32
+; CHECK: mov x0, #-4503599627370241
+; CHECK: movk x0, #57005, lsl #16
+; CHECK: movk x0, #65535, lsl #32
ret i64 -4222125209747201
}
define i64 @orr_movk12() nounwind {
; CHECK-LABEL: orr_movk12:
-; CHECK: orr x0, xzr, #0xfff00000000000ff
-; CHECK: movk x0, #0xdead, lsl #32
+; CHECK: mov x0, #-4503599627370241
+; CHECK: movk x0, #57005, lsl #32
ret i64 -4258765016661761
}
define i64 @orr_movk13() nounwind {
; CHECK-LABEL: orr_movk13:
-; CHECK: orr x0, xzr, #0xfffff000000
-; CHECK: movk x0, #0xdead
-; CHECK: movk x0, #0xdead, lsl #48
+; CHECK: mov x0, #17592169267200
+; CHECK: movk x0, #57005
+; CHECK: movk x0, #57005, lsl #48
ret i64 -2401245434149282131
}
; rdar://13944082
define i64 @g() nounwind {
; CHECK-LABEL: g:
-; CHECK: movz x0, #0xffff, lsl #48
-; CHECK: movk x0, #0x2
+; CHECK: mov x0, #-281474976710656
+; CHECK: movk x0, #2
entry:
ret i64 -281474976710654
}
diff --git a/test/CodeGen/AArch64/arm64-mul.ll b/test/CodeGen/AArch64/arm64-mul.ll
index 2e7986d67d9e..a424dc761bc8 100644
--- a/test/CodeGen/AArch64/arm64-mul.ll
+++ b/test/CodeGen/AArch64/arm64-mul.ll
@@ -88,3 +88,65 @@ entry:
%tmp4 = sub i64 0, %tmp3
ret i64 %tmp4
}
+
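+; Check that multiplying a zero-extended i32 by a constant that fits in 32 bits
+; uses umull.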
+define i64 @t9(i32 %a) nounwind {
+entry:
+; CHECK-LABEL: t9:
+; CHECK: umull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = zext i32 %a to i64
+ %tmp2 = mul i64 %tmp1, 139968
+ ret i64 %tmp2
+}
+
+; Check 64-bit multiplication is used for constants > 32 bits.
+define i64 @t10(i32 %a) nounwind {
+entry:
+; CHECK-LABEL: t10:
+; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = sext i32 %a to i64
+ %tmp2 = mul i64 %tmp1, 2147483650 ; = 2^31 + 2
+ ret i64 %tmp2
+}
+
+; Check the sext_inreg case.
+define i64 @t11(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: t11:
+; CHECK: smnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = trunc i64 %a to i32
+ %tmp2 = sext i32 %tmp1 to i64
+ %tmp3 = mul i64 %tmp2, -2395238
+ %tmp4 = sub i64 0, %tmp3
+ ret i64 %tmp4
+}
+
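+; Check that multiply-accumulate of values extended from 32 bits selects the
+; widening forms (smaddl, umsubl, smsubl).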
+define i64 @t12(i64 %a, i64 %b) nounwind {
+entry:
+; CHECK-LABEL: t12:
+; CHECK: smaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = trunc i64 %a to i32
+ %tmp2 = sext i32 %tmp1 to i64
+ %tmp3 = mul i64 %tmp2, -34567890
+ %tmp4 = add i64 %b, %tmp3
+ ret i64 %tmp4
+}
+
+define i64 @t13(i32 %a, i64 %b) nounwind {
+entry:
+; CHECK-LABEL: t13:
+; CHECK: umsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = zext i32 %a to i64
+ %tmp3 = mul i64 %tmp1, 12345678
+ %tmp4 = sub i64 %b, %tmp3
+ ret i64 %tmp4
+}
+
+define i64 @t14(i32 %a, i64 %b) nounwind {
+entry:
+; CHECK-LABEL: t14:
+; CHECK: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = sext i32 %a to i64
+ %tmp3 = mul i64 %tmp1, -12345678
+ %tmp4 = sub i64 %b, %tmp3
+ ret i64 %tmp4
+}
diff --git a/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
index 5276ac334a71..be5b7e9b2966 100644
--- a/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
+++ b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE
-; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=BE
+; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs -enable-narrow-ld-merge=true | FileCheck %s --check-prefix=CHECK --check-prefix=LE
+; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs -enable-narrow-ld-merge=true | FileCheck %s --check-prefix=CHECK --check-prefix=BE
+; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=kryo -verify-machineinstrs -enable-narrow-ld-merge=true | FileCheck %s --check-prefix=CHECK --check-prefix=LE
; CHECK-LABEL: Ldrh_merge
; CHECK-NOT: ldrh
@@ -352,6 +353,56 @@ entry:
ret void
}
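+; Check that adjacent 32-bit zero stores are merged into paired or wider
+; wzr/xzr stores.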
+; CHECK-LABEL: Strw_zero
+; CHECK: str xzr
+define void @Strw_zero(i32* nocapture %P, i32 %n) {
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+ store i32 0, i32* %arrayidx
+ %add = add nsw i32 %n, 1
+ %idxprom1 = sext i32 %add to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1
+ store i32 0, i32* %arrayidx2
+ ret void
+}
+
+; CHECK-LABEL: Strw_zero_nonzero
+; CHECK: stp wzr, w1
+define void @Strw_zero_nonzero(i32* nocapture %P, i32 %n) {
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+ store i32 0, i32* %arrayidx
+ %add = add nsw i32 %n, 1
+ %idxprom1 = sext i32 %add to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1
+ store i32 %n, i32* %arrayidx2
+ ret void
+}
+
+; CHECK-LABEL: Strw_zero_4
+; CHECK: stp xzr
+define void @Strw_zero_4(i32* nocapture %P, i32 %n) {
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+ store i32 0, i32* %arrayidx
+ %add = add nsw i32 %n, 1
+ %idxprom1 = sext i32 %add to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1
+ store i32 0, i32* %arrayidx2
+ %add3 = add nsw i32 %n, 2
+ %idxprom4 = sext i32 %add3 to i64
+ %arrayidx5 = getelementptr inbounds i32, i32* %P, i64 %idxprom4
+ store i32 0, i32* %arrayidx5
+ %add6 = add nsw i32 %n, 3
+ %idxprom7 = sext i32 %add6 to i64
+ %arrayidx8 = getelementptr inbounds i32, i32* %P, i64 %idxprom7
+ store i32 0, i32* %arrayidx8
+ ret void
+}
+
; CHECK-LABEL: Sturb_zero
; CHECK: sturh wzr
define void @Sturb_zero(i8* nocapture %P, i32 %n) #0 {
@@ -404,3 +455,42 @@ entry:
store i16 0, i16* %arrayidx9
ret void
}
+
+; CHECK-LABEL: Sturw_zero
+; CHECK: stur xzr
+define void @Sturw_zero(i32* nocapture %P, i32 %n) {
+entry:
+ %sub = add nsw i32 %n, -3
+ %idxprom = sext i32 %sub to i64
+ %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+ store i32 0, i32* %arrayidx
+ %sub1 = add nsw i32 %n, -4
+ %idxprom2 = sext i32 %sub1 to i64
+ %arrayidx3 = getelementptr inbounds i32, i32* %P, i64 %idxprom2
+ store i32 0, i32* %arrayidx3
+ ret void
+}
+
+; CHECK-LABEL: Sturw_zero_4
+; CHECK: stp xzr, xzr
+define void @Sturw_zero_4(i32* nocapture %P, i32 %n) {
+entry:
+ %sub = add nsw i32 %n, -3
+ %idxprom = sext i32 %sub to i64
+ %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+ store i32 0, i32* %arrayidx
+ %sub1 = add nsw i32 %n, -4
+ %idxprom2 = sext i32 %sub1 to i64
+ %arrayidx3 = getelementptr inbounds i32, i32* %P, i64 %idxprom2
+ store i32 0, i32* %arrayidx3
+ %sub4 = add nsw i32 %n, -2
+ %idxprom5 = sext i32 %sub4 to i64
+ %arrayidx6 = getelementptr inbounds i32, i32* %P, i64 %idxprom5
+ store i32 0, i32* %arrayidx6
+ %sub7 = add nsw i32 %n, -1
+ %idxprom8 = sext i32 %sub7 to i64
+ %arrayidx9 = getelementptr inbounds i32, i32* %P, i64 %idxprom8
+ store i32 0, i32* %arrayidx9
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-neon-2velem-high.ll b/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
index 3ff1e61d0298..575acf723753 100644
--- a/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
@@ -18,7 +18,7 @@ entry:
define <4 x i32> @test_vmull_high_n_s16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_s16_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -42,7 +42,7 @@ entry:
define <2 x i64> @test_vmull_high_n_s32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_s32_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #0x1, msl #8
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
@@ -68,7 +68,7 @@ entry:
define <4 x i32> @test_vmull_high_n_u16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_u16_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #0x11, lsl #8
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -92,7 +92,7 @@ entry:
define <2 x i64> @test_vmull_high_n_u32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_u32_imm:
-; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].4s, #0x1, msl #8
+; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
@@ -118,7 +118,7 @@ entry:
define <4 x i32> @test_vqdmull_high_n_s16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s16_imm:
-; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].8h, #0x11, lsl #8
+; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -142,7 +142,7 @@ entry:
define <2 x i64> @test_vqdmull_high_n_s32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s32_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
@@ -169,7 +169,7 @@ entry:
define <4 x i32> @test_vmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_s16_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -195,7 +195,7 @@ entry:
define <2 x i64> @test_vmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_s32_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
@@ -223,7 +223,7 @@ entry:
define <4 x i32> @test_vmlal_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_u16_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -249,7 +249,7 @@ entry:
define <2 x i64> @test_vmlal_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_u32_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
@@ -277,7 +277,7 @@ entry:
define <4 x i32> @test_vqdmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s16_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -303,7 +303,7 @@ entry:
define <2 x i64> @test_vqdmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s32_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
@@ -331,7 +331,7 @@ entry:
define <4 x i32> @test_vmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s16_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -357,7 +357,7 @@ entry:
define <2 x i64> @test_vmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s32_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
@@ -385,7 +385,7 @@ entry:
define <4 x i32> @test_vmlsl_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u16_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -411,7 +411,7 @@ entry:
define <2 x i64> @test_vmlsl_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u32_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
@@ -439,7 +439,7 @@ entry:
define <4 x i32> @test_vqdmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s16_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
@@ -465,7 +465,7 @@ entry:
define <2 x i64> @test_vqdmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s32_imm:
-; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #0x1d
+; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index 83b1cac70f5c..e91a1a42c233 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -902,6 +902,43 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
ret <8 x i8> %vecinit14
}
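+; Extracts/inserts with a variable lane index are lowered through a stack slot;
+; check for the expected spill and lane-by-lane ins sequence.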
+; CHECK-LABEL: test_extracts_inserts_varidx_extract:
+; CHECK: str q0
+; CHECK: add x[[PTR:[0-9]+]], {{.*}}, w0, sxtw #1
+; CHECK-DAG: ld1 { v[[R:[0-9]+]].h }[0], [x[[PTR]]]
+; CHECK-DAG: ins v[[R]].h[1], v0.h[1]
+; CHECK-DAG: ins v[[R]].h[2], v0.h[2]
+; CHECK-DAG: ins v[[R]].h[3], v0.h[3]
+define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
+ %tmp = extractelement <8 x i16> %x, i32 %idx
+ %tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 0
+ %tmp3 = extractelement <8 x i16> %x, i32 1
+ %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 1
+ %tmp5 = extractelement <8 x i16> %x, i32 2
+ %tmp6 = insertelement <4 x i16> %tmp4, i16 %tmp5, i32 2
+ %tmp7 = extractelement <8 x i16> %x, i32 3
+ %tmp8 = insertelement <4 x i16> %tmp6, i16 %tmp7, i32 3
+ ret <4 x i16> %tmp8
+}
+
+; CHECK-LABEL: test_extracts_inserts_varidx_insert:
+; CHECK: str h0, [{{.*}}, w0, sxtw #1]
+; CHECK-DAG: ldr d[[R:[0-9]+]]
+; CHECK-DAG: ins v[[R]].h[1], v0.h[1]
+; CHECK-DAG: ins v[[R]].h[2], v0.h[2]
+; CHECK-DAG: ins v[[R]].h[3], v0.h[3]
+define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) {
+ %tmp = extractelement <8 x i16> %x, i32 0
+ %tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 %idx
+ %tmp3 = extractelement <8 x i16> %x, i32 1
+ %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 1
+ %tmp5 = extractelement <8 x i16> %x, i32 2
+ %tmp6 = insertelement <4 x i16> %tmp4, i16 %tmp5, i32 2
+ %tmp7 = extractelement <8 x i16> %x, i32 3
+ %tmp8 = insertelement <4 x i16> %tmp6, i16 %tmp7, i32 3
+ ret <4 x i16> %tmp8
+}
+
define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) {
; CHECK-LABEL: test_dup_v2i32_v4i16:
; CHECK: dup v0.4h, v0.h[2]
@@ -1368,7 +1405,7 @@ define <4 x i16> @concat_vector_v4i16_const() {
define <4 x i16> @concat_vector_v4i16_const_one() {
; CHECK-LABEL: concat_vector_v4i16_const_one:
-; CHECK: movi {{v[0-9]+}}.4h, #0x1
+; CHECK: movi {{v[0-9]+}}.4h, #1
%r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
@@ -1396,7 +1433,7 @@ define <8 x i16> @concat_vector_v8i16_const() {
define <8 x i16> @concat_vector_v8i16_const_one() {
; CHECK-LABEL: concat_vector_v8i16_const_one:
-; CHECK: movi {{v[0-9]+}}.8h, #0x1
+; CHECK: movi {{v[0-9]+}}.8h, #1
%r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
diff --git a/test/CodeGen/AArch64/arm64-nvcast.ll b/test/CodeGen/AArch64/arm64-nvcast.ll
index 3cb1bf25fc34..c3a1640ab012 100644
--- a/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -2,7 +2,7 @@
; CHECK-LABEL: _test:
; CHECK: fmov.2d v0, #2.00000000
-; CHECK: str q0, [sp]
+; CHECK: str q0, [sp, #-16]!
; CHECK: mov x8, sp
; CHECK: ldr s0, [x8, w1, sxtw #2]
; CHECK: str s0, [x0]
@@ -15,8 +15,8 @@ entry:
}
; CHECK-LABEL: _test2
-; CHECK: movi.16b v0, #0x3f
-; CHECK: str q0, [sp]
+; CHECK: movi.16b v0, #63
+; CHECK: str q0, [sp, #-16]!
; CHECK: mov x8, sp
; CHECK: ldr s0, [x8, w1, sxtw #2]
; CHECK: str s0, [x0]
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
index c2006ccdd064..caf4498276ce 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -1,5 +1,12 @@
-; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone < %s | FileCheck %s
; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone -fast-isel < %s | FileCheck %s --check-prefix=FAST
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone -filetype=obj -o %t %s
+; RUN: llvm-objdump -triple arm64-apple-darwin -d %t | FileCheck %s --check-prefix CHECK-ENCODING
+
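+; Check that the materialization of the patchpoint target encodes to real
+; instructions (no <unknown> in the disassembly).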
+; CHECK-ENCODING-NOT: <unknown>
+; CHECK-ENCODING: mov x16, #281470681743360
+; CHECK-ENCODING: movk x16, #57005, lsl #16
+; CHECK-ENCODING: movk x16, #48879
; One argument will be passed in register, the other will be pushed on the stack.
; Return value in x0.
@@ -7,20 +14,20 @@ define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen:
; CHECK: Ltmp
-; CHECK: str x{{.+}}, [sp, #-16]!
+; CHECK: str x{{.+}}, [sp]
; CHECK-NEXT: mov x0, x{{.+}}
; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #0xffff, lsl #32
-; CHECK-NEXT: movk x16, #0xdead, lsl #16
-; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: mov x16, #281470681743360
+; CHECK: movk x16, #57005, lsl #16
+; CHECK: movk x16, #48879
; CHECK-NEXT: blr x16
; FAST-LABEL: jscall_patchpoint_codegen:
; FAST: Ltmp
-; FAST: str x{{.+}}, [sp, #-16]!
+; FAST: str x{{.+}}, [sp]
; FAST: Ltmp
-; FAST-NEXT: movz x16, #0xffff, lsl #32
-; FAST-NEXT: movk x16, #0xdead, lsl #16
-; FAST-NEXT: movk x16, #0xbeef
+; FAST-NEXT: mov x16, #281470681743360
+; FAST-NEXT: movk x16, #57005, lsl #16
+; FAST-NEXT: movk x16, #48879
; FAST-NEXT: blr x16
%resolveCall2 = inttoptr i64 281474417671919 to i8*
%result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
@@ -41,22 +48,22 @@ entry:
; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x2
; CHECK-NEXT: str x[[REG]], [sp]
; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #0xffff, lsl #32
-; CHECK-NEXT: movk x16, #0xdead, lsl #16
-; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: mov x16, #281470681743360
+; CHECK-NEXT: movk x16, #57005, lsl #16
+; CHECK-NEXT: movk x16, #48879
; CHECK-NEXT: blr x16
; FAST-LABEL: jscall_patchpoint_codegen2:
; FAST: Ltmp
; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2
; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4
; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6
-; FAST-NEXT: str [[REG1]], [sp, #-32]!
+; FAST-NEXT: str [[REG1]], [sp]
; FAST-NEXT: str [[REG2]], [sp, #16]
; FAST-NEXT: str [[REG3]], [sp, #24]
; FAST: Ltmp
-; FAST-NEXT: movz x16, #0xffff, lsl #32
-; FAST-NEXT: movk x16, #0xdead, lsl #16
-; FAST-NEXT: movk x16, #0xbeef
+; FAST-NEXT: mov x16, #281470681743360
+; FAST-NEXT: movk x16, #57005, lsl #16
+; FAST-NEXT: movk x16, #48879
; FAST-NEXT: blr x16
%call = inttoptr i64 281474417671919 to i8*
%result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
@@ -68,7 +75,7 @@ define i64 @jscall_patchpoint_codegen3(i64 %callee) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen3:
; CHECK: Ltmp
-; CHECK: movz w[[REG:[0-9]+]], #0xa
+; CHECK: mov w[[REG:[0-9]+]], #10
; CHECK-NEXT: str x[[REG]], [sp, #48]
; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x8
; CHECK-NEXT: str w[[REG]], [sp, #36]
@@ -79,9 +86,9 @@ entry:
; CHECK-NEXT: orr w[[REG:[0-9]+]], wzr, #0x2
; CHECK-NEXT: str x[[REG]], [sp]
; CHECK: Ltmp
-; CHECK-NEXT: movz x16, #0xffff, lsl #32
-; CHECK-NEXT: movk x16, #0xdead, lsl #16
-; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: mov x16, #281470681743360
+; CHECK-NEXT: movk x16, #57005, lsl #16
+; CHECK-NEXT: movk x16, #48879
; CHECK-NEXT: blr x16
; FAST-LABEL: jscall_patchpoint_codegen3:
; FAST: Ltmp
@@ -89,16 +96,16 @@ entry:
; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4
; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6
; FAST-NEXT: orr [[REG4:w[0-9]+]], wzr, #0x8
-; FAST-NEXT: movz [[REG5:x[0-9]+]], #0xa
-; FAST-NEXT: str [[REG1]], [sp, #-64]!
+; FAST-NEXT: mov [[REG5:x[0-9]+]], #10
+; FAST-NEXT: str [[REG1]], [sp]
; FAST-NEXT: str [[REG2]], [sp, #16]
; FAST-NEXT: str [[REG3]], [sp, #24]
; FAST-NEXT: str [[REG4]], [sp, #36]
; FAST-NEXT: str [[REG5]], [sp, #48]
; FAST: Ltmp
-; FAST-NEXT: movz x16, #0xffff, lsl #32
-; FAST-NEXT: movk x16, #0xdead, lsl #16
-; FAST-NEXT: movk x16, #0xbeef
+; FAST-NEXT: mov x16, #281470681743360
+; FAST-NEXT: movk x16, #57005, lsl #16
+; FAST-NEXT: movk x16, #48879
; FAST-NEXT: blr x16
%call = inttoptr i64 281474417671919 to i8*
%result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
@@ -115,4 +122,3 @@ define webkit_jscc zeroext i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
-
diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
index d9ec7e50ff80..2f9004bb22e6 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -6,13 +6,13 @@
define i64 @trivial_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
; CHECK-LABEL: trivial_patchpoint_codegen:
-; CHECK: movz x16, #0xdead, lsl #32
-; CHECK-NEXT: movk x16, #0xbeef, lsl #16
-; CHECK-NEXT: movk x16, #0xcafe
+; CHECK: mov x16, #244834610708480
+; CHECK-NEXT: movk x16, #48879, lsl #16
+; CHECK-NEXT: movk x16, #51966
; CHECK-NEXT: blr x16
-; CHECK: movz x16, #0xdead, lsl #32
-; CHECK-NEXT: movk x16, #0xbeef, lsl #16
-; CHECK-NEXT: movk x16, #0xcaff
+; CHECK: mov x16, #244834610708480
+; CHECK-NEXT: movk x16, #48879, lsl #16
+; CHECK-NEXT: movk x16, #51967
; CHECK-NEXT: blr x16
; CHECK: ret
%resolveCall2 = inttoptr i64 244837814094590 to i8*
@@ -26,10 +26,11 @@ entry:
; as a leaf function.
;
; CHECK-LABEL: caller_meta_leaf
-; CHECK: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #32
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
; CHECK: Ltmp
-; CHECK: mov sp, x29
+; CHECK: add sp, sp, #48
; CHECK: ret
define void @caller_meta_leaf() {
diff --git a/test/CodeGen/AArch64/arm64-register-pairing.ll b/test/CodeGen/AArch64/arm64-register-pairing.ll
index 99defb1aad7c..eac7e5cb3363 100644
--- a/test/CodeGen/AArch64/arm64-register-pairing.ll
+++ b/test/CodeGen/AArch64/arm64-register-pairing.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck -check-prefix CHECK-NOTMACHO %s
;
; rdar://14075006
@@ -13,7 +14,7 @@ define void @odd() nounwind {
; CHECK: stp x24, x23, [sp, #96]
; CHECK: stp x22, x21, [sp, #112]
; CHECK: stp x20, x19, [sp, #128]
-; CHECK: movz x0, #0x2a
+; CHECK: mov x0, #42
; CHECK: ldp x20, x19, [sp, #128]
; CHECK: ldp x22, x21, [sp, #112]
; CHECK: ldp x24, x23, [sp, #96]
@@ -23,6 +24,19 @@ define void @odd() nounwind {
; CHECK: ldp d11, d10, [sp, #32]
; CHECK: ldp d13, d12, [sp, #16]
; CHECK: ldp d15, d14, [sp], #144
+
+; CHECK-NOTMACHO-LABEL: odd:
+; CHECK-NOTMACHO: stp d14, d12, [sp, #-80]!
+; CHECK-NOTMACHO: stp d10, d8, [sp, #16]
+; CHECK-NOTMACHO: str x27, [sp, #32]
+; CHECK-NOTMACHO: stp x25, x23, [sp, #48]
+; CHECK-NOTMACHO: stp x21, x19, [sp, #64]
+; CHECK-NOTMACHO: mov x0, #42
+; CHECK-NOTMACHO: ldp x21, x19, [sp, #64]
+; CHECK-NOTMACHO: ldp x25, x23, [sp, #48]
+; CHECK-NOTMACHO: ldr x27, [sp, #32]
+; CHECK-NOTMACHO: ldp d10, d8, [sp, #16]
+; CHECK-NOTMACHO: ldp d14, d12, [sp], #80
call void asm sideeffect "mov x0, #42", "~{x0},~{x19},~{x21},~{x23},~{x25},~{x27},~{d8},~{d10},~{d12},~{d14}"() nounwind
ret void
}
@@ -38,7 +52,7 @@ define void @even() nounwind {
; CHECK: stp x24, x23, [sp, #96]
; CHECK: stp x22, x21, [sp, #112]
; CHECK: stp x20, x19, [sp, #128]
-; CHECK: movz x0, #0x2a
+; CHECK: mov x0, #42
; CHECK: ldp x20, x19, [sp, #128]
; CHECK: ldp x22, x21, [sp, #112]
; CHECK: ldp x24, x23, [sp, #96]
@@ -48,6 +62,19 @@ define void @even() nounwind {
; CHECK: ldp d11, d10, [sp, #32]
; CHECK: ldp d13, d12, [sp, #16]
; CHECK: ldp d15, d14, [sp], #144
+
+; CHECK-NOTMACHO-LABEL: even:
+; CHECK-NOTMACHO: stp d15, d13, [sp, #-80]!
+; CHECK-NOTMACHO: stp d11, d9, [sp, #16]
+; CHECK-NOTMACHO: str x28, [sp, #32]
+; CHECK-NOTMACHO: stp x26, x24, [sp, #48]
+; CHECK-NOTMACHO: stp x22, x20, [sp, #64]
+; CHECK-NOTMACHO: mov x0, #42
+; CHECK-NOTMACHO: ldp x22, x20, [sp, #64]
+; CHECK-NOTMACHO: ldp x26, x24, [sp, #48]
+; CHECK-NOTMACHO: ldr x28, [sp, #32]
+; CHECK-NOTMACHO: ldp d11, d9, [sp, #16]
+; CHECK-NOTMACHO: ldp d15, d13, [sp], #80
call void asm sideeffect "mov x0, #42", "~{x0},~{x20},~{x22},~{x24},~{x26},~{x28},~{d9},~{d11},~{d13},~{d15}"() nounwind
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir b/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir
new file mode 100644
index 000000000000..3948c0457bcd
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir
@@ -0,0 +1,42 @@
+# RUN: rm -f %S/arm64-regress-opt-cmp.s
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s 2>&1 | FileCheck %s
+# CHECK: %1 = ANDWri {{.*}}
+# CHECK-NEXT: %wzr = SUBSWri {{.*}}
+--- |
+ define i32 @test01() nounwind {
+ entry:
+ %0 = select i1 true, i32 1, i32 0
+ %1 = and i32 %0, 65535
+ %2 = icmp ugt i32 %1, 0
+ br i1 %2, label %if.then, label %if.end
+
+ if.then: ; preds = %entry
+ ret i32 1
+
+ if.end: ; preds = %entry
+ ret i32 0
+ }
+...
+---
+name: test01
+registers:
+ - { id: 0, class: gpr32 }
+ - { id: 1, class: gpr32common }
+body: |
+ bb.0.entry:
+ successors: %bb.2.if.end, %bb.1.if.then
+
+ %0 = MOVi32imm 1
+ %1 = ANDWri killed %1, 15
+ %wzr = SUBSWri killed %1, 0, 0, implicit-def %nzcv
+ Bcc 9, %bb.2.if.end, implicit %nzcv
+
+ bb.1.if.then:
+ %w0 = MOVi32imm 1
+ RET_ReallyLR implicit %w0
+
+ bb.2.if.end:
+ %w0 = MOVi32imm 0
+ RET_ReallyLR implicit %w0
+
+...
diff --git a/test/CodeGen/AArch64/arm64-rev.ll b/test/CodeGen/AArch64/arm64-rev.ll
index 74356d76d3c8..4980d7e3b275 100644
--- a/test/CodeGen/AArch64/arm64-rev.ll
+++ b/test/CodeGen/AArch64/arm64-rev.ll
@@ -16,6 +16,33 @@ entry:
ret i64 %0
}
+; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16 bits
+; of %a are zero. This optimizes rev + lsr 16 to rev16.
+define i32 @test_rev_w_srl16(i16 %a) {
+entry:
+; CHECK-LABEL: test_rev_w_srl16:
+; CHECK: and [[REG:w[0-9]+]], w0, #0xffff
+; CHECK: rev16 w0, [[REG]]
+; CHECK-NOT: lsr
+ %0 = zext i16 %a to i32
+ %1 = tail call i32 @llvm.bswap.i32(i32 %0)
+ %2 = lshr i32 %1, 16
+ ret i32 %2
+}
+
+; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32 bits
+; of %a are zero. This optimizes rev + lsr 32 to rev32.
+define i64 @test_rev_x_srl32(i32 %a) {
+entry:
+; CHECK-LABEL: test_rev_x_srl32:
+; CHECK: rev32 x0, {{x[0-9]+}}
+; CHECK-NOT: lsr
+ %0 = zext i32 %a to i64
+ %1 = tail call i64 @llvm.bswap.i64(i64 %0)
+ %2 = lshr i64 %1, 32
+ ret i64 %2
+}
+
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
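
For context, here is a rough C-level sketch of the source pattern the two new tests above exercise. The function names are illustrative only; the sketch assumes the GCC/Clang builtins __builtin_bswap32/__builtin_bswap64, which Clang lowers to the llvm.bswap intrinsics used in the IR.

  #include <stdint.h>

  uint32_t bswap_srl16(uint16_t a) {
    /* The argument is zero-extended, so only the low 16 bits can be set and
       the byte swap plus shift collapses to a single rev16. */
    return __builtin_bswap32((uint32_t)a) >> 16;
  }

  uint64_t bswap_srl32(uint32_t a) {
    /* Same idea for the 64-bit case: collapses to a single rev32. */
    return __builtin_bswap64((uint64_t)a) >> 32;
  }
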
diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index 4d751f501d4a..16ae7ef8e1b7 100644
--- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -1,5 +1,5 @@
-; RUN: llc %s -o - -enable-shrink-wrap=true -disable-post-ra | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
-; RUN: llc %s -o - -enable-shrink-wrap=false -disable-post-ra | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: llc %s -o - -enable-shrink-wrap=true -disable-post-ra -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false -disable-post-ra -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios"
@@ -13,9 +13,9 @@ target triple = "arm64-apple-ios"
; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]]
;
; Prologue code.
-; CHECK: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[SAVE_SP]], sp
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16]
+; CHECK-NEXT: add [[SAVE_SP]], sp, #16
;
; Compare the arguments and jump to exit.
; After the prologue is set.
@@ -29,12 +29,12 @@ target triple = "arm64-apple-ios"
; Set the first argument to zero.
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: bl _doSomething
-;
+;
; Without shrink-wrapping, epilogue is in the exit block.
; DISABLE: [[EXIT_LABEL]]:
; Epilogue code.
-; CHECK-NEXT: mov sp, [[SAVE_SP]]
-; CHECK-NEXT: ldp [[SAVE_SP]], [[CSR]], [sp], #16
+; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
;
; With shrink-wrapping, exit block is a simple return.
; ENABLE: [[EXIT_LABEL]]:
@@ -73,7 +73,7 @@ declare i32 @doSomething(i32, i32*)
; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
;
; CHECK: mov [[SUM:w[0-9]+]], wzr
-; CHECK-NEXT: movz [[IV:w[0-9]+]], #0xa
+; CHECK-NEXT: mov [[IV:w[0-9]+]], #10
;
; Next BB.
; CHECK: [[LOOP:LBB[0-9_]+]]: ; %for.body
@@ -140,7 +140,7 @@ declare i32 @something(...)
; CHECK-NEXT: stp [[CSR3:x[0-9]+]], [[CSR4:x[0-9]+]], [sp, #16]
; CHECK-NEXT: add [[NEW_SP:x[0-9]+]], sp, #16
; CHECK: mov [[SUM:w[0-9]+]], wzr
-; CHECK-NEXT: movz [[IV:w[0-9]+]], #0xa
+; CHECK-NEXT: mov [[IV:w[0-9]+]], #10
; Next BB.
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
@@ -184,7 +184,7 @@ for.end: ; preds = %for.body
; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
;
; CHECK: mov [[SUM:w[0-9]+]], wzr
-; CHECK-NEXT: movz [[IV:w[0-9]+]], #0xa
+; CHECK-NEXT: mov [[IV:w[0-9]+]], #10
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
@@ -255,7 +255,7 @@ declare void @somethingElse(...)
;
; CHECK: bl _somethingElse
; CHECK-NEXT: mov [[SUM:w[0-9]+]], wzr
-; CHECK-NEXT: movz [[IV:w[0-9]+]], #0xa
+; CHECK-NEXT: mov [[IV:w[0-9]+]], #10
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
@@ -332,11 +332,11 @@ entry:
; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
;
; Sum is merged with the returned register.
-; CHECK: mov [[SUM:w0]], wzr
-; CHECK-NEXT: add [[VA_BASE:x[0-9]+]], sp, #16
+; CHECK: add [[VA_BASE:x[0-9]+]], sp, #16
; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
; CHECK-NEXT: cmp w1, #1
; CHECK-NEXT: b.lt [[IFEND_LABEL:LBB[0-9_]+]]
+; CHECK: mov [[SUM:w0]], wzr
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: ldr [[VA_ADDR:x[0-9]+]], [sp, #8]
@@ -347,18 +347,18 @@ entry:
; CHECK-NEXT: sub w1, w1, #1
; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
;
-; DISABLE-NEXT: b [[IFEND_LABEL]]
+; DISABLE-NEXT: b
; DISABLE: [[ELSE_LABEL]]: ; %if.else
; DISABLE: lsl w0, w1, #1
;
+; ENABLE: [[ELSE_LABEL]]: ; %if.else
+; ENABLE: lsl w0, w1, #1
+; ENABLE-NEXT: ret
+;
; CHECK: [[IFEND_LABEL]]:
; Epilogue code.
; CHECK: add sp, sp, #16
; CHECK-NEXT: ret
-;
-; ENABLE: [[ELSE_LABEL]]: ; %if.else
-; ENABLE: lsl w0, w1, #1
-; ENABLE-NEXT: ret
define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
entry:
%ap = alloca i8*, align 8
@@ -409,7 +409,7 @@ declare void @llvm.va_end(i8*)
;
; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
;
-; CHECK: movz [[IV:w[0-9]+]], #0xa
+; CHECK: mov [[IV:w[0-9]+]], #10
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; Inline asm statement.
@@ -454,9 +454,9 @@ if.end: ; preds = %for.body, %if.else
; ENABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
;
; Prologue code.
-; CHECK: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[NEW_SP:x[0-9]+]], sp
-; CHECK-NEXT: sub sp, sp, #48
+; CHECK: sub sp, sp, #64
+; CHECK-NEXT: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #48]
+; CHECK-NEXT: add [[NEW_SP:x[0-9]+]], sp, #48
;
; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
; Setup of the varags.
@@ -473,8 +473,8 @@ if.end: ; preds = %for.body, %if.else
; DISABLE: [[IFEND_LABEL]]: ; %if.end
;
; Epilogue code.
-; CHECK: mov sp, [[NEW_SP]]
-; CHECK-NEXT: ldp [[CSR1]], [[CSR2]], [sp], #16
+; CHECK: ldp [[CSR1]], [[CSR2]], [sp, #48]
+; CHECK-NEXT: add sp, sp, #64
; CHECK-NEXT: ret
;
; ENABLE: [[ELSE_LABEL]]: ; %if.else
@@ -511,7 +511,7 @@ declare i32 @someVariadicFunc(i32, ...)
; CHECK: and [[TEST:w[0-9]+]], w0, #0xff
; CHECK-NEXT: cbnz [[TEST]], [[ABORT:LBB[0-9_]+]]
;
-; CHECK: movz w0, #0x2a
+; CHECK: mov w0, #42
;
; DISABLE-NEXT: ldp
;
@@ -631,16 +631,20 @@ end:
ret void
}
-; Don't do shrink-wrapping when we need to re-align the stack pointer.
-; See bug 26642.
+; Re-aligned stack pointer. See bug 26642. Avoid clobbering live
+; values in the prologue when re-aligning the stack pointer.
; CHECK-LABEL: stack_realign:
-; CHECK-NOT: lsl w[[LSL1:[0-9]+]], w0, w1
-; CHECK-NOT: lsl w[[LSL2:[0-9]+]], w1, w0
+; ENABLE-DAG: lsl w[[LSL1:[0-9]+]], w0, w1
+; ENABLE-DAG: lsl w[[LSL2:[0-9]+]], w1, w0
+; DISABLE-NOT: lsl w[[LSL1:[0-9]+]], w0, w1
+; DISABLE-NOT: lsl w[[LSL2:[0-9]+]], w1, w0
; CHECK: stp x29, x30, [sp, #-16]!
; CHECK: mov x29, sp
-; CHECK: sub x{{[0-9]+}}, sp, #16
-; CHECK-DAG: lsl w[[LSL1:[0-9]+]], w0, w1
-; CHECK-DAG: lsl w[[LSL2:[0-9]+]], w1, w0
+; ENABLE-NOT: sub x[[LSL1]], sp, #16
+; ENABLE-NOT: sub x[[LSL2]], sp, #16
+; DISABLE: sub x{{[0-9]+}}, sp, #16
+; DISABLE-DAG: lsl w[[LSL1:[0-9]+]], w0, w1
+; DISABLE-DAG: lsl w[[LSL2:[0-9]+]], w1, w0
; CHECK-DAG: str w[[LSL1]],
; CHECK-DAG: str w[[LSL2]],
diff --git a/test/CodeGen/AArch64/arm64-stp-aa.ll b/test/CodeGen/AArch64/arm64-stp-aa.ll
index 82d343d976b5..2a45745fedb5 100644
--- a/test/CodeGen/AArch64/arm64-stp-aa.ll
+++ b/test/CodeGen/AArch64/arm64-stp-aa.ll
@@ -109,3 +109,37 @@ define double @stp_double_aa_after(double %d0, double %a, double %b, double* noc
store double %b, double* %add.ptr, align 8
ret double %tmp
}
+
+; Check that the stores %c and %d are paired after the fadd instruction,
+; and then the stores %a and %b are paired after proving that they do not
+; depend on the (%c, %d) pair.
+;
+; CHECK-LABEL: st1:
+; CHECK: stp q0, q1, [x{{[0-9]+}}]
+; CHECK: fadd
+; CHECK: stp q2, q0, [x{{[0-9]+}}, #32]
+define void @st1(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %base, i64 %index) {
+entry:
+ %a0 = getelementptr inbounds float, float* %base, i64 %index
+ %b0 = getelementptr float, float* %a0, i64 4
+ %c0 = getelementptr float, float* %a0, i64 8
+ %d0 = getelementptr float, float* %a0, i64 12
+
+ %a1 = bitcast float* %a0 to <4 x float>*
+ %b1 = bitcast float* %b0 to <4 x float>*
+ %c1 = bitcast float* %c0 to <4 x float>*
+ %d1 = bitcast float* %d0 to <4 x float>*
+
+ store <4 x float> %c, <4 x float> * %c1, align 4
+ store <4 x float> %a, <4 x float> * %a1, align 4
+
+ ; This fadd forces the compiler to pair %c and %e after fadd, and leave the
+ ; stores %a and %b separated by a stp. The dependence analysis then needs to
+ ; prove that it is safe to move %b past the stp to be paired with %a.
+ %e = fadd fast <4 x float> %d, %a
+
+ store <4 x float> %e, <4 x float>* %d1, align 4
+ store <4 x float> %b, <4 x float>* %b1, align 4
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-stp.ll b/test/CodeGen/AArch64/arm64-stp.ll
index 98242d0bb57e..5664c7d118c3 100644
--- a/test/CodeGen/AArch64/arm64-stp.ll
+++ b/test/CodeGen/AArch64/arm64-stp.ll
@@ -100,9 +100,9 @@ entry:
; Read of %b to compute %tmp2 shouldn't prevent formation of stp
; CHECK-LABEL: stp_int_rar_hazard
-; CHECK: stp w0, w1, [x2]
; CHECK: ldr [[REG:w[0-9]+]], [x2, #8]
-; CHECK: add w0, [[REG]], w1
+; CHECK: add w8, [[REG]], w1
+; CHECK: stp w0, w1, [x2]
; CHECK: ret
define i32 @stp_int_rar_hazard(i32 %a, i32 %b, i32* nocapture %p) nounwind {
store i32 %a, i32* %p, align 4
diff --git a/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll b/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
index 4ab2bee0ed16..2eedde557644 100644
--- a/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
+++ b/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
@@ -4,7 +4,7 @@
; getting both the endianness wrong and the element indexing wrong.
define <8 x i16> @foo(<8 x i16> %a) nounwind readnone {
; CHECK: .section __TEXT,__literal16,16byte_literals
-; CHECK: .align 4
+; CHECK: .p2align 4
; CHECK:lCPI0_0:
; CHECK: .byte 0 ; 0x0
; CHECK: .byte 1 ; 0x1
@@ -24,7 +24,7 @@ define <8 x i16> @foo(<8 x i16> %a) nounwind readnone {
; CHECK: .byte 9 ; 0x9
; CHECK: .section __TEXT,__text,regular,pure_instructions
; CHECK: .globl _foo
-; CHECK: .align 2
+; CHECK: .p2align 2
; CHECK:_foo: ; @foo
; CHECK: adrp [[BASE:x[0-9]+]], lCPI0_0@PAGE
; CHECK: ldr q[[REG:[0-9]+]], {{\[}}[[BASE]], lCPI0_0@PAGEOFF]
diff --git a/test/CodeGen/AArch64/arm64-this-return.ll b/test/CodeGen/AArch64/arm64-this-return.ll
index 3be1a69237d7..9fc68f476b77 100644
--- a/test/CodeGen/AArch64/arm64-this-return.ll
+++ b/test/CodeGen/AArch64/arm64-this-return.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-this-return-forwarding | FileCheck %s
%struct.A = type { i8 }
%struct.B = type { i32 }
diff --git a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
index c95eca062ff6..bb9ad46ba63d 100644
--- a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
+++ b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
@@ -40,7 +40,7 @@ define i32 @test_emulated_init() {
; EMU-NOT: __emutls_v.general_dynamic_var:
-; EMU: .align 3
+; EMU: .p2align 3
; EMU-LABEL: __emutls_v.emulated_init_var:
; EMU-NEXT: .xword 4
; EMU-NEXT: .xword 8
diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
index 8702b41023d0..16ddf690fe95 100644
--- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -32,7 +32,7 @@ define void @test_simple(i32 %n, ...) {
; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #128
; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: movn [[GR_OFFS:w[0-9]+]], #0x37
+; CHECK: mov [[GR_OFFS:w[0-9]+]], #-56
; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
; CHECK: orr [[VR_OFFS:w[0-9]+]], wzr, #0xffffff80
@@ -70,10 +70,10 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #112
; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: movn [[GR_OFFS:w[0-9]+]], #0x27
+; CHECK: mov [[GR_OFFS:w[0-9]+]], #-40
; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #0x6f
+; CHECK: mov [[VR_OFFS:w[0-9]+]], #-11
; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
%addr = bitcast %va_list* @var to i8*
diff --git a/test/CodeGen/AArch64/arm64-vclz.ll b/test/CodeGen/AArch64/arm64-vclz.ll
index cf5670a0354f..10118f0d5638 100644
--- a/test/CodeGen/AArch64/arm64-vclz.ll
+++ b/test/CodeGen/AArch64/arm64-vclz.ll
@@ -48,6 +48,18 @@ define <2 x i32> @test_vclz_s32(<2 x i32> %a) nounwind readnone ssp {
ret <2 x i32> %vclz1.i
}
+define <1 x i64> @test_vclz_u64(<1 x i64> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_u64:
+ %vclz1.i = tail call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %a, i1 false) nounwind
+ ret <1 x i64> %vclz1.i
+}
+
+define <1 x i64> @test_vclz_s64(<1 x i64> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_s64:
+ %vclz1.i = tail call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %a, i1 false) nounwind
+ ret <1 x i64> %vclz1.i
+}
+
define <16 x i8> @test_vclzq_u8(<16 x i8> %a) nounwind readnone ssp {
; CHECK-LABEL: test_vclzq_u8:
; CHECK: clz.16b v0, v0
@@ -96,12 +108,28 @@ define <4 x i32> @test_vclzq_s32(<4 x i32> %a) nounwind readnone ssp {
ret <4 x i32> %vclz1.i
}
+define <2 x i64> @test_vclzq_u64(<2 x i64> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_u64:
+ %vclz1.i = tail call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) nounwind
+ ret <2 x i64> %vclz1.i
+}
+
+define <2 x i64> @test_vclzq_s64(<2 x i64> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_s64:
+ %vclz1.i = tail call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) nounwind
+ ret <2 x i64> %vclz1.i
+}
+
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
+
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
+declare <1 x i64> @llvm.ctlz.v1i64(<1 x i64>, i1) nounwind readnone
+
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vecCmpBr.ll b/test/CodeGen/AArch64/arm64-vecCmpBr.ll
index c7321e4b7d07..0c496fedfc2a 100644
--- a/test/CodeGen/AArch64/arm64-vecCmpBr.ll
+++ b/test/CodeGen/AArch64/arm64-vecCmpBr.ll
@@ -59,7 +59,7 @@ define i32 @anyNonZero64(<4 x i16> %a) #0 {
; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
; CHECK: [[LABEL]]:
-; CHECK-NEXT: movz w0, #0
+; CHECK-NEXT: mov w0, #0
entry:
%0 = bitcast <4 x i16> %a to <8 x i8>
@@ -83,7 +83,7 @@ define i32 @anyNonZero128(<8 x i16> %a) #0 {
; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
; CHECK: [[LABEL]]:
-; CHECK-NEXT: movz w0, #0
+; CHECK-NEXT: mov w0, #0
entry:
%0 = bitcast <8 x i16> %a to <16 x i8>
%vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
@@ -152,7 +152,7 @@ define i32 @allNonZero64(<4 x i16> %a) #0 {
; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
; CHECK: [[LABEL]]:
-; CHECK-NEXT: movz w0, #0
+; CHECK-NEXT: mov w0, #0
entry:
%0 = bitcast <4 x i16> %a to <8 x i8>
%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
@@ -175,7 +175,7 @@ define i32 @allNonZero128(<8 x i16> %a) #0 {
; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
; CHECK: [[LABEL]]:
-; CHECK-NEXT: movz w0, #0
+; CHECK-NEXT: mov w0, #0
entry:
%0 = bitcast <8 x i16> %a to <16 x i8>
%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 921cf6a6f0d1..241c3dcb9825 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
;CHECK: @func30
-;CHECK: movi.4h v1, #0x1
+;CHECK: movi.4h v1, #1
;CHECK: and.8b v0, v0, v1
;CHECK: ushll.4s v0, v0, #0
;CHECK: str q0, [x0]
diff --git a/test/CodeGen/AArch64/arm64-vector-imm.ll b/test/CodeGen/AArch64/arm64-vector-imm.ll
index d3de88d2049d..aa3ffd261d4b 100644
--- a/test/CodeGen/AArch64/arm64-vector-imm.ll
+++ b/test/CodeGen/AArch64/arm64-vector-imm.ll
@@ -50,35 +50,35 @@ define <2 x double> @foo(<2 x double> %bar) nounwind {
define <4 x i32> @movi_4s_imm_t1() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t1:
-; CHECK: movi.4s v0, #0x4b
+; CHECK: movi.4s v0, #75
ret <4 x i32> <i32 75, i32 75, i32 75, i32 75>
}
define <4 x i32> @movi_4s_imm_t2() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t2:
-; CHECK: movi.4s v0, #0x4b, lsl #8
+; CHECK: movi.4s v0, #75, lsl #8
ret <4 x i32> <i32 19200, i32 19200, i32 19200, i32 19200>
}
define <4 x i32> @movi_4s_imm_t3() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t3:
-; CHECK: movi.4s v0, #0x4b, lsl #16
+; CHECK: movi.4s v0, #75, lsl #16
ret <4 x i32> <i32 4915200, i32 4915200, i32 4915200, i32 4915200>
}
define <4 x i32> @movi_4s_imm_t4() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t4:
-; CHECK: movi.4s v0, #0x4b, lsl #24
+; CHECK: movi.4s v0, #75, lsl #24
ret <4 x i32> <i32 1258291200, i32 1258291200, i32 1258291200, i32 1258291200>
}
define <8 x i16> @movi_8h_imm_t5() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_8h_imm_t5:
-; CHECK: movi.8h v0, #0x4b
+; CHECK: movi.8h v0, #75
ret <8 x i16> <i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75>
}
@@ -86,28 +86,28 @@ entry:
define <8 x i16> @movi_8h_imm_t6() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_8h_imm_t6:
-; CHECK: movi.8h v0, #0x4b, lsl #8
+; CHECK: movi.8h v0, #75, lsl #8
ret <8 x i16> <i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200>
}
define <4 x i32> @movi_4s_imm_t7() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t7:
-; CHECK: movi.4s v0, #0x4b, msl #8
+; CHECK: movi.4s v0, #75, msl #8
ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455>
}
define <4 x i32> @movi_4s_imm_t8() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_4s_imm_t8:
-; CHECK: movi.4s v0, #0x4b, msl #16
+; CHECK: movi.4s v0, #75, msl #16
ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735>
}
define <16 x i8> @movi_16b_imm_t9() nounwind readnone ssp {
entry:
; CHECK-LABEL: movi_16b_imm_t9:
-; CHECK: movi.16b v0, #0x4b
+; CHECK: movi.16b v0, #75
ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75,
i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75>
}
diff --git a/test/CodeGen/AArch64/arm64-virtual_base.ll b/test/CodeGen/AArch64/arm64-virtual_base.ll
index 703d81a8d4fe..4ecfde4f83e2 100644
--- a/test/CodeGen/AArch64/arm64-virtual_base.ll
+++ b/test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -march arm64 | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -disable-post-ra | FileCheck %s
; <rdar://13463602>
%struct.Counter_Struct = type { i64, i64 }
diff --git a/test/CodeGen/AArch64/arm64-vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll
index d5a12483db40..b5a6788979e2 100644
--- a/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1315,7 +1315,7 @@ define <8 x i8> @uqshli8b(<8 x i8>* %A) nounwind {
define <8 x i8> @uqshli8b_1(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uqshli8b_1:
-;CHECK: movi.8b [[REG:v[0-9]+]], #0x8
+;CHECK: movi.8b [[REG:v[0-9]+]], #8
;CHECK: uqshl.8b v0, v0, [[REG]]
%tmp1 = load <8 x i8>, <8 x i8>* %A
%tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
index 15ea21b7638d..b4f57675ace3 100644
--- a/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -36,7 +36,7 @@ bb:
}
; CHECK: test3
-; CHECK: movi.4s v{{[0-9]+}}, #0x1
+; CHECK: movi.4s v{{[0-9]+}}, #1
define <16 x i1> @test3(i1* %ptr, i32 %v) {
bb:
%Shuff = shufflevector <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i1> undef,
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
index 349bb6fd78af..ae77f7e099db 100644
--- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -1,44 +1,52 @@
-; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefix=CYCLONE --check-prefix=ALL
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefix=KRYO --check-prefix=ALL
+
; rdar://11481771
; rdar://13713797
define void @t1() nounwind ssp {
entry:
-; CHECK-LABEL: t1:
-; CHECK-NOT: fmov
-; CHECK: movi.2d v0, #0000000000000000
-; CHECK: movi.2d v1, #0000000000000000
-; CHECK: movi.2d v2, #0000000000000000
-; CHECK: movi.2d v3, #0000000000000000
+; ALL-LABEL: t1:
+; ALL-NOT: fmov
+; CYCLONE: movi.2d v0, #0000000000000000
+; CYCLONE: movi.2d v1, #0000000000000000
+; CYCLONE: movi.2d v2, #0000000000000000
+; CYCLONE: movi.2d v3, #0000000000000000
+; KRYO: movi v0.2d, #0000000000000000
+; KRYO: movi v1.2d, #0000000000000000
+; KRYO: movi v2.2d, #0000000000000000
+; KRYO: movi v3.2d, #0000000000000000
tail call void @bar(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00) nounwind
ret void
}
define void @t2() nounwind ssp {
entry:
-; CHECK-LABEL: t2:
-; CHECK-NOT: mov w0, wzr
-; CHECK: movz w0, #0
-; CHECK: movz w1, #0
+; ALL-LABEL: t2:
+; ALL-NOT: mov w0, wzr
+; ALL: mov w0, #0
+; ALL: mov w1, #0
tail call void @bari(i32 0, i32 0) nounwind
ret void
}
define void @t3() nounwind ssp {
entry:
-; CHECK-LABEL: t3:
-; CHECK-NOT: mov x0, xzr
-; CHECK: movz x0, #0
-; CHECK: movz x1, #0
+; ALL-LABEL: t3:
+; ALL-NOT: mov x0, xzr
+; ALL: mov x0, #0
+; ALL: mov x1, #0
tail call void @barl(i64 0, i64 0) nounwind
ret void
}
define void @t4() nounwind ssp {
-; CHECK-LABEL: t4:
-; CHECK-NOT: fmov
-; CHECK: movi.2d v0, #0000000000000000
-; CHECK: movi.2d v1, #0000000000000000
+; ALL-LABEL: t4:
+; ALL-NOT: fmov
+; CYCLONE: movi.2d v0, #0000000000000000
+; CYCLONE: movi.2d v1, #0000000000000000
+; KRYO: movi v0.2d, #0000000000000000
+; KRYO: movi v1.2d, #0000000000000000
tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
ret void
}
@@ -47,3 +55,29 @@ declare void @bar(double, double, double, double)
declare void @bari(i32, i32)
declare void @barl(i64, i64)
declare void @barf(float, float)
+
+; We used to produce spills+reloads for a Q register with zero cycle zeroing
+; enabled.
+; ALL-LABEL: foo:
+; ALL-NOT: str {{q[0-9]+}}
+; ALL-NOT: ldr {{q[0-9]+}}
+define double @foo(i32 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ]
+ %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %conv21 = sitofp i32 %i.076 to double
+ %call = tail call fast double @sin(double %conv21)
+ %cmp.i = fcmp fast olt double %phi0, %call
+ %v0 = select i1 %cmp.i, double %call, double %phi0
+ %inc = add nuw nsw i32 %i.076, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret double %v0
+}
+
+declare double @sin(double)
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index 900d2072925f..9fac8d8a868a 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG
diff --git a/test/CodeGen/AArch64/bitfield-extract.ll b/test/CodeGen/AArch64/bitfield-extract.ll
new file mode 100644
index 000000000000..5e727b669e22
--- /dev/null
+++ b/test/CodeGen/AArch64/bitfield-extract.ll
@@ -0,0 +1,98 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; CHECK-LABEL: @test1
+; CHECK: sbfx {{x[0-9]+}}, x0, #23, #9
+define i64 @test1(i32 %a) {
+ %tmp = ashr i32 %a, 23
+ %ext = sext i32 %tmp to i64
+ %res = add i64 %ext, 1
+ ret i64 %res
+}
+
+; CHECK-LABEL: @test2
+; CHECK: sbfx w0, w0, #23, #8
+define signext i8 @test2(i32 %a) {
+ %tmp = ashr i32 %a, 23
+ %res = trunc i32 %tmp to i8
+ ret i8 %res
+}
+
+; CHECK-LABEL: @test3
+; CHECK: sbfx w0, w0, #23, #8
+define signext i8 @test3(i32 %a) {
+ %tmp = lshr i32 %a, 23
+ %res = trunc i32 %tmp to i8
+ ret i8 %res
+}
+
+; CHECK-LABEL: @test4
+; CHECK: sbfx w0, w0, #15, #16
+define signext i16 @test4(i32 %a) {
+ %tmp = lshr i32 %a, 15
+ %res = trunc i32 %tmp to i16
+ ret i16 %res
+}
+
+; CHECK-LABEL: @test5
+; CHECK: sbfx w0, w0, #16, #8
+define signext i8 @test5(i64 %a) {
+ %tmp = lshr i64 %a, 16
+ %res = trunc i64 %tmp to i8
+ ret i8 %res
+}
+
+; CHECK-LABEL: @test6
+; CHECK: sbfx x0, x0, #30, #8
+define signext i8 @test6(i64 %a) {
+ %tmp = lshr i64 %a, 30
+ %res = trunc i64 %tmp to i8
+ ret i8 %res
+}
+
+; CHECK-LABEL: @test7
+; CHECK: sbfx x0, x0, #23, #16
+define signext i16 @test7(i64 %a) {
+ %tmp = lshr i64 %a, 23
+ %res = trunc i64 %tmp to i16
+ ret i16 %res
+}
+
+; CHECK-LABEL: @test8
+; CHECK: asr w0, w0, #25
+define signext i8 @test8(i32 %a) {
+ %tmp = ashr i32 %a, 25
+ %res = trunc i32 %tmp to i8
+ ret i8 %res
+}
+
+; CHECK-LABEL: @test9
+; CHECK: lsr w0, w0, #25
+define signext i8 @test9(i32 %a) {
+ %tmp = lshr i32 %a, 25
+ %res = trunc i32 %tmp to i8
+ ret i8 %res
+}
+
+; CHECK-LABEL: @test10
+; CHECK: lsr x0, x0, #49
+define signext i16 @test10(i64 %a) {
+ %tmp = lshr i64 %a, 49
+ %res = trunc i64 %tmp to i16
+ ret i16 %res
+}
+
+; SHR with multiple uses is fine as SXTH and SBFX are both aliases of SBFM.
+; However, allowing the transformation means the SHR and SBFX can execute in
+; parallel.
+;
+; CHECK-LABEL: @test11
+; CHECK: lsr x1, x0, #23
+; CHECK: sbfx x0, x0, #23, #16
+define void @test11(i64 %a) {
+ %tmp = lshr i64 %a, 23
+ %res = trunc i64 %tmp to i16
+ call void @use(i16 %res, i64 %tmp)
+ ret void
+}
+
+declare void @use(i16 signext, i64)
diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll
index 509b547a5c82..735be244d457 100644
--- a/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/test/CodeGen/AArch64/bitfield-insert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
; First, a simple example from Clang. The registers could plausibly be
; different, but probably won't be.
@@ -237,3 +237,246 @@ define i32 @test_nouseful_bits(i8 %a, i32 %b) {
%shl.4 = shl i32 %or.3, 8 ; A A A 0
ret i32 %shl.4
}
+
+define void @test_nouseful_strb(i32* %ptr32, i8* %ptr8, i32 %x) {
+entry:
+; CHECK-LABEL: @test_nouseful_strb
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xf8
+; CHECK-NEXT: bfxil [[REG1]], w2, #16, #3
+; CHECK-NEXT: strb [[REG1]],
+; CHECK-NEXT: ret
+ %0 = load i32, i32* %ptr32, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %trunc = trunc i32 %or to i8
+ store i8 %trunc, i8* %ptr8
+ ret void
+}
+
+define void @test_nouseful_strh(i32* %ptr32, i16* %ptr16, i32 %x) {
+entry:
+; CHECK-LABEL: @test_nouseful_strh
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xfff0
+; CHECK-NEXT: bfxil [[REG1]], w2, #16, #4
+; CHECK-NEXT: strh [[REG1]],
+; CHECK-NEXT: ret
+ %0 = load i32, i32* %ptr32, align 8
+ %and = and i32 %0, -16
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 15
+ %or = or i32 %and, %and1
+ %trunc = trunc i32 %or to i16
+ store i16 %trunc, i16* %ptr16
+ ret void
+}
+
+define void @test_nouseful_sturb(i32* %ptr32, i8* %ptr8, i32 %x) {
+entry:
+; CHECK-LABEL: @test_nouseful_sturb
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xf8
+; CHECK-NEXT: bfxil [[REG1]], w2, #16, #3
+; CHECK-NEXT: sturb [[REG1]],
+; CHECK-NEXT: ret
+ %0 = load i32, i32* %ptr32, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %trunc = trunc i32 %or to i8
+ %gep = getelementptr i8, i8* %ptr8, i64 -1
+ store i8 %trunc, i8* %gep
+ ret void
+}
+
+define void @test_nouseful_sturh(i32* %ptr32, i16* %ptr16, i32 %x) {
+entry:
+; CHECK-LABEL: @test_nouseful_sturh
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xfff0
+; CHECK-NEXT: bfxil [[REG1]], w2, #16, #4
+; CHECK-NEXT: sturh [[REG1]],
+; CHECK-NEXT: ret
+ %0 = load i32, i32* %ptr32, align 8
+ %and = and i32 %0, -16
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 15
+ %or = or i32 %and, %and1
+ %trunc = trunc i32 %or to i16
+ %gep = getelementptr i16, i16* %ptr16, i64 -1
+ store i16 %trunc, i16* %gep
+ ret void
+}
+
+; The next set of tests generate a BFXIL from 'or (and X, Mask0Imm),
+; (and Y, Mask1Imm)' iff Mask0Imm and ~Mask1Imm are equivalent and one of the
+; MaskImms is a shifted mask (e.g., 0x000ffff0).
+
+; CHECK-LABEL: @test_or_and_and1
+; CHECK: lsr w8, w1, #4
+; CHECK: bfi w0, w8, #4, #12
+define i32 @test_or_and_and1(i32 %a, i32 %b) {
+entry:
+ %and = and i32 %a, -65521 ; 0xffff000f
+ %and1 = and i32 %b, 65520 ; 0x0000fff0
+ %or = or i32 %and1, %and
+ ret i32 %or
+}
+
+; CHECK-LABEL: @test_or_and_and2
+; CHECK: lsr w8, w0, #4
+; CHECK: bfi w1, w8, #4, #12
+define i32 @test_or_and_and2(i32 %a, i32 %b) {
+entry:
+ %and = and i32 %a, 65520 ; 0x0000fff0
+ %and1 = and i32 %b, -65521 ; 0xffff000f
+ %or = or i32 %and1, %and
+ ret i32 %or
+}
+
+; CHECK-LABEL: @test_or_and_and3
+; CHECK: lsr x8, x1, #16
+; CHECK: bfi x0, x8, #16, #32
+define i64 @test_or_and_and3(i64 %a, i64 %b) {
+entry:
+ %and = and i64 %a, -281474976645121 ; 0xffff00000000ffff
+ %and1 = and i64 %b, 281474976645120 ; 0x0000ffffffff0000
+ %or = or i64 %and1, %and
+ ret i64 %or
+}
+
+; Don't convert 'and' with multiple uses.
+; CHECK-LABEL: @test_or_and_and4
+; CHECK: and w8, w0, #0xffff000f
+; CHECK: and w9, w1, #0xfff0
+; CHECK: orr w0, w9, w8
+; CHECK: str w8, [x2
+define i32 @test_or_and_and4(i32 %a, i32 %b, i32* %ptr) {
+entry:
+ %and = and i32 %a, -65521
+ store i32 %and, i32* %ptr, align 4
+ %and2 = and i32 %b, 65520
+ %or = or i32 %and2, %and
+ ret i32 %or
+}
+
+; Don't convert 'and' with multiple uses.
+; CHECK-LABEL: @test_or_and_and5
+; CHECK: and w8, w1, #0xfff0
+; CHECK: and w9, w0, #0xffff000f
+; CHECK: orr w0, w8, w9
+; CHECK: str w8, [x2]
+define i32 @test_or_and_and5(i32 %a, i32 %b, i32* %ptr) {
+entry:
+ %and = and i32 %b, 65520
+ store i32 %and, i32* %ptr, align 4
+ %and1 = and i32 %a, -65521
+ %or = or i32 %and, %and1
+ ret i32 %or
+}
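+
+As a worked instance of the condition described above the test_or_and_and tests, take the constants from test_or_and_and1: 0xffff000f is exactly ~0x0000fff0, and 0x0000fff0 is a shifted (contiguous) mask covering bits 4..15, so the or/and expression is a 12-bit field insert of %b into %a at bit 4, matching the lsr+bfi in the CHECK lines. A minimal C sketch (the function name is made up for illustration):
+
+  #include <stdint.h>
+
+  /* Complementary masks: 0xffff000f == ~0x0000fff0, and 0x0000fff0 is a
+     contiguous mask for bits [4..15], so this is a plain bitfield insert. */
+  uint32_t field_insert(uint32_t a, uint32_t b) {
+    return (a & 0xffff000fu) | (b & 0x0000fff0u);
+  }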
+
+; CHECK-LABEL: @test1
+; CHECK: mov [[REG:w[0-9]+]], #5
+; CHECK: bfxil w0, [[REG]], #0, #4
+define i32 @test1(i32 %a) {
+ %1 = and i32 %a, -16 ; 0xfffffff0
+ %2 = or i32 %1, 5 ; 0x00000005
+ ret i32 %2
+}
+
+; CHECK-LABEL: @test2
+; CHECK: mov [[REG:w[0-9]+]], #10
+; CHECK: bfi w0, [[REG]], #22, #4
+define i32 @test2(i32 %a) {
+ %1 = and i32 %a, -62914561 ; 0xfc3fffff
+ %2 = or i32 %1, 41943040 ; 0x06400000
+ ret i32 %2
+}
+
+; CHECK-LABEL: @test3
+; CHECK: mov [[REG:x[0-9]+]], #5
+; CHECK: bfxil x0, [[REG]], #0, #3
+define i64 @test3(i64 %a) {
+ %1 = and i64 %a, -8 ; 0xfffffffffffffff8
+ %2 = or i64 %1, 5 ; 0x0000000000000005
+ ret i64 %2
+}
+
+; CHECK-LABEL: @test4
+; CHECK: mov [[REG:x[0-9]+]], #9
+; CHECK: bfi x0, [[REG]], #1, #7
+define i64 @test4(i64 %a) {
+ %1 = and i64 %a, -255 ; 0xffffffffffffff01
+ %2 = or i64 %1, 18 ; 0x0000000000000012
+ ret i64 %2
+}
+
+; Don't generate BFI/BFXIL if the immediate can be encoded in the ORR.
+; CHECK-LABEL: @test5
+; CHECK: and [[REG:w[0-9]+]], w0, #0xfffffff0
+; CHECK: orr w0, [[REG]], #0x6
+define i32 @test5(i32 %a) {
+ %1 = and i32 %a, 4294967280 ; 0xfffffff0
+ %2 = or i32 %1, 6 ; 0x00000006
+ ret i32 %2
+}
+
+; BFXIL will use the same constant as the ORR, so we don't care how the constant
+; is materialized (it's an equal cost either way).
+; CHECK-LABEL: @test6
+; CHECK: mov [[REG:w[0-9]+]], #720896
+; CHECK: movk [[REG]], #23250
+; CHECK: bfxil w0, [[REG]], #0, #20
+define i32 @test6(i32 %a) {
+ %1 = and i32 %a, 4293918720 ; 0xfff00000
+ %2 = or i32 %1, 744146 ; 0x000b5ad2
+ ret i32 %2
+}
+
+; BFIs that require the same number of instructions to materialize the constant
+; as the original ORR are okay.
+; CHECK-LABEL: @test7
+; CHECK: mov [[REG:w[0-9]+]], #327680
+; CHECK: movk [[REG]], #44393
+; CHECK: bfi w0, [[REG]], #1, #19
+define i32 @test7(i32 %a) {
+ %1 = and i32 %a, 4293918721 ; 0xfff00001
+ %2 = or i32 %1, 744146 ; 0x000b5ad2
+ ret i32 %2
+}
+
+; BFIs that require more instructions to materialize the constant as compared
+; to the original ORR are not okay. In this case we would be replacing the
+; 'and' with a 'movk', which would decrease ILP while using the same number of
+; instructions.
+; CHECK-LABEL: @test8
+; CHECK: mov [[REG2:x[0-9]+]], #157599529959424
+; CHECK: and [[REG1:x[0-9]+]], x0, #0xff000000000000ff
+; CHECK: movk [[REG2]], #31059, lsl #16
+; CHECK: orr x0, [[REG1]], [[REG2]]
+define i64 @test8(i64 %a) {
+ %1 = and i64 %a, -72057594037927681 ; 0xff000000000000ff
+ %2 = or i64 %1, 157601565442048 ; 0x00008f5679530000
+ ret i64 %2
+}
+
+; This test exposed an issue with an overly aggressive assert. The bit of code
+; that is expected to catch this case is unable to deal with the trunc, which
+; results in a failing check due to a mismatch between the BFI opcode and
+; the expected value type of the OR.
+; CHECK-LABEL: @test9
+; CHECK: lsr x0, x0, #12
+; CHECK: lsr [[REG:w[0-9]+]], w1, #23
+; CHECK: bfi w0, [[REG]], #23, #9
+define i32 @test9(i64 %b, i32 %e) {
+ %c = lshr i64 %b, 12
+ %d = trunc i64 %c to i32
+ %f = and i32 %d, 8388607
+ %g = and i32 %e, -8388608
+ %h = or i32 %g, %f
+ ret i32 %h
+}
diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll
index 5f19b6943b8e..8bd1279544b8 100644
--- a/test/CodeGen/AArch64/bitfield.ll
+++ b/test/CodeGen/AArch64/bitfield.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/bitreverse.ll b/test/CodeGen/AArch64/bitreverse.ll
index 936e3554b397..2eee7cfd8b97 100644
--- a/test/CodeGen/AArch64/bitreverse.ll
+++ b/test/CodeGen/AArch64/bitreverse.ll
@@ -7,6 +7,7 @@ declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
define <2 x i16> @f(<2 x i16> %a) {
; CHECK-LABEL: f:
+; CHECK: rev32
; CHECK: ushr
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
ret <2 x i16> %b
@@ -47,14 +48,14 @@ define <8 x i8> @g_vec(<8 x i8> %a) {
; Try and match as much of the sequence as precisely as possible.
; CHECK-LABEL: g_vec:
-; CHECK-DAG: movi [[M1:v.*]], #0x80
-; CHECK-DAG: movi [[M2:v.*]], #0x40
-; CHECK-DAG: movi [[M3:v.*]], #0x20
-; CHECK-DAG: movi [[M4:v.*]], #0x10
-; CHECK-DAG: movi [[M5:v.*]], #0x8
-; CHECK-DAG: movi [[M6:v.*]], #0x4{{$}}
-; CHECK-DAG: movi [[M7:v.*]], #0x2{{$}}
-; CHECK-DAG: movi [[M8:v.*]], #0x1{{$}}
+; CHECK-DAG: movi [[M1:v.*]], #128
+; CHECK-DAG: movi [[M2:v.*]], #64
+; CHECK-DAG: movi [[M3:v.*]], #32
+; CHECK-DAG: movi [[M4:v.*]], #16
+; CHECK-DAG: movi [[M5:v.*]], #8{{$}}
+; CHECK-DAG: movi [[M6:v.*]], #4{{$}}
+; CHECK-DAG: movi [[M7:v.*]], #2{{$}}
+; CHECK-DAG: movi [[M8:v.*]], #1{{$}}
; CHECK-DAG: shl [[S1:v.*]], v0.8b, #7
; CHECK-DAG: shl [[S2:v.*]], v0.8b, #5
; CHECK-DAG: shl [[S3:v.*]], v0.8b, #3
diff --git a/test/CodeGen/AArch64/branch-folder-merge-mmos.ll b/test/CodeGen/AArch64/branch-folder-merge-mmos.ll
index 3f9c0239fe41..e3af90ae4831 100644
--- a/test/CodeGen/AArch64/branch-folder-merge-mmos.ll
+++ b/test/CodeGen/AArch64/branch-folder-merge-mmos.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=aarch64 -mtriple=aarch64-none-linux-gnu -stop-after branch-folder -o /dev/null < %s | FileCheck %s
+; RUN: llc -march=aarch64 -mtriple=aarch64-none-linux-gnu -stop-after branch-folder -o - < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; Function Attrs: norecurse nounwind
diff --git a/test/CodeGen/AArch64/bswap-known-bits.ll b/test/CodeGen/AArch64/bswap-known-bits.ll
new file mode 100644
index 000000000000..e5de7953d1b8
--- /dev/null
+++ b/test/CodeGen/AArch64/bswap-known-bits.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -mtriple=aarch64-apple-darwin | FileCheck %s
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+
+; CHECK-LABEL: @test1
+; CHECK: orr w0, wzr, #0x1
+define i1 @test1(i16 %arg) {
+ %a = or i16 %arg, 511
+ %b = call i16 @llvm.bswap.i16(i16 %a)
+ %and = and i16 %b, 256
+ %res = icmp eq i16 %and, 256
+ ret i1 %res
+}
+
+; CHECK-LABEL: @test2
+; CHECK: orr w0, wzr, #0x1
+define i1 @test2(i16 %arg) {
+ %a = or i16 %arg, 1
+ %b = call i16 @llvm.bswap.i16(i16 %a)
+ %and = and i16 %b, 256
+ %res = icmp eq i16 %and, 256
+ ret i1 %res
+}
+
+; CHECK-LABEL: @test3
+; CHECK: orr w0, wzr, #0x1
+define i1 @test3(i16 %arg) {
+ %a = or i16 %arg, 256
+ %b = call i16 @llvm.bswap.i16(i16 %a)
+ %and = and i16 %b, 1
+ %res = icmp eq i16 %and, 1
+ ret i1 %res
+}
+
+; CHECK-LABEL: @test4
+; CHECK: orr w0, wzr, #0x1
+define i1 @test4(i32 %arg) {
+ %a = or i32 %arg, 2147483647 ; i32_MAX
+ %b = call i32 @llvm.bswap.i32(i32 %a)
+ %and = and i32 %b, 127
+ %res = icmp eq i32 %and, 127
+ ret i1 %res
+}
diff --git a/test/CodeGen/AArch64/cmpxchg-O0.ll b/test/CodeGen/AArch64/cmpxchg-O0.ll
new file mode 100644
index 000000000000..c79d82a63774
--- /dev/null
+++ b/test/CodeGen/AArch64/cmpxchg-O0.ll
@@ -0,0 +1,75 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -O0 %s -o - | FileCheck %s
+
+define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_8:
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxrb [[OLD:w[0-9]+]], [x0]
+; CHECK: cmp [[OLD]], w1, uxtb
+; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: stlxrb [[STATUS:w[3-9]]], w2, [x0]
+; CHECK: cbnz [[STATUS]], [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: subs {{w[0-9]+}}, [[OLD]], w1
+; CHECK: cset {{w[0-9]+}}, eq
+ %res = cmpxchg i8* %addr, i8 %desired, i8 %new seq_cst monotonic
+ ret { i8, i1 } %res
+}
+
+define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_16:
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxrh [[OLD:w[0-9]+]], [x0]
+; CHECK: cmp [[OLD]], w1, uxth
+; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: stlxrh [[STATUS:w[3-9]]], w2, [x0]
+; CHECK: cbnz [[STATUS]], [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: subs {{w[0-9]+}}, [[OLD]], w1
+; CHECK: cset {{w[0-9]+}}, eq
+ %res = cmpxchg i16* %addr, i16 %desired, i16 %new seq_cst monotonic
+ ret { i16, i1 } %res
+}
+
+define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_32:
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[OLD:w[0-9]+]], [x0]
+; CHECK: cmp [[OLD]], w1
+; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: stlxr [[STATUS:w[3-9]]], w2, [x0]
+; CHECK: cbnz [[STATUS]], [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: subs {{w[0-9]+}}, [[OLD]], w1
+; CHECK: cset {{w[0-9]+}}, eq
+ %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ ret { i32, i1 } %res
+}
+
+define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_64:
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[OLD:x[0-9]+]], [x0]
+; CHECK: cmp [[OLD]], x1
+; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: stlxr [[STATUS:w[3-9]]], x2, [x0]
+; CHECK: cbnz [[STATUS]], [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: subs {{x[0-9]+}}, [[OLD]], x1
+; CHECK: cset {{w[0-9]+}}, eq
+ %res = cmpxchg i64* %addr, i64 %desired, i64 %new seq_cst monotonic
+ ret { i64, i1 } %res
+}
+
+define { i128, i1 } @test_cmpxchg_128(i128* %addr, i128 %desired, i128 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_128:
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], [x0]
+; CHECK: cmp [[OLD_LO]], x2
+; CHECK: sbcs xzr, [[OLD_HI]], x3
+; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: stlxp [[STATUS:w[0-9]+]], x4, x5, [x0]
+; CHECK: cbnz [[STATUS]], [[RETRY]]
+; CHECK: [[DONE]]:
+ %res = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst monotonic
+ ret { i128, i1 } %res
+}
diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
index 004267f4e4e0..1f8e0efa0675 100644
--- a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -405,11 +405,11 @@ return: ; preds = %land.lhs.true, %con
define void @cmp_shifted(i32 %in, i32 %lhs, i32 %rhs) {
; CHECK-LABEL: cmp_shifted:
-; CHECK: cmp w0, #1
-; [...]
; CHECK: cmp w0, #2, lsl #12
+; [...]
+; CHECK: cmp w0, #1
- %tst_low = icmp sgt i32 %in, 0
+ %tst_low = icmp sgt i32 %in, 8191
br i1 %tst_low, label %true, label %false
true:
@@ -417,7 +417,7 @@ true:
ret void
false:
- %tst = icmp sgt i32 %in, 8191
+ %tst = icmp sgt i32 %in, 0
br i1 %tst, label %truer, label %falser
truer:
@@ -429,6 +429,42 @@ falser:
ret void
}
+define i32 @combine_gt_ge_sel(i64 %v, i64* %p) #0 {
+; CHECK-LABEL: combine_gt_ge_sel
+; CHECK: ldr [[reg1:w[0-9]*]],
+; CHECK: cmp [[reg1]], #0
+; CHECK: csel {{.*}}, gt
+entry:
+ %0 = load i32, i32* @a, align 4
+ %cmp = icmp sgt i32 %0, 0
+ %m = select i1 %cmp, i64 %v, i64 0
+ store i64 %m, i64* %p
+ br i1 %cmp, label %lor.lhs.false, label %land.lhs.true
+
+land.lhs.true: ; preds = %entry
+ %1 = load i32, i32* @b, align 4
+ %2 = load i32, i32* @c, align 4
+ %cmp1 = icmp eq i32 %1, %2
+ br i1 %cmp1, label %return, label %land.lhs.true3
+
+lor.lhs.false: ; preds = %entry
+ %cmp2 = icmp sgt i32 %0, 1
+ br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3: ; preds = %lor.lhs.false, %land.lhs.true
+ %3 = load i32, i32* @b, align 4
+ %4 = load i32, i32* @d, align 4
+ %cmp4 = icmp eq i32 %3, %4
+ br i1 %cmp4, label %return, label %if.end
+
+if.end: ; preds = %land.lhs.true3, %lor.lhs.false
+ br label %return
+
+return: ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+ %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+ ret i32 %retval.0
+}
+
declare i32 @zoo(i32)
declare double @yoo(i32)
diff --git a/test/CodeGen/AArch64/complex-int-to-fp.ll b/test/CodeGen/AArch64/complex-int-to-fp.ll
index 1102553ab551..227c626ba15d 100644
--- a/test/CodeGen/AArch64/complex-int-to-fp.ll
+++ b/test/CodeGen/AArch64/complex-int-to-fp.ll
@@ -155,7 +155,7 @@ define <4 x float> @test_signed_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
}
define <4 x float> @test_unsigned_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
; CHECK-LABEL: test_unsigned_v4i8_to_v4f32
-; CHECK: bic.4h v0, #0xff, lsl #8
+; CHECK: bic.4h v0, #255, lsl #8
; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
; CHECK: ucvtf.4s v0, [[VAL32]]
diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll
index dfc83aacfcfc..b39cea1f6192 100644
--- a/test/CodeGen/AArch64/cond-sel.ll
+++ b/test/CodeGen/AArch64/cond-sel.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var32 = global i32 0
@@ -10,8 +10,8 @@ define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
%tst1 = icmp ugt i32 %lhs32, %rhs32
%val1 = select i1 %tst1, i32 42, i32 52
store i32 %val1, i32* @var32
-; CHECK-DAG: movz [[W52:w[0-9]+]], #{{52|0x34}}
-; CHECK-DAG: movz [[W42:w[0-9]+]], #{{42|0x2a}}
+; CHECK-DAG: mov [[W52:w[0-9]+]], #{{52|0x34}}
+; CHECK-DAG: mov [[W42:w[0-9]+]], #{{42|0x2a}}
; CHECK: csel {{w[0-9]+}}, [[W42]], [[W52]], hi
%rhs64 = sext i32 %rhs32 to i64
@@ -34,8 +34,8 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
; CHECK-NOFP-NOT: fcmp
%val1 = select i1 %tst1, i32 42, i32 52
store i32 %val1, i32* @var32
-; CHECK: movz [[W52:w[0-9]+]], #{{52|0x34}}
-; CHECK: movz [[W42:w[0-9]+]], #{{42|0x2a}}
+; CHECK: mov [[W52:w[0-9]+]], #{{52|0x34}}
+; CHECK: mov [[W42:w[0-9]+]], #{{42|0x2a}}
; CHECK: csel [[MAYBETRUE:w[0-9]+]], [[W42]], [[W52]], mi
; CHECK: csel {{w[0-9]+}}, [[W42]], [[MAYBETRUE]], gt
@@ -46,7 +46,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
%val2 = select i1 %tst2, i64 9, i64 15
store i64 %val2, i64* @var64
; CHECK: orr w[[CONST15:[0-9]+]], wzr, #0xf
-; CHECK: movz {{[wx]}}[[CONST9:[0-9]+]], #{{9|0x9}}
+; CHECK: mov {{[wx]}}[[CONST9:[0-9]+]], #{{9|0x9}}
; CHECK: csel [[MAYBETRUE:x[0-9]+]], x[[CONST9]], x[[CONST15]], eq
; CHECK: csel {{x[0-9]+}}, x[[CONST9]], [[MAYBETRUE]], vs
@@ -135,6 +135,34 @@ define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK: ret
}
+define void @test_csinv0(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) minsize {
+; CHECK-LABEL: test_csinv0:
+
+ %tst1 = icmp ugt i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, i32 0, i32 -1
+ store volatile i32 %val1, i32* @var32
+; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]]
+; CHECK: csetm {{w[0-9]+}}, ls
+
+ %rhs2 = add i32 %rhs32, 42
+ %tst2 = icmp sle i32 %lhs32, %rhs2
+ %val2 = select i1 %tst2, i32 -1, i32 %rhs2
+ store volatile i32 %val2, i32* @var32
+; CHECK: cmp [[LHS2:w[0-9]+]], [[RHS2:w[0-9]+]]
+; CHECK: csinv {{w[0-9]+}}, [[RHS2]], wzr, gt
+
+; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls).
+ %rhs3 = mul i64 %rhs64, 19
+ %tst3 = icmp ugt i64 %lhs64, %rhs3
+ %val3 = select i1 %tst3, i64 %rhs3, i64 -1
+ store volatile i64 %val3, i64* @var64
+; CHECK: cmp [[LHS3:x[0-9]+]], [[RHS3:x[0-9]+]]
+; CHECK: csinv {{x[0-9]+}}, [[RHS3]], xzr, hi
+
+ ret void
+; CHECK: ret
+}
+
define void @test_csneg(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csneg:
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index 9c2a4fd55d1b..3296e38b64f4 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -6,7 +6,10 @@
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a73 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=exynos-m1 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=kryo 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=vulcan 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
; CHECK-NOT: {{.*}} is not a recognized processor for this target
diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll
index 9996c0d3aba8..a36aad51ca82 100644
--- a/test/CodeGen/AArch64/cxx-tlscc.ll
+++ b/test/CodeGen/AArch64/cxx-tlscc.ll
@@ -1,8 +1,10 @@
; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-apple-ios -enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s
+; RUN: llc < %s -mtriple=aarch64-apple-ios -enable-shrink-wrap=true | FileCheck %s
; Shrink wrapping currently does not kick in because we have a TLS CALL
; in the entry block and it will clobber the link register.
+; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 | FileCheck --check-prefix=CHECK-O0 %s
+
%struct.S = type { i8 }
@sg = internal thread_local global %struct.S zeroinitializer, align 1
@@ -42,7 +44,9 @@ __tls_init.exit:
; CHECK-NOT: stp d3, d2
; CHECK-NOT: stp d1, d0
; CHECK-NOT: stp x20, x19
-; CHECK-NOT: stp x14, x13
+; FIXME: The splitting logic in the register allocator fails to split along
+; control flow here; we used to get this right by accident before...
+; CHECK-NOTXX: stp x14, x13
; CHECK-NOT: stp x12, x11
; CHECK-NOT: stp x10, x9
; CHECK-NOT: stp x8, x7
@@ -61,7 +65,7 @@ __tls_init.exit:
; CHECK-NOT: ldp x8, x7
; CHECK-NOT: ldp x10, x9
; CHECK-NOT: ldp x12, x11
-; CHECK-NOT: ldp x14, x13
+; CHECK-NOTXX: ldp x14, x13
; CHECK-NOT: ldp x20, x19
; CHECK-NOT: ldp d1, d0
; CHECK-NOT: ldp d3, d2
@@ -76,6 +80,52 @@ __tls_init.exit:
; CHECK-NOT: ldp d29, d28
; CHECK-NOT: ldp d31, d30
+; CHECK-O0-LABEL: _ZTW2sg
+; CHECK-O0: stp d31, d30
+; CHECK-O0: stp d29, d28
+; CHECK-O0: stp d27, d26
+; CHECK-O0: stp d25, d24
+; CHECK-O0: stp d23, d22
+; CHECK-O0: stp d21, d20
+; CHECK-O0: stp d19, d18
+; CHECK-O0: stp d17, d16
+; CHECK-O0: stp d7, d6
+; CHECK-O0: stp d5, d4
+; CHECK-O0: stp d3, d2
+; CHECK-O0: stp d1, d0
+; CHECK-O0: stp x14, x13
+; CHECK-O0: stp x12, x11
+; CHECK-O0: stp x10, x9
+; CHECK-O0: stp x8, x7
+; CHECK-O0: stp x6, x5
+; CHECK-O0: stp x4, x3
+; CHECK-O0: stp x2, x1
+; CHECK-O0: blr
+; CHECK-O0: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]]
+; CHECK-O0: blr
+; CHECK-O0: tlv_atexit
+; CHECK-O0: [[BB_end]]:
+; CHECK-O0: blr
+; CHECK-O0: ldp x2, x1
+; CHECK-O0: ldp x4, x3
+; CHECK-O0: ldp x6, x5
+; CHECK-O0: ldp x8, x7
+; CHECK-O0: ldp x10, x9
+; CHECK-O0: ldp x12, x11
+; CHECK-O0: ldp x14, x13
+; CHECK-O0: ldp d1, d0
+; CHECK-O0: ldp d3, d2
+; CHECK-O0: ldp d5, d4
+; CHECK-O0: ldp d7, d6
+; CHECK-O0: ldp d17, d16
+; CHECK-O0: ldp d19, d18
+; CHECK-O0: ldp d21, d20
+; CHECK-O0: ldp d23, d22
+; CHECK-O0: ldp d25, d24
+; CHECK-O0: ldp d27, d26
+; CHECK-O0: ldp d29, d28
+; CHECK-O0: ldp d31, d30
+
; CHECK-LABEL: _ZTW4sum1
; CHECK-NOT: stp d31, d30
; CHECK-NOT: stp d29, d28
@@ -98,6 +148,77 @@ __tls_init.exit:
; CHECK-NOT: stp x4, x3
; CHECK-NOT: stp x2, x1
; CHECK: blr
+
+; CHECK-O0-LABEL: _ZTW4sum1
+; CHECK-O0-NOT: vstr
+; CHECK-O0-NOT: vldr
define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind {
ret i32* @sum1
}
+
+; Make sure at O0, we don't generate spilling/reloading of the CSRs.
+; CHECK-O0-LABEL: tls_test2
+; CHECK-O0-NOT: stp d31, d30
+; CHECK-O0-NOT: stp d29, d28
+; CHECK-O0-NOT: stp d27, d26
+; CHECK-O0-NOT: stp d25, d24
+; CHECK-O0-NOT: stp d23, d22
+; CHECK-O0-NOT: stp d21, d20
+; CHECK-O0-NOT: stp d19, d18
+; CHECK-O0-NOT: stp d17, d16
+; CHECK-O0-NOT: stp d7, d6
+; CHECK-O0-NOT: stp d5, d4
+; CHECK-O0-NOT: stp d3, d2
+; CHECK-O0-NOT: stp d1, d0
+; CHECK-O0-NOT: stp x20, x19
+; CHECK-O0-NOT: stp x14, x13
+; CHECK-O0-NOT: stp x12, x11
+; CHECK-O0-NOT: stp x10, x9
+; CHECK-O0-NOT: stp x8, x7
+; CHECK-O0-NOT: stp x6, x5
+; CHECK-O0-NOT: stp x4, x3
+; CHECK-O0-NOT: stp x2, x1
+; CHECK-O0: bl {{.*}}tls_helper
+; CHECK-O0-NOT: ldp x2, x1
+; CHECK-O0-NOT: ldp x4, x3
+; CHECK-O0-NOT: ldp x6, x5
+; CHECK-O0-NOT: ldp x8, x7
+; CHECK-O0-NOT: ldp x10, x9
+; CHECK-O0-NOT: ldp x12, x11
+; CHECK-O0-NOT: ldp x14, x13
+; CHECK-O0-NOT: ldp x20, x19
+; CHECK-O0-NOT: ldp d1, d0
+; CHECK-O0-NOT: ldp d3, d2
+; CHECK-O0-NOT: ldp d5, d4
+; CHECK-O0-NOT: ldp d7, d6
+; CHECK-O0-NOT: ldp d17, d16
+; CHECK-O0-NOT: ldp d19, d18
+; CHECK-O0-NOT: ldp d21, d20
+; CHECK-O0-NOT: ldp d23, d22
+; CHECK-O0-NOT: ldp d25, d24
+; CHECK-O0-NOT: ldp d27, d26
+; CHECK-O0-NOT: ldp d29, d28
+; CHECK-O0-NOT: ldp d31, d30
+; CHECK-O0: ret
+%class.C = type { i32 }
+@tC = internal thread_local global %class.C zeroinitializer, align 4
+declare cxx_fast_tlscc void @tls_helper()
+define cxx_fast_tlscc %class.C* @tls_test2() #1 {
+ call cxx_fast_tlscc void @tls_helper()
+ ret %class.C* @tC
+}
+
+; Make sure we do not allow tail call when caller and callee have different
+; calling conventions.
+declare %class.C* @_ZN1CD1Ev(%class.C* readnone returned %this)
+; CHECK-LABEL: tls_test
+; CHECK: bl __tlv_atexit
+define cxx_fast_tlscc void @__tls_test() {
+entry:
+ store i32 0, i32* getelementptr inbounds (%class.C, %class.C* @tC, i64 0, i32 0), align 4
+ %0 = tail call i32 @_tlv_atexit(void (i8*)* bitcast (%class.C* (%class.C*)* @_ZN1CD1Ev to void (i8*)*), i8* bitcast (%class.C* @tC to i8*), i8* nonnull @__dso_handle) #1
+ ret void
+}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll
index ac2d057ff3c9..20ba3fea8377 100644
--- a/test/CodeGen/AArch64/dag-combine-invaraints.ll
+++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -24,7 +24,7 @@ main_:
ret i32 0
; CHECK: main:
-; CHECK-DAG: movz
+; CHECK-DAG: mov
; CHECK-DAG: orr
; CHECK: csel
}
diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll
index fbea4a6e5838..f89d7603fd3e 100644
--- a/test/CodeGen/AArch64/directcond.ll
+++ b/test/CodeGen/AArch64/directcond.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-NOFP %s
define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
diff --git a/test/CodeGen/AArch64/div_minsize.ll b/test/CodeGen/AArch64/div_minsize.ll
new file mode 100644
index 000000000000..43f12340f19f
--- /dev/null
+++ b/test/CodeGen/AArch64/div_minsize.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+
+define i32 @testsize1(i32 %x) minsize nounwind {
+entry:
+ %div = sdiv i32 %x, 32
+ ret i32 %div
+; CHECK-LABEL: testsize1
+; CHECK: sdiv
+}
+
+define i32 @testsize2(i32 %x) minsize nounwind {
+entry:
+ %div = sdiv i32 %x, 33
+ ret i32 %div
+; CHECK-LABEL: testsize2
+; CHECK: sdiv
+}
+
+define i32 @testsize3(i32 %x) minsize nounwind {
+entry:
+ %div = udiv i32 %x, 32
+ ret i32 %div
+; CHECK-LABEL: testsize3
+; CHECK: lsr
+}
+
+define i32 @testsize4(i32 %x) minsize nounwind {
+entry:
+ %div = udiv i32 %x, 33
+ ret i32 %div
+; CHECK-LABEL: testsize4
+; CHECK: udiv
+}
+
+define <8 x i16> @sdiv_vec8x16_minsize(<8 x i16> %var) minsize {
+entry:
+; CHECK: sdiv_vec8x16_minsize
+; CHECK: sshr v1.8h, v0.8h, #15
+; CHECK: usra v0.8h, v1.8h, #11
+; CHECK: sshr v0.8h, v0.8h, #5
+; CHECK: ret
+ %0 = sdiv <8 x i16> %var, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+ ret <8 x i16> %0
+}
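+
+; For reference, the sshr/usra/sshr sequence checked above is the standard
+; shift expansion of a signed divide by a power of two. A minimal scalar
+; sketch of the same computation in IR (illustrative only; this helper is not
+; referenced by the RUN line or any CHECK):
+define i16 @sdiv_pow2_expansion_sketch(i16 %x) minsize {
+entry:
+  ; all-ones iff %x is negative (corresponds to "sshr ..., #15")
+  %sign = ashr i16 %x, 15
+  ; adds 2^5 - 1 = 31 only for negative inputs (corresponds to "usra ..., #11")
+  %bias = lshr i16 %sign, 11
+  %adj = add i16 %x, %bias
+  ; final arithmetic shift by log2(32) = 5 rounds toward zero (corresponds to "sshr ..., #5")
+  %div = ashr i16 %adj, 5
+  ret i16 %div
+}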
+
diff --git a/test/CodeGen/AArch64/emutls.ll b/test/CodeGen/AArch64/emutls.ll
index ac5762edba98..36b0ae47bd4a 100644
--- a/test/CodeGen/AArch64/emutls.ll
+++ b/test/CodeGen/AArch64/emutls.ll
@@ -1,5 +1,5 @@
; RUN: llc -emulated-tls -mtriple=aarch64-linux-android \
-; RUN: -relocation-model=pic < %s | FileCheck -check-prefix=ARM64 %s
+; RUN: -relocation-model=pic -disable-fp-elim < %s | FileCheck -check-prefix=ARM64 %s
; Copied from X86/emutls.ll
@@ -54,63 +54,160 @@ entry:
ret i32* @i1
}
+define i32 @f5() nounwind {
+; ARM64-LABEL: f5:
+; ARM64: adrp x0, __emutls_v.i3
+; ARM64: add x0, x0, :lo12:__emutls_v.i3
+; ARM64: bl __emutls_get_address
+; ARM64-NEXT: ldr w0, [x0]
+
+entry:
+ %tmp1 = load i32, i32* @i3
+ ret i32 %tmp1
+}
+
+define i32* @f6() {
+; ARM64-LABEL: f6:
+; ARM64: adrp x0, __emutls_v.i3
+; ARM64: add x0, x0, :lo12:__emutls_v.i3
+; ARM64-NEXT: bl __emutls_get_address
+; ARM64-NEXT: ldp x29, x30, [sp]
+
+entry:
+ ret i32* @i3
+}
+
+; Simple test of comdat __thread variables.
+; template <class T> struct A { static __thread T x; };
+; template <class T> T __thread A<T>::x;
+; int getIntX() { return A<int>::x++; }
+; float getFloatX() { return A<float>::x++; }
+
+$_ZN1AIiE1xE = comdat any
+$_ZN1AIfE1xE = comdat any
+@_ZN1AIiE1xE = linkonce_odr thread_local global i32 0, comdat, align 4
+@_ZN1AIfE1xE = linkonce_odr thread_local global float 0.000000e+00, comdat, align 4
+
+define i32 @_Z7getIntXv() {
+; ARM64-LABEL: _Z7getIntXv:
+; ARM64: adrp x0, :got:__emutls_v._ZN1AIiE1xE
+; ARM64: ldr x0, [x0, :got_lo12:__emutls_v._ZN1AIiE1xE]
+; ARM64-NEXT: bl __emutls_get_address
+; ARM64-NEXT: ldr {{.*}}, [x0]
+; ARM64: add
+; ARM64: str {{.*}}, [x0]
+
+entry:
+ %0 = load i32, i32* @_ZN1AIiE1xE, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* @_ZN1AIiE1xE, align 4
+ ret i32 %0
+}
+
+define float @_Z9getFloatXv() {
+; ARM64-LABEL: _Z9getFloatXv:
+; ARM64: adrp x0, :got:__emutls_v._ZN1AIfE1xE
+; ARM64: ldr x0, [x0, :got_lo12:__emutls_v._ZN1AIfE1xE]
+; ARM64-NEXT: bl __emutls_get_address
+; ARM64-NEXT: ldr {{.*}}, [x0]
+; ARM64: fadd s{{.*}}, s
+; ARM64: str s{{.*}}, [x0]
+
+entry:
+ %0 = load float, float* @_ZN1AIfE1xE, align 4
+ %inc = fadd float %0, 1.000000e+00
+ store float %inc, float* @_ZN1AIfE1xE, align 4
+ ret float %0
+}
+
+
;;;;;;;;;;;;;; 64-bit __emutls_v. and __emutls_t.
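; The __emutls_v.<name> control variables checked below are four pointer-sized
; words each; as a rough sketch of what the directives encode:
;   .xword <size of the TLS object in bytes>
;   .xword <alignment in bytes>
;   .xword 0                     ; object handle, filled in lazily by __emutls_get_address
;   .xword __emutls_t.<name>     ; initial-value template, or 0 when zero-initialized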
-; ARM64 .section .data.rel.local,
+; ARM64: .data{{$}}
+; ARM64: .globl __emutls_v.i1
; ARM64-LABEL: __emutls_v.i1:
; ARM64-NEXT: .xword 4
; ARM64-NEXT: .xword 4
; ARM64-NEXT: .xword 0
; ARM64-NEXT: .xword __emutls_t.i1
-; ARM64 .section .rodata,
+; ARM64: .section .rodata,
; ARM64-LABEL: __emutls_t.i1:
; ARM64-NEXT: .word 15
; ARM64-NOT: __emutls_v.i2
-; ARM64 .section .data.rel.local,
+; ARM64: .data{{$}}
+; ARM64-NOT: .globl
; ARM64-LABEL: __emutls_v.i3:
; ARM64-NEXT: .xword 4
; ARM64-NEXT: .xword 4
; ARM64-NEXT: .xword 0
; ARM64-NEXT: .xword __emutls_t.i3
-; ARM64 .section .rodata,
+; ARM64: .section .rodata,
; ARM64-LABEL: __emutls_t.i3:
; ARM64-NEXT: .word 15
-; ARM64 .section .data.rel.local,
+; ARM64: .hidden __emutls_v.i4
+; ARM64: .data{{$}}
+; ARM64: .globl __emutls_v.i4
; ARM64-LABEL: __emutls_v.i4:
; ARM64-NEXT: .xword 4
; ARM64-NEXT: .xword 4
; ARM64-NEXT: .xword 0
; ARM64-NEXT: .xword __emutls_t.i4
-; ARM64 .section .rodata,
+; ARM64: .section .rodata,
; ARM64-LABEL: __emutls_t.i4:
; ARM64-NEXT: .word 15
; ARM64-NOT: __emutls_v.i5:
-; ARM64 .hidden __emutls_v.i5
+; ARM64: .hidden __emutls_v.i5
; ARM64-NOT: __emutls_v.i5:
-; ARM64 .section .data.rel.local,
+; ARM64: .data{{$}}
+; ARM64: .globl __emutls_v.s1
; ARM64-LABEL: __emutls_v.s1:
; ARM64-NEXT: .xword 2
; ARM64-NEXT: .xword 2
; ARM64-NEXT: .xword 0
; ARM64-NEXT: .xword __emutls_t.s1
-; ARM64 .section .rodata,
+; ARM64: .section .rodata,
; ARM64-LABEL: __emutls_t.s1:
; ARM64-NEXT: .hword 15
-; ARM64 .section .data.rel.local,
+; ARM64: .data{{$}}
; ARM64-LABEL: __emutls_v.b1:
; ARM64-NEXT: .xword 1
; ARM64-NEXT: .xword 1
; ARM64-NEXT: .xword 0
; ARM64-NEXT: .xword 0
-; ARM64-NOT: __emutls_t.b1
+; ARM64-NOT: __emutls_t.b1
+
+; ARM64: .section .data.__emutls_v._ZN1AIiE1xE,{{.*}},__emutls_v._ZN1AIiE1xE,comdat
+; ARM64: .weak __emutls_v._ZN1AIiE1xE
+; ARM64: .p2align 3
+; ARM64-LABEL: __emutls_v._ZN1AIiE1xE:
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 0
+; ARM64-NEXT: .xword 0
+
+; ARM64: .section .data.__emutls_v._ZN1AIfE1xE,{{.*}},__emutls_v._ZN1AIfE1xE,comdat
+; ARM64: .weak __emutls_v._ZN1AIfE1xE
+; ARM64: .p2align 3
+; ARM64-LABEL: __emutls_v._ZN1AIfE1xE:
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 0
+; ARM64-NEXT: .xword __emutls_t._ZN1AIfE1xE
+
+; ARM64: .section .rodata.__emutls_t._ZN1AIfE1xE,{{.*}},__emutls_t._ZN1AIfE1xE,comdat
+; ARM64: .weak __emutls_t._ZN1AIfE1xE
+; ARM64: .p2align 2
+; ARM64-LABEL: __emutls_t._ZN1AIfE1xE:
+; ARM64-NEXT: .word 0
+; ARM64-NEXT: .size
diff --git a/test/CodeGen/AArch64/emutls_generic.ll b/test/CodeGen/AArch64/emutls_generic.ll
index 7664db3df8d2..03473cf80ee4 100644
--- a/test/CodeGen/AArch64/emutls_generic.ll
+++ b/test/CodeGen/AArch64/emutls_generic.ll
@@ -37,7 +37,9 @@ entry:
; ARM_64: __emutls_get_address
; ARM_64-NOT: __emutls_t.external_x
; ARM_64-NOT: __emutls_v.external_x:
-; ARM_64: .align 3
+; ARM_64: .data{{$}}
+; ARM_64: .globl __emutls_v.external_y
+; ARM_64: .p2align 3
; ARM_64-LABEL: __emutls_v.external_y:
; ARM_64-NEXT: .xword 1
; ARM_64-NEXT: .xword 2
@@ -47,8 +49,9 @@ entry:
; ARM_64: .section .rodata,
; ARM_64-LABEL: __emutls_t.external_y:
; ARM_64-NEXT: .byte 7
-; ARM_64: .data
-; ARM_64: .align 3
+; ARM_64: .data{{$}}
+; ARM_64-NOT: .globl __emutls_v
+; ARM_64: .p2align 3
; ARM_64-LABEL: __emutls_v.internal_y:
; ARM_64-NEXT: .xword 8
; ARM_64-NEXT: .xword 16
diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll
index 2203c0c4e698..921009cf821d 100644
--- a/test/CodeGen/AArch64/extern-weak.ll
+++ b/test/CodeGen/AArch64/extern-weak.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK-STATIC %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s
declare extern_weak i32 @var()
@@ -14,11 +14,6 @@ define i32()* @foo() {
; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:var
; CHECK: ldr x0, [x[[ADDRHI]], :got_lo12:var]
-; CHECK-STATIC: .LCPI0_0:
-; CHECK-STATIC-NEXT: .xword var
-; CHECK-STATIC: adrp x[[VAR:[0-9]+]], .LCPI0_0
-; CHECK-STATIC: ldr x0, [x[[VAR]], :lo12:.LCPI0_0]
-
; In the large model, the usual relocations are absolute and can
; materialise 0.
; CHECK-LARGE: movz x0, #:abs_g3:var
@@ -38,11 +33,6 @@ define i32* @bar() {
; CHECK: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var]
; CHECK: add x0, [[BASE]], #20
-; CHECK-STATIC: .LCPI1_0:
-; CHECK-STATIC-NEXT: .xword arr_var
-; CHECK-STATIC: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, :lo12:.LCPI1_0]
-; CHECK-STATIC: add x0, [[BASE]], #20
-
ret i32* %addr
; In the large model, the usual relocations are absolute and can
@@ -61,9 +51,6 @@ define i32* @wibble() {
; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
-; CHECK-STATIC: adrp [[BASE:x[0-9]+]], defined_weak_var
-; CHECK-STATIC: add x0, [[BASE]], :lo12:defined_weak_var
-
; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll
index e8ecb13b3564..f50504a9a260 100644
--- a/test/CodeGen/AArch64/f16-instructions.ll
+++ b/test/CodeGen/AArch64/f16-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra -disable-fp-elim | FileCheck %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -446,6 +446,34 @@ define half @test_sitofp_i64(i64 %a) #0 {
ret half %r
}
+; CHECK-LABEL: test_uitofp_i32_fadd:
+; CHECK-NEXT: ucvtf s1, w0
+; CHECK-NEXT: fcvt h1, s1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = uitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32_fadd:
+; CHECK-NEXT: scvtf s1, w0
+; CHECK-NEXT: fcvt h1, s1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = sitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
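+
+; Note on the sequences checked above (this target has no half-precision
+; arithmetic without the fullfp16 extension): the i32 is converted in single
+; precision and rounded to half for %c, then both half operands are promoted
+; back to single (fcvt h->s), added as singles, and the sum truncated back to
+; half (fcvt s->h).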
+
; CHECK-LABEL: test_fptrunc_float:
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
@@ -695,7 +723,7 @@ define half @test_maxnum(half %a, half %b) #0 {
; CHECK-LABEL: test_copysign:
; CHECK-NEXT: fcvt s1, h1
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.4s v2, #128, lsl #24
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
@@ -706,7 +734,7 @@ define half @test_copysign(half %a, half %b) #0 {
; CHECK-LABEL: test_copysign_f32:
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.4s v2, #128, lsl #24
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
@@ -719,7 +747,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
; CHECK-LABEL: test_copysign_f64:
; CHECK-NEXT: fcvt s1, d1
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.4s v2, #128, lsl #24
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
@@ -735,7 +763,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
; CHECK-LABEL: test_copysign_extended:
; CHECK-NEXT: fcvt s1, h1
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.4s v2, #128, lsl #24
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: ret
define float @test_copysign_extended(half %a, half %b) #0 {
diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll
index e04a62b85c8e..2240296c89ff 100644
--- a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll
+++ b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll
@@ -19,8 +19,8 @@ bb4:
}
; CHECK-LABEL: test_and
-; CHECK: cbz w0, {{LBB[0-9]+_2}}
-; CHECK: cbnz w1, {{LBB[0-9]+_3}}
+; CHECK: cbnz w0, {{LBB[0-9]+_2}}
+; CHECK: cbz w1, {{LBB[0-9]+_1}}
define i64 @test_and(i32 %a, i32 %b) {
bb1:
%0 = icmp ne i32 %a, 0
diff --git a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
index 2855419a1ca0..2a0139ed9b08 100644
--- a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
+++ b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
@@ -11,7 +11,7 @@ define <2 x i32> @icmp_v2i32(<2 x i32> %a) {
; CHECK: ; BB#0:
; CHECK-NEXT: cmeq.2s [[CMP:v[0-9]+]], v0, #0
; CHECK-NEXT: ; BB#1:
-; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #1
; CHECK-NEXT: and.8b v0, [[CMP]], [[MASK]]
; CHECK-NEXT: ret
%c = icmp eq <2 x i32> %a, zeroinitializer
@@ -26,7 +26,7 @@ define <2 x i32> @icmp_constfold_v2i32(<2 x i32> %a) {
; CHECK: ; BB#0:
; CHECK-NEXT: movi d[[CMP:[0-9]+]], #0xffffffffffffffff
; CHECK-NEXT: ; BB#1:
-; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #1
; CHECK-NEXT: and.8b v0, v[[CMP]], [[MASK]]
; CHECK-NEXT: ret
%1 = icmp eq <2 x i32> %a, %a
@@ -42,7 +42,7 @@ define <4 x i32> @icmp_v4i32(<4 x i32> %a) {
; CHECK-NEXT: cmeq.4s [[CMP:v[0-9]+]], v0, #0
; CHECK-NEXT: xtn.4h [[CMPV4I16:v[0-9]+]], [[CMP]]
; CHECK-NEXT: ; BB#1:
-; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #1
; CHECK-NEXT: and.8b [[ZEXT:v[0-9]+]], [[CMPV4I16]], [[MASK]]
; CHECK-NEXT: ushll.4s v0, [[ZEXT]], #0
; CHECK-NEXT: ret
@@ -58,7 +58,7 @@ define <4 x i32> @icmp_constfold_v4i32(<4 x i32> %a) {
; CHECK: ; BB#0:
; CHECK-NEXT: movi d[[CMP:[0-9]+]], #0xffffffffffffffff
; CHECK-NEXT: ; BB#1:
-; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #1
; CHECK-NEXT: and.8b [[ZEXT:v[0-9]+]], v[[CMP]], [[MASK]]
; CHECK-NEXT: ushll.4s v0, [[ZEXT]], #0
; CHECK-NEXT: ret
@@ -74,7 +74,7 @@ define <16 x i8> @icmp_v16i8(<16 x i8> %a) {
; CHECK: ; BB#0:
; CHECK-NEXT: cmeq.16b [[CMP:v[0-9]+]], v0, #0
; CHECK-NEXT: ; BB#1:
-; CHECK-NEXT: movi.16b [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT: movi.16b [[MASK:v[0-9]+]], #1
; CHECK-NEXT: and.16b v0, [[CMP]], [[MASK]]
; CHECK-NEXT: ret
%c = icmp eq <16 x i8> %a, zeroinitializer
@@ -89,7 +89,7 @@ define <16 x i8> @icmp_constfold_v16i8(<16 x i8> %a) {
; CHECK: ; BB#0:
; CHECK-NEXT: movi.2d [[CMP:v[0-9]+]], #0xffffffffffffffff
; CHECK-NEXT: ; BB#1:
-; CHECK-NEXT: movi.16b [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT: movi.16b [[MASK:v[0-9]+]], #1
; CHECK-NEXT: and.16b v0, [[CMP]], [[MASK]]
; CHECK-NEXT: ret
%1 = icmp eq <16 x i8> %a, %a
diff --git a/test/CodeGen/AArch64/fast-isel-gep.ll b/test/CodeGen/AArch64/fast-isel-gep.ll
index 33adcdc3c464..0cb1fd8465d4 100644
--- a/test/CodeGen/AArch64/fast-isel-gep.ll
+++ b/test/CodeGen/AArch64/fast-isel-gep.ll
@@ -33,7 +33,7 @@ define i32* @test_array3(i32* %a) {
define i32* @test_array4(i32* %a) {
; CHECK-LABEL: test_array4
-; CHECK: movz [[REG:x[0-9]+]], #0x1008
+; CHECK: mov [[REG:x[0-9]+]], #4104
; CHECK-NEXR: add x0, x0, [[REG]]
%1 = getelementptr inbounds i32, i32* %a, i64 1026
ret i32* %1
diff --git a/test/CodeGen/AArch64/fast-isel-tbz.ll b/test/CodeGen/AArch64/fast-isel-tbz.ll
index 598826763787..c35ae4230dd4 100644
--- a/test/CodeGen/AArch64/fast-isel-tbz.ll
+++ b/test/CodeGen/AArch64/fast-isel-tbz.ll
@@ -1,5 +1,5 @@
-; RUN: llc -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -fast-isel -fast-isel-abort=1 -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck --check-prefix=CHECK --check-prefix=FAST %s
+; RUN: llc -disable-peephole -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+; RUN: llc -disable-peephole -fast-isel -fast-isel-abort=1 -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck --check-prefix=CHECK --check-prefix=FAST %s
define i32 @icmp_eq_i8(i8 zeroext %a) {
; CHECK-LABEL: icmp_eq_i8
diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll
index f021eb232618..fcc852263b48 100644
--- a/test/CodeGen/AArch64/fastcc.ll
+++ b/test/CodeGen/AArch64/fastcc.ll
@@ -1,18 +1,21 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-fp-elim -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-fp-elim | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-fp-elim -tailcallopt -aarch64-redzone | FileCheck %s -check-prefix CHECK-TAIL-RZ
; Without tailcallopt fastcc still means the caller cleans up the
; stack, so try to make sure this is respected.
define fastcc void @func_stack0() {
; CHECK-LABEL: func_stack0:
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
; CHECK-TAIL-LABEL: func_stack0:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL-NEXT: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL-NEXT: stp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -41,27 +44,29 @@ define fastcc void @func_stack0() {
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
-; CHECK-TAIL: mov sp, x29
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #48
; CHECK-TAIL-NEXT: ret
}
define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-LABEL: func_stack8:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK: stp x29, x30, [sp, #32]
+; CHECK: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
; CHECK-TAIL-LABEL: func_stack8:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #32]
+; CHECK-TAIL: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -90,22 +95,22 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ldp x29, x30, [sp, #32]
+; CHECK: add sp, sp, #48
; CHECK-NEXT: ret
-; CHECK-TAIL: mov sp, x29
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #64
; CHECK-TAIL-NEXT: ret
}
define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK-LABEL: func_stack32:
-; CHECK: mov x29, sp
+; CHECK: add x29, sp, #32
; CHECK-TAIL-LABEL: func_stack32:
-; CHECK-TAIL: mov x29, sp
+; CHECK-TAIL: add x29, sp, #32
call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -134,11 +139,99 @@ define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
-; CHECK-TAIL: mov sp, x29
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #80
; CHECK-TAIL-NEXT: ret
}
+
+; Check that arg stack pop is done after callee-save restore when no frame pointer is used.
+define fastcc void @func_stack32_leaf([8 x i32], i128 %stacked0, i128 %stacked1) {
+; CHECK-LABEL: func_stack32_leaf:
+; CHECK: str x20, [sp, #-16]!
+; CHECK: nop
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr x20, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-TAIL-LABEL: func_stack32_leaf:
+; CHECK-TAIL: str x20, [sp, #-16]!
+; CHECK-TAIL: nop
+; CHECK-TAIL-NEXT: //NO_APP
+; CHECK-TAIL-NEXT: ldr x20, [sp], #16
+; CHECK-TAIL-NEXT: add sp, sp, #32
+; CHECK-TAIL-NEXT: ret
+
+; CHECK-TAIL-RZ-LABEL: func_stack32_leaf:
+; CHECK-TAIL-RZ: str x20, [sp, #-16]!
+; CHECK-TAIL-RZ-NOT: sub sp, sp
+; CHECK-TAIL-RZ: nop
+; CHECK-TAIL-RZ-NEXT: //NO_APP
+; CHECK-TAIL-RZ-NEXT: ldr x20, [sp], #16
+; CHECK-TAIL-RZ-NEXT: add sp, sp, #32
+; CHECK-TAIL-RZ-NEXT: ret
+
+ ; Make sure there is a callee-save register to save/restore.
+ call void asm sideeffect "nop", "~{x20}"() nounwind
+ ret void
+}
+
+; Check that arg stack pop is done after callee-save restore when no frame pointer is used.
+define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %stacked1) {
+; CHECK-LABEL: func_stack32_leaf_local:
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: str x20, [sp, #16]
+; CHECK: nop
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr x20, [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+
+; CHECK-TAIL-LABEL: func_stack32_leaf_local:
+; CHECK-TAIL: sub sp, sp, #32
+; CHECK-TAIL-NEXT: str x20, [sp, #16]
+; CHECK-TAIL: nop
+; CHECK-TAIL-NEXT: //NO_APP
+; CHECK-TAIL-NEXT: ldr x20, [sp, #16]
+; CHECK-TAIL-NEXT: add sp, sp, #64
+; CHECK-TAIL-NEXT: ret
+
+; CHECK-TAIL-RZ-LABEL: func_stack32_leaf_local:
+; CHECK-TAIL-RZ: str x20, [sp, #-16]!
+; CHECK-TAIL-RZ-NOT: sub sp, sp
+; CHECK-TAIL-RZ: nop
+; CHECK-TAIL-RZ-NEXT: //NO_APP
+; CHECK-TAIL-RZ-NEXT: ldr x20, [sp], #16
+; CHECK-TAIL-RZ-NEXT: add sp, sp, #32
+; CHECK-TAIL-RZ-NEXT: ret
+
+ %val0 = alloca [2 x i64], align 8
+
+ ; Make sure there is a callee-save register to save/restore.
+ call void asm sideeffect "nop", "~{x20}"() nounwind
+ ret void
+}
+
+; Check that arg stack pop is done after callee-save restore when no frame pointer is used.
+define fastcc void @func_stack32_leaf_local_nocs([8 x i32], i128 %stacked0, i128 %stacked1) {
+; CHECK-LABEL: func_stack32_leaf_local_nocs:
+; CHECK: sub sp, sp, #16
+; CHECK: add sp, sp, #16
+; CHECK-NEXT: ret
+
+; CHECK-TAIL-LABEL: func_stack32_leaf_local_nocs:
+; CHECK-TAIL: sub sp, sp, #16
+; CHECK-TAIL: add sp, sp, #48
+; CHECK-TAIL-NEXT: ret
+
+; CHECK-TAIL-RZ-LABEL: func_stack32_leaf_local_nocs:
+; CHECK-TAIL-RZ: add sp, sp, #32
+; CHECK-TAIL-RZ-NEXT: ret
+
+ %val0 = alloca [2 x i64], align 8
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll
index d549c7e78421..e52b601b1454 100644
--- a/test/CodeGen/AArch64/fcvt-int.ll
+++ b/test/CodeGen/AArch64/fcvt-int.ll
@@ -149,3 +149,28 @@ define double @test_bitcasti64todouble(i64 %in) {
ret double %res
}
+
+define double @bitcast_fabs(double %x) {
+; CHECK-LABEL: bitcast_fabs:
+; CHECK: ; BB#0:
+; CHECK-NEXT: fabs d0, d0
+; CHECK-NEXT: ret
+;
+ %bc1 = bitcast double %x to i64
+ %and = and i64 %bc1, 9223372036854775807
+ %bc2 = bitcast i64 %and to double
+ ret double %bc2
+}
+
+define float @bitcast_fneg(float %x) {
+; CHECK-LABEL: bitcast_fneg:
+; CHECK: ; BB#0:
+; CHECK-NEXT: fneg s0, s0
+; CHECK-NEXT: ret
+;
+ %bc1 = bitcast float %x to i32
+ %xor = xor i32 %bc1, 2147483648
+ %bc2 = bitcast i32 %xor to float
+ ret float %bc2
+}
+
diff --git a/test/CodeGen/AArch64/fcvt_combine.ll b/test/CodeGen/AArch64/fcvt_combine.ll
index 093ce4a4cd85..5644fa28533b 100644
--- a/test/CodeGen/AArch64/fcvt_combine.ll
+++ b/test/CodeGen/AArch64/fcvt_combine.ll
@@ -152,3 +152,11 @@ define <2 x i32> @test14(<2 x float> %f) {
%vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
ret <2 x i32> %vcvt.i
}
+
+; CHECK-LABEL: test_illegal_fp_to_int:
+; CHECK: fcvtzs.4s v0, v0, #2
+define <3 x i32> @test_illegal_fp_to_int(<3 x float> %in) {
+ %scale = fmul <3 x float> %in, <float 4.0, float 4.0, float 4.0>
+ %val = fptosi <3 x float> %scale to <3 x i32>
+ ret <3 x i32> %val
+}
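+
+; Note (illustrative): fcvtzs with an immediate is a fixed-point convert that
+; computes trunc(x * 2^fbits), so the multiply by 4.0 is folded into the
+; convert as "#2" fractional bits; the illegal <3 x float> operand is widened
+; to the .4s form first.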
diff --git a/test/CodeGen/AArch64/fdiv-combine.ll b/test/CodeGen/AArch64/fdiv-combine.ll
index 389eefd97b28..9ec64a854ca3 100644
--- a/test/CodeGen/AArch64/fdiv-combine.ll
+++ b/test/CodeGen/AArch64/fdiv-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=aarch64 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
; Following test cases check:
; a / D; b / D; c / D;
@@ -6,8 +6,8 @@
; recip = 1.0 / D; a * recip; b * recip; c * recip;
define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
; CHECK-LABEL: three_fdiv_float:
-; CHECK: fdiv
-; CHECK-NEXT-NOT: fdiv
+; CHECK: fdiv s
+; CHECK-NOT: fdiv
; CHECK: fmul
; CHECK: fmul
; CHECK: fmul
@@ -20,8 +20,8 @@ define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
; CHECK-LABEL: three_fdiv_double:
-; CHECK: fdiv
-; CHECK-NEXT-NOT: fdiv
+; CHECK: fdiv d
+; CHECK-NOT: fdiv
; CHECK: fmul
; CHECK: fmul
; CHECK: fmul
@@ -34,8 +34,8 @@ define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
; CHECK-LABEL: three_fdiv_4xfloat:
-; CHECK: fdiv
-; CHECK-NEXT-NOT: fdiv
+; CHECK: fdiv v
+; CHECK-NOT: fdiv
; CHECK: fmul
; CHECK: fmul
; CHECK: fmul
@@ -48,8 +48,8 @@ define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b,
define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
; CHECK-LABEL: three_fdiv_2xdouble:
-; CHECK: fdiv
-; CHECK-NEXT-NOT: fdiv
+; CHECK: fdiv v
+; CHECK-NOT: fdiv
; CHECK: fmul
; CHECK: fmul
; CHECK: fmul
@@ -64,9 +64,9 @@ define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double>
; calculates a reciprocal.
define void @two_fdiv_float(float %D, float %a, float %b) #0 {
; CHECK-LABEL: two_fdiv_float:
-; CHECK: fdiv
-; CHECK: fdiv
-; CHECK-NEXT-NOT: fmul
+; CHECK: fdiv s
+; CHECK: fdiv s
+; CHECK-NOT: fmul
%div = fdiv float %a, %D
%div1 = fdiv float %b, %D
tail call void @foo_2f(float %div, float %div1)
@@ -75,9 +75,9 @@ define void @two_fdiv_float(float %D, float %a, float %b) #0 {
define void @two_fdiv_double(double %D, double %a, double %b) #0 {
; CHECK-LABEL: two_fdiv_double:
-; CHECK: fdiv
-; CHECK: fdiv
-; CHECK-NEXT-NOT: fmul
+; CHECK: fdiv d
+; CHECK: fdiv d
+; CHECK-NOT: fmul
%div = fdiv double %a, %D
%div1 = fdiv double %b, %D
tail call void @foo_2d(double %div, double %div1)
diff --git a/test/CodeGen/AArch64/fdiv_combine.ll b/test/CodeGen/AArch64/fdiv_combine.ll
index 6f38a267ec3f..8ebee3c68287 100644
--- a/test/CodeGen/AArch64/fdiv_combine.ll
+++ b/test/CodeGen/AArch64/fdiv_combine.ll
@@ -38,7 +38,7 @@ entry:
; Test which should not fold due to power of 2 out of range.
; CHECK-LABEL: @test4
; CHECK: scvtf.2s v0, v0
-; CHECK: movi.2s v1, #0x50, lsl #24
+; CHECK: movi.2s v1, #80, lsl #24
; CHECK: fdiv.2s v0, v0, v1
; CHECK: ret
define <2 x float> @test4(<2 x i32> %in) {
@@ -96,7 +96,7 @@ define <4 x float> @test8(<4 x i16> %in) {
; CHECK-LABEL: @test9
; CHECK: ucvtf.2d v0, v0
; CHECK: fcvtn v0.2s, v0.2d
-; CHECK: movi.2s v1, #0x40, lsl #24
+; CHECK: movi.2s v1, #64, lsl #24
; CHECK: fdiv.2s v0, v0, v1
; CHECK: ret
define <2 x float> @test9(<2 x i64> %in) {
diff --git a/test/CodeGen/AArch64/fp-cond-sel.ll b/test/CodeGen/AArch64/fp-cond-sel.ll
index b4f4d77cd0bc..4d9cb21ddc3d 100644
--- a/test/CodeGen/AArch64/fp-cond-sel.ll
+++ b/test/CodeGen/AArch64/fp-cond-sel.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s
@varfloat = global float 0.0
@vardouble = global double 0.0
@@ -12,8 +12,8 @@ define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
%tst1 = icmp ugt i32 %lhs32, %rhs32
%val1 = select i1 %tst1, float 0.0, float 1.0
store float %val1, float* @varfloat
-; CHECK: movi v[[FLT0:[0-9]+]].2d, #0
-; CHECK: fmov s[[FLT1:[0-9]+]], #1.0
+; CHECK-DAG: movi v[[FLT0:[0-9]+]].2d, #0
+; CHECK-DAG: fmov s[[FLT1:[0-9]+]], #1.0
; CHECK: fcsel {{s[0-9]+}}, s[[FLT0]], s[[FLT1]], hi
%rhs64 = sext i32 %rhs32 to i64
diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll
index b892f1902b03..b39ff08db39a 100644
--- a/test/CodeGen/AArch64/fp16-v4-instructions.ll
+++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll
@@ -15,7 +15,7 @@ entry:
define <4 x half> @build_h4(<4 x half> %a) {
entry:
; CHECK-LABEL: build_h4:
-; CHECK: movz [[GPR:w[0-9]+]], #0x3ccd
+; CHECK: mov [[GPR:w[0-9]+]], #15565
; CHECK: dup v0.4h, [[GPR]]
ret <4 x half> <half 0xH3CCD, half 0xH3CCD, half 0xH3CCD, half 0xH3CCD>
}
@@ -176,7 +176,7 @@ define <4 x half> @sitofp_i64(<4 x i64> %a) #0 {
define <4 x half> @uitofp_i8(<4 x i8> %a) #0 {
; CHECK-LABEL: uitofp_i8:
-; CHECK-NEXT: bic v0.4h, #0xff, lsl #8
+; CHECK-NEXT: bic v0.4h, #255, lsl #8
; CHECK-NEXT: ushll [[OP1:v[0-9]+\.4s]], v0.4h, #0
; CHECK-NEXT: ucvtf [[OP2:v[0-9]+\.4s]], [[OP1]]
; CHECK-NEXT: fcvtn v0.4h, [[OP2]]
@@ -277,10 +277,10 @@ define <4 x i16> @fptoui_i16(<4 x half> %a) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, ne
-; CHECK-DAG: csel {{.*}}, wzr, ne
-; CHECK-DAG: csel {{.*}}, wzr, ne
-; CHECK-DAG: csel {{.*}}, wzr, ne
+; CHECK-DAG: csetm {{.*}}, ne
+; CHECK-DAG: csetm {{.*}}, ne
+; CHECK-DAG: csetm {{.*}}, ne
+; CHECK-DAG: csetm {{.*}}, ne
define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp une <4 x half> %a, %b
ret <4 x i1> %1
@@ -296,14 +296,14 @@ define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, eq
-; CHECK-DAG: csel {{.*}}, wzr, eq
-; CHECK-DAG: csel {{.*}}, wzr, eq
-; CHECK-DAG: csel {{.*}}, wzr, eq
-; CHECK-DAG: csel {{.*}}, vs
-; CHECK-DAG: csel {{.*}}, vs
-; CHECK-DAG: csel {{.*}}, vs
-; CHECK-DAG: csel {{.*}}, vs
+; CHECK-DAG: csetm [[REG1:w[0-9]+]], eq
+; CHECK-DAG: csetm [[REG2:w[0-9]+]], eq
+; CHECK-DAG: csetm [[REG3:w[0-9]+]], eq
+; CHECK-DAG: csetm [[REG4:w[0-9]+]], eq
+; CHECK-DAG: csinv {{.*}}, [[REG1]], wzr, vc
+; CHECK-DAG: csinv {{.*}}, [[REG2]], wzr, vc
+; CHECK-DAG: csinv {{.*}}, [[REG3]], wzr, vc
+; CHECK-DAG: csinv {{.*}}, [[REG4]], wzr, vc
define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ueq <4 x half> %a, %b
ret <4 x i1> %1
@@ -319,10 +319,10 @@ define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, hi
-; CHECK-DAG: csel {{.*}}, wzr, hi
-; CHECK-DAG: csel {{.*}}, wzr, hi
-; CHECK-DAG: csel {{.*}}, wzr, hi
+; CHECK-DAG: csetm {{.*}}, hi
+; CHECK-DAG: csetm {{.*}}, hi
+; CHECK-DAG: csetm {{.*}}, hi
+; CHECK-DAG: csetm {{.*}}, hi
define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ugt <4 x half> %a, %b
ret <4 x i1> %1
@@ -338,10 +338,10 @@ define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, pl
-; CHECK-DAG: csel {{.*}}, wzr, pl
-; CHECK-DAG: csel {{.*}}, wzr, pl
-; CHECK-DAG: csel {{.*}}, wzr, pl
+; CHECK-DAG: csetm {{.*}}, pl
+; CHECK-DAG: csetm {{.*}}, pl
+; CHECK-DAG: csetm {{.*}}, pl
+; CHECK-DAG: csetm {{.*}}, pl
define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp uge <4 x half> %a, %b
ret <4 x i1> %1
@@ -357,10 +357,10 @@ define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, lt
-; CHECK-DAG: csel {{.*}}, wzr, lt
-; CHECK-DAG: csel {{.*}}, wzr, lt
-; CHECK-DAG: csel {{.*}}, wzr, lt
+; CHECK-DAG: csetm {{.*}}, lt
+; CHECK-DAG: csetm {{.*}}, lt
+; CHECK-DAG: csetm {{.*}}, lt
+; CHECK-DAG: csetm {{.*}}, lt
define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ult <4 x half> %a, %b
ret <4 x i1> %1
@@ -376,10 +376,10 @@ define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, le
-; CHECK-DAG: csel {{.*}}, wzr, le
-; CHECK-DAG: csel {{.*}}, wzr, le
-; CHECK-DAG: csel {{.*}}, wzr, le
+; CHECK-DAG: csetm {{.*}}, le
+; CHECK-DAG: csetm {{.*}}, le
+; CHECK-DAG: csetm {{.*}}, le
+; CHECK-DAG: csetm {{.*}}, le
define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ule <4 x half> %a, %b
ret <4 x i1> %1
@@ -395,10 +395,10 @@ define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, vs
-; CHECK-DAG: csel {{.*}}, wzr, vs
-; CHECK-DAG: csel {{.*}}, wzr, vs
-; CHECK-DAG: csel {{.*}}, wzr, vs
+; CHECK-DAG: csetm {{.*}}, vs
+; CHECK-DAG: csetm {{.*}}, vs
+; CHECK-DAG: csetm {{.*}}, vs
+; CHECK-DAG: csetm {{.*}}, vs
define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp uno <4 x half> %a, %b
ret <4 x i1> %1
@@ -414,14 +414,15 @@ define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, mi
-; CHECK-DAG: csel {{.*}}, wzr, mi
-; CHECK-DAG: csel {{.*}}, wzr, mi
-; CHECK-DAG: csel {{.*}}, wzr, mi
-; CHECK-DAG: csel {{.*}}, gt
-; CHECK-DAG: csel {{.*}}, gt
-; CHECK-DAG: csel {{.*}}, gt
-; CHECK-DAG: csel {{.*}}, gt
+; CHECK-DAG: csetm [[REG1:w[0-9]+]], mi
+; CHECK-DAG: csetm [[REG2:w[0-9]+]], mi
+; CHECK-DAG: csetm [[REG3:w[0-9]+]], mi
+; CHECK-DAG: csetm [[REG4:w[0-9]+]], mi
+; CHECK-DAG: csinv {{.*}}, [[REG1]], wzr, le
+; CHECK-DAG: csinv {{.*}}, [[REG2]], wzr, le
+; CHECK-DAG: csinv {{.*}}, [[REG3]], wzr, le
+; CHECK-DAG: csinv {{.*}}, [[REG4]], wzr, le
+
define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp one <4 x half> %a, %b
ret <4 x i1> %1
@@ -437,10 +438,10 @@ define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, eq
-; CHECK-DAG: csel {{.*}}, wzr, eq
-; CHECK-DAG: csel {{.*}}, wzr, eq
-; CHECK-DAG: csel {{.*}}, wzr, eq
+; CHECK-DAG: csetm {{.*}}, eq
+; CHECK-DAG: csetm {{.*}}, eq
+; CHECK-DAG: csetm {{.*}}, eq
+; CHECK-DAG: csetm {{.*}}, eq
define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp oeq <4 x half> %a, %b
ret <4 x i1> %1
@@ -456,10 +457,10 @@ define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, gt
-; CHECK-DAG: csel {{.*}}, wzr, gt
-; CHECK-DAG: csel {{.*}}, wzr, gt
-; CHECK-DAG: csel {{.*}}, wzr, gt
+; CHECK-DAG: csetm {{.*}}, gt
+; CHECK-DAG: csetm {{.*}}, gt
+; CHECK-DAG: csetm {{.*}}, gt
+; CHECK-DAG: csetm {{.*}}, gt
define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ogt <4 x half> %a, %b
ret <4 x i1> %1
@@ -475,10 +476,10 @@ define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, ge
-; CHECK-DAG: csel {{.*}}, wzr, ge
-; CHECK-DAG: csel {{.*}}, wzr, ge
-; CHECK-DAG: csel {{.*}}, wzr, ge
+; CHECK-DAG: csetm {{.*}}, ge
+; CHECK-DAG: csetm {{.*}}, ge
+; CHECK-DAG: csetm {{.*}}, ge
+; CHECK-DAG: csetm {{.*}}, ge
define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp oge <4 x half> %a, %b
ret <4 x i1> %1
@@ -494,10 +495,10 @@ define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, mi
-; CHECK-DAG: csel {{.*}}, wzr, mi
-; CHECK-DAG: csel {{.*}}, wzr, mi
-; CHECK-DAG: csel {{.*}}, wzr, mi
+; CHECK-DAG: csetm {{.*}}, mi
+; CHECK-DAG: csetm {{.*}}, mi
+; CHECK-DAG: csetm {{.*}}, mi
+; CHECK-DAG: csetm {{.*}}, mi
define <4 x i1> @test_fcmp_olt(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp olt <4 x half> %a, %b
ret <4 x i1> %1
@@ -513,10 +514,10 @@ define <4 x i1> @test_fcmp_olt(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, ls
-; CHECK-DAG: csel {{.*}}, wzr, ls
-; CHECK-DAG: csel {{.*}}, wzr, ls
-; CHECK-DAG: csel {{.*}}, wzr, ls
+; CHECK-DAG: csetm {{.*}}, ls
+; CHECK-DAG: csetm {{.*}}, ls
+; CHECK-DAG: csetm {{.*}}, ls
+; CHECK-DAG: csetm {{.*}}, ls
define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ole <4 x half> %a, %b
ret <4 x i1> %1
@@ -532,10 +533,10 @@ define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
-; CHECK-DAG: csel {{.*}}, wzr, vc
-; CHECK-DAG: csel {{.*}}, wzr, vc
-; CHECK-DAG: csel {{.*}}, wzr, vc
-; CHECK-DAG: csel {{.*}}, wzr, vc
+; CHECK-DAG: csetm {{.*}}, vc
+; CHECK-DAG: csetm {{.*}}, vc
+; CHECK-DAG: csetm {{.*}}, vc
+; CHECK-DAG: csetm {{.*}}, vc
define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ord <4 x half> %a, %b
ret <4 x i1> %1
diff --git a/test/CodeGen/AArch64/fp16-vector-nvcast.ll b/test/CodeGen/AArch64/fp16-vector-nvcast.ll
index 83e0df74c3da..018c88c5f3eb 100644
--- a/test/CodeGen/AArch64/fp16-vector-nvcast.ll
+++ b/test/CodeGen/AArch64/fp16-vector-nvcast.ll
@@ -3,7 +3,7 @@
; Test pattern (v4f16 (AArch64NvCast (v2i32 FPR64:$src)))
define void @nvcast_v2i32(<4 x half>* %a) #0 {
; CHECK-LABEL: nvcast_v2i32:
-; CHECK-NEXT: movi v[[REG:[0-9]+]].2s, #0xab, lsl #16
+; CHECK-NEXT: movi v[[REG:[0-9]+]].2s, #171, lsl #16
; CHECK-NEXT: str d[[REG]], [x0]
; CHECK-NEXT: ret
store volatile <4 x half> <half 0xH0000, half 0xH00AB, half 0xH0000, half 0xH00AB>, <4 x half>* %a
@@ -14,7 +14,7 @@ define void @nvcast_v2i32(<4 x half>* %a) #0 {
; Test pattern (v4f16 (AArch64NvCast (v4i16 FPR64:$src)))
define void @nvcast_v4i16(<4 x half>* %a) #0 {
; CHECK-LABEL: nvcast_v4i16:
-; CHECK-NEXT: movi v[[REG:[0-9]+]].4h, #0xab
+; CHECK-NEXT: movi v[[REG:[0-9]+]].4h, #171
; CHECK-NEXT: str d[[REG]], [x0]
; CHECK-NEXT: ret
store volatile <4 x half> <half 0xH00AB, half 0xH00AB, half 0xH00AB, half 0xH00AB>, <4 x half>* %a
@@ -25,7 +25,7 @@ define void @nvcast_v4i16(<4 x half>* %a) #0 {
; Test pattern (v4f16 (AArch64NvCast (v8i8 FPR64:$src)))
define void @nvcast_v8i8(<4 x half>* %a) #0 {
; CHECK-LABEL: nvcast_v8i8:
-; CHECK-NEXT: movi v[[REG:[0-9]+]].8b, #0xab
+; CHECK-NEXT: movi v[[REG:[0-9]+]].8b, #171
; CHECK-NEXT: str d[[REG]], [x0]
; CHECK-NEXT: ret
store volatile <4 x half> <half 0xHABAB, half 0xHABAB, half 0xHABAB, half 0xHABAB>, <4 x half>* %a
@@ -46,7 +46,7 @@ define void @nvcast_f64(<4 x half>* %a) #0 {
; Test pattern (v8f16 (AArch64NvCast (v4i32 FPR128:$src)))
define void @nvcast_v4i32(<8 x half>* %a) #0 {
; CHECK-LABEL: nvcast_v4i32:
-; CHECK-NEXT: movi v[[REG:[0-9]+]].4s, #0xab, lsl #16
+; CHECK-NEXT: movi v[[REG:[0-9]+]].4s, #171, lsl #16
; CHECK-NEXT: str q[[REG]], [x0]
; CHECK-NEXT: ret
store volatile <8 x half> <half 0xH0000, half 0xH00AB, half 0xH0000, half 0xH00AB, half 0xH0000, half 0xH00AB, half 0xH0000, half 0xH00AB>, <8 x half>* %a
@@ -57,7 +57,7 @@ define void @nvcast_v4i32(<8 x half>* %a) #0 {
; Test pattern (v8f16 (AArch64NvCast (v8i16 FPR128:$src)))
define void @nvcast_v8i16(<8 x half>* %a) #0 {
; CHECK-LABEL: nvcast_v8i16:
-; CHECK-NEXT: movi v[[REG:[0-9]+]].8h, #0xab
+; CHECK-NEXT: movi v[[REG:[0-9]+]].8h, #171
; CHECK-NEXT: str q[[REG]], [x0]
; CHECK-NEXT: ret
store volatile <8 x half> <half 0xH00AB, half 0xH00AB, half 0xH00AB, half 0xH00AB, half 0xH00AB, half 0xH00AB, half 0xH00AB, half 0xH00AB>, <8 x half>* %a
@@ -68,7 +68,7 @@ define void @nvcast_v8i16(<8 x half>* %a) #0 {
; Test pattern (v8f16 (AArch64NvCast (v16i8 FPR128:$src)))
define void @nvcast_v16i8(<8 x half>* %a) #0 {
; CHECK-LABEL: nvcast_v16i8:
-; CHECK-NEXT: movi v[[REG:[0-9]+]].16b, #0xab
+; CHECK-NEXT: movi v[[REG:[0-9]+]].16b, #171
; CHECK-NEXT: str q[[REG]], [x0]
; CHECK-NEXT: ret
store volatile <8 x half> <half 0xHABAB, half 0xHABAB, half 0xHABAB, half 0xHABAB, half 0xHABAB, half 0xHABAB, half 0xHABAB, half 0xHABAB>, <8 x half>* %a
diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll
index 6acb11108afc..b4faef750a2c 100644
--- a/test/CodeGen/AArch64/fpimm.ll
+++ b/test/CodeGen/AArch64/fpimm.ll
@@ -38,20 +38,19 @@ define void @check_double() {
}
; LARGE-LABEL: check_float2
-; LARGE: movz [[REG:w[0-9]+]], #0x4049, lsl #16
-; LARGE-NEXT: movk [[REG]], #0xfdb
+; LARGE: mov [[REG:w[0-9]+]], #1078525952
+; LARGE-NEXT: movk [[REG]], #4059
; LARGE-NEXT: fmov s0, [[REG]]
define float @check_float2() {
ret float 3.14159274101257324218750
}
; LARGE-LABEL: check_double2
-; LARGE: movz [[REG:x[0-9]+]], #0x4009, lsl #48
-; LARGE-NEXT: movk [[REG]], #0x21fb, lsl #32
-; LARGE-NEXT: movk [[REG]], #0x5444, lsl #16
-; LARGE-NEXT: movk [[REG]], #0x2d18
+; LARGE: mov [[REG:x[0-9]+]], #4614219293217783808
+; LARGE-NEXT: movk [[REG]], #8699, lsl #32
+; LARGE-NEXT: movk [[REG]], #21572, lsl #16
+; LARGE-NEXT: movk [[REG]], #11544
; LARGE-NEXT: fmov d0, [[REG]]
define double @check_double2() {
ret double 3.1415926535897931159979634685441851615905761718750
}
-
diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll
index 2ea13e388867..cf6545dab385 100644
--- a/test/CodeGen/AArch64/func-argpassing.ll
+++ b/test/CodeGen/AArch64/func-argpassing.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-post-ra | FileCheck --check-prefix=CHECK %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-post-ra | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -disable-post-ra | FileCheck --check-prefix=CHECK-NOFP %s
%myStruct = type { i64 , i8, i32 }
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index 2f45666ba13a..40ed607b06cc 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
@@ -89,11 +89,11 @@ define void @check_stack_args() {
; that varstruct is passed on the stack. Rather dependent on how a
; memcpy gets created, but the following works for now.
-; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16]
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
-; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]!
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
@@ -104,10 +104,10 @@ define void @check_stack_args() {
float -2.0, float -8.0, float 16.0, float 1.0,
float 64.0)
-; CHECK: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
+; CHECK: mov [[SIXTY_FOUR:w[0-9]+]], #1115684864
; CHECK: str [[SIXTY_FOUR]], [sp]
-; CHECK-NONEON: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
+; CHECK-NONEON: mov [[SIXTY_FOUR:w[0-9]+]], #1115684864
; CHECK-NONEON: str [[SIXTY_FOUR]], [sp]
; CHECK: bl stacked_fpu
@@ -139,9 +139,9 @@ define void @check_i128_align() {
call void @check_i128_regalign(i32 0, i128 42)
; CHECK-NOT: mov x1
-; CHECK-LE: movz x2, #{{0x2a|42}}
+; CHECK-LE: mov x2, #{{0x2a|42}}
; CHECK-LE: mov x3, xzr
-; CHECK-BE: movz {{x|w}}3, #{{0x2a|42}}
+; CHECK-BE: mov {{x|w}}3, #{{0x2a|42}}
; CHECK-BE: mov x2, xzr
; CHECK: bl check_i128_regalign
diff --git a/test/CodeGen/AArch64/gep-nullptr.ll b/test/CodeGen/AArch64/gep-nullptr.ll
new file mode 100644
index 000000000000..4c2bc504cd04
--- /dev/null
+++ b/test/CodeGen/AArch64/gep-nullptr.ll
@@ -0,0 +1,23 @@
+; RUN: llc -O3 -aarch64-gep-opt=true < %s |FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%structA = type { i8, i8, i8, i8, i8, i8, [4 x i8], i8, i8, [2 x i32], [2 x %unionMV], [4 x [2 x %unionMV]], [4 x [2 x %unionMV]], [4 x i8], i8*, i8*, i32, i8* }
+%unionMV = type { i32 }
+
+; Function Attrs: nounwind
+define void @test(%structA* %mi_block) {
+entry:
+ br i1 undef, label %for.body13.us, label %if.else
+
+; Just make sure we don't get a compiler ICE due to dereferencing a nullptr.
+; CHECK-LABEL: test
+for.body13.us: ; preds = %entry
+ %indvars.iv.next40 = or i64 0, 1
+ %packed4.i.us.1 = getelementptr inbounds %structA, %structA* %mi_block, i64 0, i32 11, i64 0, i64 %indvars.iv.next40, i32 0
+ unreachable
+
+if.else: ; preds = %entry
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll
index 6895380ca63e..481be4017b00 100644
--- a/test/CodeGen/AArch64/global-merge-3.ll
+++ b/test/CodeGen/AArch64/global-merge-3.ll
@@ -21,7 +21,7 @@ define void @f1(i32 %a1, i32 %a2, i32 %a3) {
}
;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals
-;CHECK: .align 4
+;CHECK: .p2align 4
;CHECK: .L_MergedGlobals:
;CHECK: .size .L_MergedGlobals, 4004
@@ -29,7 +29,7 @@ define void @f1(i32 %a1, i32 %a2, i32 %a3) {
;CHECK: .local .L_MergedGlobals.1
;CHECK: .comm .L_MergedGlobals.1,4000,16
-;CHECK-APPLE-IOS: .align 4
+;CHECK-APPLE-IOS: .p2align 4
;CHECK-APPLE-IOS: l__MergedGlobals:
;CHECK-APPLE-IOS: .long 1
;CHECK-APPLE-IOS: .space 4000
diff --git a/test/CodeGen/AArch64/global-merge-group-by-use.ll b/test/CodeGen/AArch64/global-merge-group-by-use.ll
index 8b3fc97c9e2e..434c787b28da 100644
--- a/test/CodeGen/AArch64/global-merge-group-by-use.ll
+++ b/test/CodeGen/AArch64/global-merge-group-by-use.ll
@@ -64,8 +64,8 @@ define void @f3(i32 %a1, i32 %a2) #0 {
define void @f4(i32 %a1, i32 %a2, i32 %a3) #0 {
; CHECK-NEXT: adrp x8, [[SET3]]@PAGE
; CHECK-NEXT: add x8, x8, [[SET3]]@PAGEOFF
-; CHECK-NEXT: stp w0, w1, [x8, #4]
-; CHECK-NEXT: str w2, [x8]
+; CHECK-NEXT: stp w2, w0, [x8]
+; CHECK-NEXT: str w1, [x8, #8]
; CHECK-NEXT: ret
store i32 %a1, i32* @m4, align 4
store i32 %a2, i32* @n4, align 4
diff --git a/test/CodeGen/AArch64/half.ll b/test/CodeGen/AArch64/half.ll
index d4cbbc918a84..154d85c9bb61 100644
--- a/test/CodeGen/AArch64/half.ll
+++ b/test/CodeGen/AArch64/half.ll
@@ -81,3 +81,15 @@ define void @test_trunc64(double %in, half* %addr) {
store half %val16, half* %addr
ret void
}
+
+define i16 @test_fccmp(i1 %a) {
+;CHECK-LABEL: test_fccmp:
+;CHECK: fcmp
+ %cmp0 = fcmp ogt half 0xH3333, undef
+ %cmp1 = fcmp ogt half 0xH2222, undef
+ %x = select i1 %cmp0, i16 0, i16 undef
+ %or = or i1 %cmp1, %cmp0
+ %y = select i1 %or, i16 4, i16 undef
+ %r = add i16 %x, %y
+ ret i16 %r
+}
diff --git a/test/CodeGen/AArch64/hints.ll b/test/CodeGen/AArch64/hints.ll
index d7d9e23af1f1..f23c7b00f224 100644
--- a/test/CodeGen/AArch64/hints.ll
+++ b/test/CodeGen/AArch64/hints.ll
@@ -63,5 +63,5 @@ entry:
}
; CHECK-LABEL: hint_undefined
-; CHECK: hint #0x8
+; CHECK: hint #8
diff --git a/test/CodeGen/AArch64/inlineasm-X-allocation.ll b/test/CodeGen/AArch64/inlineasm-X-allocation.ll
new file mode 100644
index 000000000000..1d7a24e3e6e7
--- /dev/null
+++ b/test/CodeGen/AArch64/inlineasm-X-allocation.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=-fp-armv8 %s -o - | FileCheck %s -check-prefix=nofp
+
+; In the novfp case, the compiler is forced to assign a core register,
+; even if the input is a float.
+
+; nofp-LABEL: f1
+; nofp-CHECK: ldr x0, [sp]
+
+; This can be generated by a function such as:
+; void f1(float f) {asm volatile ("ldr $0, [sp]" : : "X" (f));}
+
+define void @f1(float %f) {
+entry:
+ call void asm sideeffect "ldr $0, [sp]", "X" (float %f) nounwind
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/inlineasm-X-constraint.ll b/test/CodeGen/AArch64/inlineasm-X-constraint.ll
new file mode 100644
index 000000000000..77652cc071ef
--- /dev/null
+++ b/test/CodeGen/AArch64/inlineasm-X-constraint.ll
@@ -0,0 +1,152 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s
+
+; The following functions test the use case where an X constraint is used to
+; add a dependency between an assembly instruction (vmsr in this case) and
+; another instruction. In each function, we use a different type for the
+; X constraint argument.
+;
+; We can get something similar from the following C code:
+; double f1(double f, int pscr_value) {
+; asm volatile("msr fpsr,%1" : "=X" ((f)): "r" (pscr_value));
+; return f+f;
+; }
+
+; CHECK-LABEL: f1
+; CHECK: msr FPSR
+; CHECK: fadd d
+
+define double @f1(double %f, i32 %pscr_value) {
+entry:
+ %f.addr = alloca double, align 8
+ store double %f, double* %f.addr, align 8
+ call void asm sideeffect "msr fpsr,$1", "=*X,r"(double* nonnull %f.addr, i32 %pscr_value) nounwind
+ %0 = load double, double* %f.addr, align 8
+ %add = fadd double %0, %0
+ ret double %add
+}
+
+; int f2(int f, int pscr_value) {
+; asm volatile("msr fpsr,$1" : "=X" ((f)): "r" (pscr_value));
+; return f*f;
+; }
+
+; CHECK-LABEL: f2
+; CHECK: msr FPSR
+; CHECK: mul
+define i32 @f2(i32 %f, i32 %pscr_value) {
+entry:
+ %f.addr = alloca i32, align 4
+ store i32 %f, i32* %f.addr, align 4
+ call void asm sideeffect "msr fpsr,$1", "=*X,r"(i32* nonnull %f.addr, i32 %pscr_value) nounwind
+ %0 = load i32, i32* %f.addr, align 4
+ %mul = mul i32 %0, %0
+ ret i32 %mul
+}
+
+; typedef signed char int8_t;
+; typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+; void f3 (void)
+; {
+; int8x8_t vector_res_int8x8;
+; unsigned int fpscr;
+; asm volatile ("msr fpsr,$1" : "=X" ((vector_res_int8x8)) : "r" (fpscr));
+; return vector_res_int8x8 * vector_res_int8x8;
+; }
+
+; CHECK-LABEL: f3
+; CHECK: msr FPSR
+; CHECK: mul
+define <8 x i8> @f3() {
+entry:
+ %vector_res_int8x8 = alloca <8 x i8>, align 8
+ %0 = getelementptr inbounds <8 x i8>, <8 x i8>* %vector_res_int8x8, i32 0, i32 0
+ call void asm sideeffect "msr fpsr,$1", "=*X,r"(<8 x i8>* nonnull %vector_res_int8x8, i32 undef) nounwind
+ %1 = load <8 x i8>, <8 x i8>* %vector_res_int8x8, align 8
+ %mul = mul <8 x i8> %1, %1
+ ret <8 x i8> %mul
+}
+
+; We can emit integer constants.
+; We can get this from:
+; void f() {
+; int x = 2;
+; asm volatile ("add x0, x0, %0" : : "X" (x));
+; }
+;
+; CHECK-LABEL: f4
+; CHECK: add x0, x0, #2
+define void @f4() {
+entry:
+ tail call void asm sideeffect "add x0, x0, $0", "X"(i32 2)
+ ret void
+}
+
+; We can emit function labels. This is equivalent to the following C code:
+; void f(void) {
+; void (*x)(void) = &foo;
+; asm volatile ("bl %0" : : "X" (x));
+; }
+; CHECK-LABEL: f5
+; CHECK: bl f4
+define void @f5() {
+entry:
+ tail call void asm sideeffect "bl $0", "X"(void ()* nonnull @f4)
+ ret void
+}
+
+declare void @foo(...)
+
+; This tests the behavior of the X constraint when used on function pointers,
+; or functions with a cast. In the first asm call we figure out that this
+; is a function pointer and emit the label. However, in the second asm call
+; we can't see through the bitcast and we end up having to lower this constraint
+; to something else. This is not ideal, but it is correct behaviour according
+; to the definition of the X constraint.
+;
+; In this case (and other cases where we could have emitted something else),
+; what we're doing with the X constraint is not particularly useful either,
+; since the user could have used "r" in this situation for the same effect.
+
+; CHECK-LABEL: f6
+; CHECK: bl foo
+; CHECK: br x
+
+define void @f6() nounwind {
+entry:
+ tail call void asm sideeffect "bl $0", "X"(void (...)* @foo) nounwind
+ tail call void asm sideeffect "br $0", "X"(void (...)* bitcast (void ()* @f4 to void (...)*)) nounwind
+ ret void
+}
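+;
+; A rough C-level sketch of f6 above (illustrative only, not part of the
+; original test; foo and f4 mirror the declarations used in the IR):
+; void foo();
+; void f4(void);
+; void f6(void) {
+;   asm volatile ("bl %0" : : "X" (foo));
+;   asm volatile ("br %0" : : "X" ((void (*)())f4));
+; }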
+
+; The following IR can be generated from C code with a function like:
+; void a() {
+; void* a = &&A;
+; asm volatile ("bl %0" : : "X" (a));
+; A:
+; return;
+; }
+;
+; Ideally this would give the block address of bb, but it requires us to see
+; through blockaddress, which we can't do at the moment. This might break some
+; existing use cases where a user would expect to get a block label and instead
+; gets the block address in a register. However, note that according to the
+; "no constraints" definition this behaviour is correct (although not very nice).
+
+; CHECK-LABEL: f7
+; CHECK: bl
+define void @f7() {
+ call void asm sideeffect "br $0", "X"( i8* blockaddress(@f7, %bb) )
+ br label %bb
+bb:
+ ret void
+}
+
+; If we use a constraint "=*X", we should get a store back to *%x (in x0).
+; CHECK-LABEL: f8
+; CHECK: add [[Dest:x[0-9]+]], x0, x0
+; CHECK: str [[Dest]], [x0]
+define void @f8(i64 *%x) {
+entry:
+ tail call void asm sideeffect "add $0, x0, x0", "=*X"(i64 *%x)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll b/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
index 645214ac8ec7..ca24fc9c8807 100644
--- a/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
+++ b/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
@@ -5,7 +5,7 @@
; RUN: llc -mtriple=aarch64 < %s -filetype=obj | llvm-objdump -arch=aarch64 -d - | FileCheck %s
; CHECK-LABEL: foo:
-; CHECK: a0 79 95 d2 movz x0, #0xabcd
+; CHECK: a0 79 95 d2 mov x0, #43981
; CHECK: c0 03 5f d6 ret
define i32 @foo() nounwind {
entry:
@@ -22,5 +22,3 @@ entry:
%0 = tail call i32 asm sideeffect "ldr $0,=0x10001", "=r"() nounwind
ret i32 %0
}
-
-
diff --git a/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll b/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll
new file mode 100644
index 000000000000..f65694ab80a1
--- /dev/null
+++ b/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -march=aarch64 -aarch64-neon-syntax=apple -aarch64-stp-suppress=false -verify-machineinstrs -asm-verbose=false | FileCheck %s
+
+; CHECK-LABEL: test_strd_sturd:
+; CHECK-NEXT: stp d0, d1, [x0, #-8]
+; CHECK-NEXT: ret
+define void @test_strd_sturd(float* %ptr, <2 x float> %v1, <2 x float> %v2) #0 {
+ %tmp1 = bitcast float* %ptr to <2 x float>*
+ store <2 x float> %v2, <2 x float>* %tmp1, align 16
+ %add.ptr = getelementptr inbounds float, float* %ptr, i64 -2
+ %tmp = bitcast float* %add.ptr to <2 x float>*
+ store <2 x float> %v1, <2 x float>* %tmp, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_sturd_strd:
+; CHECK-NEXT: stp d0, d1, [x0, #-8]
+; CHECK-NEXT: ret
+define void @test_sturd_strd(float* %ptr, <2 x float> %v1, <2 x float> %v2) #0 {
+ %add.ptr = getelementptr inbounds float, float* %ptr, i64 -2
+ %tmp = bitcast float* %add.ptr to <2 x float>*
+ store <2 x float> %v1, <2 x float>* %tmp, align 16
+ %tmp1 = bitcast float* %ptr to <2 x float>*
+ store <2 x float> %v2, <2 x float>* %tmp1, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_strq_sturq:
+; CHECK-NEXT: stp q0, q1, [x0, #-16]
+; CHECK-NEXT: ret
+define void @test_strq_sturq(double* %ptr, <2 x double> %v1, <2 x double> %v2) #0 {
+ %tmp1 = bitcast double* %ptr to <2 x double>*
+ store <2 x double> %v2, <2 x double>* %tmp1, align 16
+ %add.ptr = getelementptr inbounds double, double* %ptr, i64 -2
+ %tmp = bitcast double* %add.ptr to <2 x double>*
+ store <2 x double> %v1, <2 x double>* %tmp, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_sturq_strq:
+; CHECK-NEXT: stp q0, q1, [x0, #-16]
+; CHECK-NEXT: ret
+define void @test_sturq_strq(double* %ptr, <2 x double> %v1, <2 x double> %v2) #0 {
+ %add.ptr = getelementptr inbounds double, double* %ptr, i64 -2
+ %tmp = bitcast double* %add.ptr to <2 x double>*
+ store <2 x double> %v1, <2 x double>* %tmp, align 16
+ %tmp1 = bitcast double* %ptr to <2 x double>*
+ store <2 x double> %v2, <2 x double>* %tmp1, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_ldrx_ldurx:
+; CHECK-NEXT: ldp [[V0:x[0-9]+]], [[V1:x[0-9]+]], [x0, #-8]
+; CHECK-NEXT: add x0, [[V0]], [[V1]]
+; CHECK-NEXT: ret
+define i64 @test_ldrx_ldurx(i64* %p) #0 {
+ %tmp = load i64, i64* %p, align 4
+ %add.ptr = getelementptr inbounds i64, i64* %p, i64 -1
+ %tmp1 = load i64, i64* %add.ptr, align 4
+ %add = add nsw i64 %tmp1, %tmp
+ ret i64 %add
+}
+
+; CHECK-LABEL: test_ldurx_ldrx:
+; CHECK-NEXT: ldp [[V0:x[0-9]+]], [[V1:x[0-9]+]], [x0, #-8]
+; CHECK-NEXT: add x0, [[V0]], [[V1]]
+; CHECK-NEXT: ret
+define i64 @test_ldurx_ldrx(i64* %p) #0 {
+ %add.ptr = getelementptr inbounds i64, i64* %p, i64 -1
+ %tmp1 = load i64, i64* %add.ptr, align 4
+ %tmp = load i64, i64* %p, align 4
+ %add = add nsw i64 %tmp1, %tmp
+ ret i64 %add
+}
+
+; CHECK-LABEL: test_ldrsw_ldursw:
+; CHECK-NEXT: ldpsw [[V0:x[0-9]+]], [[V1:x[0-9]+]], [x0, #-4]
+; CHECK-NEXT: add x0, [[V0]], [[V1]]
+; CHECK-NEXT: ret
+define i64 @test_ldrsw_ldursw(i32* %p) #0 {
+ %tmp = load i32, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %p, i64 -1
+ %tmp1 = load i32, i32* %add.ptr, align 4
+ %sexttmp = sext i32 %tmp to i64
+ %sexttmp1 = sext i32 %tmp1 to i64
+ %add = add nsw i64 %sexttmp1, %sexttmp
+ ret i64 %add
+}
+
+; Also make sure we only match valid offsets.
+; CHECK-LABEL: test_ldrq_ldruq_invalidoffset:
+; CHECK-NEXT: ldr q[[V0:[0-9]+]], [x0]
+; CHECK-NEXT: ldur q[[V1:[0-9]+]], [x0, #24]
+; CHECK-NEXT: add.2d v0, v[[V0]], v[[V1]]
+; CHECK-NEXT: ret
+define <2 x i64> @test_ldrq_ldruq_invalidoffset(i64* %p) #0 {
+ %a1 = bitcast i64* %p to <2 x i64>*
+ %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8
+ %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 3
+ %a2 = bitcast i64* %add.ptr2 to <2 x i64>*
+ %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8
+ %add = add nsw <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %add
+}
+
+; Pair an unscaled store with a scaled store where the scaled store has a
+; non-zero offset. This should not hit an assert.
+; CHECK-LABEL: test_stur_str_no_assert
+; CHECK: stp xzr, xzr, [sp, #16]
+; CHECK: ret
+define void @test_stur_str_no_assert() #0 {
+entry:
+ %a1 = alloca i64, align 4
+ %a2 = alloca [12 x i8], align 4
+ %0 = bitcast i64* %a1 to i8*
+ %C = getelementptr inbounds [12 x i8], [12 x i8]* %a2, i64 0, i64 4
+ %1 = bitcast i8* %C to i64*
+ store i64 0, i64* %1, align 4
+ call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 8, i32 8, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg
index f4626c7e0a43..f4f77c5aa312 100644
--- a/test/CodeGen/AArch64/lit.local.cfg
+++ b/test/CodeGen/AArch64/lit.local.cfg
@@ -1,7 +1,5 @@
import re
-config.suffixes = ['.ll']
-
if not 'AArch64' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/AArch64/local_vars.ll b/test/CodeGen/AArch64/local_vars.ll
index 332d660eef36..6e33ab2d0beb 100644
--- a/test/CodeGen/AArch64/local_vars.ll
+++ b/test/CodeGen/AArch64/local_vars.ll
@@ -24,24 +24,25 @@ define void @trivial_func() nounwind {
}
define void @trivial_fp_func() {
-; CHECK-WITHFP-AARCH64-LABEL: trivial_fp_func:
-; CHECK-WITHFP-AARCH64: sub sp, sp, #16
-; CHECK-WITHFP-AARCH64: stp x29, x30, [sp]
-; CHECK-WITHFP-AARCH64-NEXT: mov x29, sp
+; CHECK-LABEL: trivial_fp_func:
+; CHECK: str x30, [sp, #-16]!
+; CHECK-NOT: mov x29, sp
; CHECK-WITHFP-ARM64-LABEL: trivial_fp_func:
; CHECK-WITHFP-ARM64: stp x29, x30, [sp, #-16]!
; CHECK-WITHFP-ARM64-NEXT: mov x29, sp
; Don't really care, but it would be a Bad Thing if this came after the epilogue.
+; CHECK-WITHFP-ARM64: bl foo
; CHECK: bl foo
call void @foo()
ret void
-; CHECK-WITHFP: ldp x29, x30, [sp]
-; CHECK-WITHFP: add sp, sp, #16
+; CHECK: ldr x30, [sp], #16
+; CHECK-NEXT: ret
-; CHECK-WITHFP: ret
+; CHECK-WITHFP-ARM64: ldp x29, x30, [sp], #16
+; CHECK-WITHFP-ARM64-NEXT: ret
}
define void @stack_local() {
diff --git a/test/CodeGen/AArch64/logical-imm.ll b/test/CodeGen/AArch64/logical-imm.ll
index a5e4a9956de7..6f562230d937 100644
--- a/test/CodeGen/AArch64/logical-imm.ll
+++ b/test/CodeGen/AArch64/logical-imm.ll
@@ -73,11 +73,11 @@ define void @test_mov(i32 %in32, i64 %in64) {
; CHECK-LABEL: test_mov:
%val0 = add i32 %in32, 2863311530
store i32 %val0, i32* @var32
-; CHECK: orr {{w[0-9]+}}, wzr, #0xaaaaaaaa
+; CHECK: mov {{w[0-9]+}}, #-1431655766
%val1 = add i64 %in64, 11068046444225730969
store i64 %val1, i64* @var64
-; CHECK: orr {{x[0-9]+}}, xzr, #0x9999999999999999
+; CHECK: mov {{x[0-9]+}}, #-7378697629483820647
ret void
; CHECK: ret
diff --git a/test/CodeGen/AArch64/lower-range-metadata-func-call.ll b/test/CodeGen/AArch64/lower-range-metadata-func-call.ll
new file mode 100644
index 000000000000..fd4b2f5ba305
--- /dev/null
+++ b/test/CodeGen/AArch64/lower-range-metadata-func-call.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=aarch64 -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; and can be eliminated
+; CHECK-LABEL: {{^}}test_call_known_max_range:
+; CHECK: bl foo
+; CHECK-NOT: and
+; CHECK: ret
+define i32 @test_call_known_max_range() #0 {
+entry:
+ %id = tail call i32 @foo(), !range !0
+ %and = and i32 %id, 1023
+ ret i32 %and
+}
+
+; CHECK-LABEL: {{^}}test_call_known_trunc_1_bit_range:
+; CHECK: bl foo
+; CHECK: and w{{[0-9]+}}, w0, #0x1ff
+; CHECK: ret
+define i32 @test_call_known_trunc_1_bit_range() #0 {
+entry:
+ %id = tail call i32 @foo(), !range !0
+ %and = and i32 %id, 511
+ ret i32 %and
+}
+
+; CHECK-LABEL: {{^}}test_call_known_max_range_m1:
+; CHECK: bl foo
+; CHECK: and w{{[0-9]+}}, w0, #0xff
+; CHECK: ret
+define i32 @test_call_known_max_range_m1() #0 {
+entry:
+ %id = tail call i32 @foo(), !range !1
+ %and = and i32 %id, 255
+ ret i32 %and
+}
+
+
+declare i32 @foo()
+
+attributes #0 = { norecurse nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{i32 0, i32 1024}
+!1 = !{i32 0, i32 1023}
diff --git a/test/CodeGen/AArch64/machine-combiner.ll b/test/CodeGen/AArch64/machine-combiner.ll
index 56a742fd6c3a..0bd416ad1721 100644
--- a/test/CodeGen/AArch64/machine-combiner.ll
+++ b/test/CodeGen/AArch64/machine-combiner.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -disable-post-ra < %s | FileCheck %s
; Verify that the first two adds are independent regardless of how the inputs are
; commuted. The destination registers are used as source registers for the third add.
diff --git a/test/CodeGen/AArch64/machine-copy-remove.ll b/test/CodeGen/AArch64/machine-copy-remove.ll
new file mode 100644
index 000000000000..75954f83c19c
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-copy-remove.ll
@@ -0,0 +1,94 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: f_XX:
+; CHECK: cbz x[[REG:[0-9]+]], [[BB:.LBB.*]]
+; CHECK: [[BB]]:
+; CHECK-NOT: mov x[[REG]], xzr
+define i64 @f_XX(i64 %n, i64* nocapture readonly %P) {
+entry:
+ %tobool = icmp eq i64 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %0 = load i64, i64* %P
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %a.0 = phi i64 [ %0, %if.then ], [ 0, %entry ]
+ ret i64 %a.0
+}
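+;
+; Roughly the C-level shape of f_XX above (illustrative only, not part of the
+; original test): after the cbz the tested register is known to be zero in the
+; taken block, so the "mov x[[REG]], xzr" that would materialize the 0 return
+; value is redundant and can be removed.
+; long f_XX_c(long n, const long *P) { return n ? *P : 0; }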
+
+; CHECK-LABEL: f_WW:
+; CHECK: cbz w[[REG:[0-9]+]], [[BB:.LBB.*]]
+; CHECK: [[BB]]:
+; CHECK-NOT: mov w[[REG]], wzr
+define i32 @f_WW(i32 %n, i32* nocapture readonly %P) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %0 = load i32, i32* %P
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %a.0 = phi i32 [ %0, %if.then ], [ 0, %entry ]
+ ret i32 %a.0
+}
+
+; CHECK-LABEL: f_XW:
+; CHECK: cbz x[[REG:[0-9]+]], [[BB:.LBB.*]]
+; CHECK: [[BB]]:
+; CHECK-NOT: mov w[[REG]], wzr
+define i32 @f_XW(i64 %n, i32* nocapture readonly %P) {
+entry:
+ %tobool = icmp eq i64 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %0 = load i32, i32* %P
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %a.0 = phi i32 [ %0, %if.then ], [ 0, %entry ]
+ ret i32 %a.0
+}
+
+; CHECK-LABEL: f_WX:
+; CHECK: cbz w[[REG:[0-9]+]], [[BB:.LBB.*]]
+; CHECK: [[BB]]:
+; CHECK: mov x[[REG]], xzr
+; Do not remove the mov in this case because we do not know if the upper bits
+; of the X register are zero.
+define i64 @f_WX(i32 %n, i64* nocapture readonly %P) {
+entry:
+ %tobool = icmp eq i32 %n, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %0 = load i64, i64* %P
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %a.0 = phi i64 [ %0, %if.then ], [ 0, %entry ]
+ ret i64 %a.0
+}
+
+; CHECK-LABEL: test_superreg:
+; CHECK: cbz x[[REG:[0-9]+]], [[BB:.LBB.*]]
+; CHECK: [[BB]]:
+; CHECK: str x[[REG]], [x1]
+; CHECK-NOT: mov w[[REG]], wzr
+; Because we returned w0 but x0 was marked live-in to the block, we didn't
+; remove the <kill> on the str leading to a verification failure.
+define i32 @test_superreg(i64 %in, i64* %dest) {
+ %tst = icmp eq i64 %in, 0
+ br i1 %tst, label %true, label %false
+
+false:
+ ret i32 42
+
+true:
+ store volatile i64 %in, i64* %dest
+ ret i32 0
+}
diff --git a/test/CodeGen/AArch64/merge-store-dependency.ll b/test/CodeGen/AArch64/merge-store-dependency.ll
new file mode 100644
index 000000000000..c68cee91a3cf
--- /dev/null
+++ b/test/CodeGen/AArch64/merge-store-dependency.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mcpu cortex-a53 -march aarch64 %s -o - | FileCheck %s --check-prefix=A53
+
+; PR26827 - Merge stores causes wrong dependency.
+%struct1 = type { %struct1*, %struct1*, i32, i32, i16, i16, void (i32, i32, i8*)*, i8* }
+@gv0 = internal unnamed_addr global i32 0, align 4
+@gv1 = internal unnamed_addr global %struct1** null, align 8
+
+define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg) {
+;CHECK-LABEL: test
+entry:
+; A53: mov [[DATA:w[0-9]+]], w1
+; A53: str q{{[0-9]+}}, {{.*}}
+; A53: str q{{[0-9]+}}, {{.*}}
+; A53: str [[DATA]], {{.*}}
+
+ %0 = bitcast %struct1* %fde to i8*
+ tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 40, i32 8, i1 false)
+ %state = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 4
+ store i16 256, i16* %state, align 8
+ %fd1 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 2
+ store i32 %fd, i32* %fd1, align 8
+ %force_eof = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 3
+ store i32 0, i32* %force_eof, align 4
+ %func2 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 6
+ store void (i32, i32, i8*)* %func, void (i32, i32, i8*)** %func2, align 8
+ %arg3 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 7
+ store i8* %arg, i8** %arg3, align 8
+ %call = tail call i32 (i32, i32, ...) @fcntl(i32 %fd, i32 4, i8* %0) #6
+ %1 = load i32, i32* %fd1, align 8
+ %cmp.i = icmp slt i32 %1, 0
+ br i1 %cmp.i, label %if.then.i, label %while.body.i.preheader
+if.then.i:
+ unreachable
+
+while.body.i.preheader:
+ %2 = load i32, i32* @gv0, align 4
+ %3 = icmp eq i32* %fd1, @gv0
+ br i1 %3, label %while.body.i.split, label %while.body.i.split.ver.us.preheader
+
+while.body.i.split.ver.us.preheader:
+ br label %while.body.i.split.ver.us
+
+while.body.i.split.ver.us:
+ %.reg2mem21.0 = phi i32 [ %mul.i.ver.us, %while.body.i.split.ver.us ], [ %2, %while.body.i.split.ver.us.preheader ]
+ %mul.i.ver.us = shl nsw i32 %.reg2mem21.0, 1
+ %4 = icmp sgt i32 %mul.i.ver.us, %1
+ br i1 %4, label %while.end.i, label %while.body.i.split.ver.us
+
+while.body.i.split:
+ br label %while.body.i.split
+
+while.end.i:
+ %call.i = tail call i8* @foo()
+ store i8* %call.i, i8** bitcast (%struct1*** @gv1 to i8**), align 8
+ br label %exit
+
+exit:
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare i32 @fcntl(i32, i32, ...)
+declare noalias i8* @foo()
diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll
index 86f5edd5da1d..981d16f762ff 100644
--- a/test/CodeGen/AArch64/merge-store.ll
+++ b/test/CodeGen/AArch64/merge-store.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march aarch64 %s -o - | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone | FileCheck %s --check-prefix=CYCLONE
+; RUN: llc -mtriple=aarch64-unknown-unknown %s -mcpu=cyclone -o - | FileCheck %s --check-prefix=CYCLONE --check-prefix=CHECK
+; RUN: llc -march aarch64 %s -mattr=-slow-misaligned-128store -o - | FileCheck %s --check-prefix=MISALIGNED --check-prefix=CHECK
@g0 = external global <3 x float>, align 16
@g1 = external global <3 x float>, align 4
@@ -38,9 +38,12 @@ define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
ret void
-; CHECK-LABEL: merge_vec_extract_stores
-; CHECK: stur q0, [x0, #24]
-; CHECK-NEXT: ret
+; MISALIGNED-LABEL: merge_vec_extract_stores
+; MISALIGNED: stur q0, [x0, #24]
+; MISALIGNED-NEXT: ret
+
+; FIXME: Ideally we would like to use a generic target for this test, but this relies
+; on suppressing store pairs.
; CYCLONE-LABEL: merge_vec_extract_stores
; CYCLONE: ext v1.16b, v0.16b, v0.16b, #8
diff --git a/test/CodeGen/AArch64/misched-fusion.ll b/test/CodeGen/AArch64/misched-fusion.ll
index d38869329034..0f4c0ac84ce5 100644
--- a/test/CodeGen/AArch64/misched-fusion.ll
+++ b/test/CodeGen/AArch64/misched-fusion.ll
@@ -1,4 +1,6 @@
+; RUN: llc -o - %s -mattr=+macroop-fusion,+use-postra-scheduler | FileCheck %s
; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
+
target triple = "arm64-apple-ios"
declare void @foobar(i32 %v0, i32 %v1)
@@ -8,12 +10,12 @@ declare void @foobar(i32 %v0, i32 %v1)
; CHECK: add w[[ADDRES:[0-9]+]], w1, #7
; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13
; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]]
-; CHECK: mov x0, x[[ADDRES]]
-; CHECK: mov x1, x[[SUBRES]]
+; CHECK: mov [[REGTY:[x,w]]]0, [[REGTY]][[ADDRES]]
+; CHECK: mov [[REGTY]]1, [[REGTY]][[SUBRES]]
; CHECK: bl _foobar
; CHECK: [[SKIPBLOCK]]:
-; CHECK: mov x0, x[[SUBRES]]
-; CHECK: mov x1, x[[ADDRES]]
+; CHECK: mov [[REGTY]]0, [[REGTY]][[SUBRES]]
+; CHECK: mov [[REGTY]]1, [[REGTY]][[ADDRES]]
; CHECK: bl _foobar
define void @test_sub_cbz(i32 %a0, i32 %a1) {
entry:
diff --git a/test/CodeGen/AArch64/movimm-wzr.mir b/test/CodeGen/AArch64/movimm-wzr.mir
new file mode 100644
index 000000000000..d54e7bef54cd
--- /dev/null
+++ b/test/CodeGen/AArch64/movimm-wzr.mir
@@ -0,0 +1,46 @@
+# RUN: llc -run-pass=aarch64-expand-pseudo %s -o - 2>&1 | FileCheck %s
+
+--- |
+ ; ModuleID = 'simple.ll'
+ source_filename = "simple.ll"
+ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+ target triple = "aarch64--linux-gnu"
+
+ define i32 @test_mov_0() {
+ ret i32 42
+ }
+
+...
+---
+name: test_mov_0
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: false
+tracksSubRegLiveness: false
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %wzr = MOVi32imm 42
+ %xzr = MOVi64imm 42
+ RET_ReallyLR implicit killed %w0
+
+...
+
+# CHECK: bb.0
+# CHECK-NEXT: RET %lr
diff --git a/test/CodeGen/AArch64/movw-consts.ll b/test/CodeGen/AArch64/movw-consts.ll
index 93c181271755..def6072e0bca 100644
--- a/test/CodeGen/AArch64/movw-consts.ll
+++ b/test/CodeGen/AArch64/movw-consts.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i64 @test0() {
; CHECK-LABEL: test0:
@@ -53,19 +53,19 @@ define i64 @test7() {
; couldn't. Useful even for i64
define i64 @test8() {
; CHECK-LABEL: test8:
-; CHECK: movn w0, #{{60875|0xedcb}}
+; CHECK: mov w0, #-60876
ret i64 4294906420
}
define i64 @test9() {
; CHECK-LABEL: test9:
-; CHECK: movn x0, #0
+; CHECK: mov x0, #-1
ret i64 -1
}
define i64 @test10() {
; CHECK-LABEL: test10:
-; CHECK: movn x0, #{{60875|0xedcb}}, lsl #16
+; CHECK: mov x0, #-3989504001
ret i64 18446744069720047615
}
@@ -110,7 +110,7 @@ define void @test15() {
define void @test16() {
; CHECK-LABEL: test16:
-; CHECK: movn {{w[0-9]+}}, #0
+; CHECK: mov {{w[0-9]+}}, #-1
store i32 -1, i32* @var32
ret void
}
diff --git a/test/CodeGen/AArch64/neg-imm.ll b/test/CodeGen/AArch64/neg-imm.ll
new file mode 100644
index 000000000000..375d3dbfd0d5
--- /dev/null
+++ b/test/CodeGen/AArch64/neg-imm.ll
@@ -0,0 +1,46 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; LSR used to pick a sub-optimal solution due to the target responding
+; conservatively to isLegalAddImmediate for negative values.
+
+declare void @foo(i32)
+
+define void @test(i32 %px) {
+; CHECK_LABEL: test:
+; CHECK_LABEL: %entry
+; CHECK: subs
+; CHECK-NEXT: csel
+entry:
+ %sub = add nsw i32 %px, -1
+ %cmp = icmp slt i32 %px, 1
+ %.sub = select i1 %cmp, i32 0, i32 %sub
+ br label %for.body
+
+for.body:
+; CHECK_LABEL: %for.body
+; CHECK: cmp
+; CHECK-NEXT: b.eq
+; CHECK-LABEL: %if.then3
+ %x.015 = phi i32 [ %inc, %for.inc ], [ %.sub, %entry ]
+ %cmp2 = icmp eq i32 %x.015, %px
+ br i1 %cmp2, label %for.inc, label %if.then3
+
+if.then3:
+ tail call void @foo(i32 %x.015)
+ br label %for.inc
+
+for.inc:
+; CHECK_LABEL: %for.inc
+; CHECK: add
+; CHECK-NEXT: cmp
+; CHECK: b.le
+; CHECK_LABEL: %for.cond.cleanup
+ %inc = add nsw i32 %x.015, 1
+ %cmp1 = icmp sgt i32 %x.015, %px
+ br i1 %cmp1, label %for.cond.cleanup.loopexit, label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
index 6d89dfbacf41..887cb5dd698a 100644
--- a/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -802,6 +802,63 @@ define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
}
+define <8 x i8> @cmgez8xi8_alt(<8 x i8> %A) {
+; CHECK-LABEL: cmgez8xi8_alt:
+; CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+ %sign = ashr <8 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %not = xor <8 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <8 x i8> %not
+}
+
+define <16 x i8> @cmgez16xi8_alt(<16 x i8> %A) {
+; CHECK-LABEL: cmgez16xi8_alt:
+; CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+ %sign = ashr <16 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %not = xor <16 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <16 x i8> %not
+}
+
+define <4 x i16> @cmgez4xi16_alt(<4 x i16> %A) {
+; CHECK-LABEL: cmgez4xi16_alt:
+; CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+ %sign = ashr <4 x i16> %A, <i16 15, i16 15, i16 15, i16 15>
+ %not = xor <4 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <4 x i16> %not
+}
+
+define <8 x i16> @cmgez8xi16_alt(<8 x i16> %A) {
+; CHECK-LABEL: cmgez8xi16_alt:
+; CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+ %sign = ashr <8 x i16> %A, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %not = xor <8 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <8 x i16> %not
+}
+
+define <2 x i32> @cmgez2xi32_alt(<2 x i32> %A) {
+; CHECK-LABEL: cmgez2xi32_alt:
+; CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+ %sign = ashr <2 x i32> %A, <i32 31, i32 31>
+ %not = xor <2 x i32> %sign, <i32 -1, i32 -1>
+ ret <2 x i32> %not
+}
+
+define <4 x i32> @cmgez4xi32_alt(<4 x i32> %A) {
+; CHECK-LABEL: cmgez4xi32_alt:
+; CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+ %sign = ashr <4 x i32> %A, <i32 31, i32 31, i32 31, i32 31>
+ %not = xor <4 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %not
+}
+
+define <2 x i64> @cmgez2xi64_alt(<2 x i64> %A) {
+; CHECK-LABEL: cmgez2xi64_alt:
+; CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+ %sign = ashr <2 x i64> %A, <i64 63, i64 63>
+ %not = xor <2 x i64> %sign, <i64 -1, i64 -1>
+ ret <2 x i64> %not
+}
+
+
define <8 x i8> @cmgtz8xi8(<8 x i8> %A) {
; CHECK-LABEL: cmgtz8xi8:
; CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll
index 40649aeb1b8e..7882f5189413 100644
--- a/test/CodeGen/AArch64/neon-mov.ll
+++ b/test/CodeGen/AArch64/neon-mov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @movi8b() {
; CHECK-LABEL: movi8b:
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
index 41e391dcd76c..b9914356f301 100644
--- a/test/CodeGen/AArch64/neon-perm.ll
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
%struct.int8x8x2_t = type { [2 x <8 x i8>] }
%struct.int16x4x2_t = type { [2 x <4 x i16>] }
diff --git a/test/CodeGen/AArch64/no-quad-ldp-stp.ll b/test/CodeGen/AArch64/no-quad-ldp-stp.ll
new file mode 100644
index 000000000000..19d371adbdf0
--- /dev/null
+++ b/test/CodeGen/AArch64/no-quad-ldp-stp.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=aarch64 -mattr=+no-quad-ldst-pairs -verify-machineinstrs -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=aarch64 -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s
+
+; CHECK-LABEL: test_nopair_st
+; CHECK: str
+; CHECK: stur
+; CHECK-NOT: stp
+define void @test_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) {
+ %tmp1 = bitcast double* %ptr to <2 x double>*
+ store <2 x double> %v2, <2 x double>* %tmp1, align 16
+ %add.ptr = getelementptr inbounds double, double* %ptr, i64 -2
+ %tmp = bitcast double* %add.ptr to <2 x double>*
+ store <2 x double> %v1, <2 x double>* %tmp, align 16
+ ret void
+}
+
+; CHECK-LABEL: test_nopair_ld
+; CHECK: ldr
+; CHECK: ldr
+; CHECK-NOT: ldp
+define <2 x i64> @test_nopair_ld(i64* %p) {
+ %a1 = bitcast i64* %p to <2 x i64>*
+ %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8
+ %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2
+ %a2 = bitcast i64* %add.ptr2 to <2 x i64>*
+ %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8
+ %add = add nsw <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %add
+}
diff --git a/test/CodeGen/AArch64/nontemporal.ll b/test/CodeGen/AArch64/nontemporal.ll
index db9779e03190..d8785f845c29 100644
--- a/test/CodeGen/AArch64/nontemporal.ll
+++ b/test/CodeGen/AArch64/nontemporal.ll
@@ -112,7 +112,7 @@ define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 {
define void @test_stnp_i64(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64:
-; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x0]
; CHECK-NEXT: ret
store i64 %v, i64* %p, align 1, !nontemporal !0
@@ -162,7 +162,7 @@ define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 {
define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64_offset:
-; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x0, #8]
; CHECK-NEXT: ret
%tmp0 = getelementptr i64, i64* %p, i32 1
@@ -172,7 +172,7 @@ define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 {
define void @test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 {
; CHECK-LABEL: test_stnp_i64_offset_neg:
-; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT: lsr x[[HI:[0-9]+]], x1, #32
; CHECK-NEXT: stnp w1, w[[HI]], [x0, #-8]
; CHECK-NEXT: ret
%tmp0 = getelementptr i64, i64* %p, i32 -1
diff --git a/test/CodeGen/AArch64/nzcv-save.ll b/test/CodeGen/AArch64/nzcv-save.ll
index f8f42ec9b1a9..9329f3962934 100644
--- a/test/CodeGen/AArch64/nzcv-save.ll
+++ b/test/CodeGen/AArch64/nzcv-save.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=aarch64 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -march=aarch64 < %s | FileCheck %s
; CHECK: mrs [[NZCV_SAVE:x[0-9]+]], NZCV
; CHECK: msr NZCV, [[NZCV_SAVE]]
diff --git a/test/CodeGen/AArch64/optimize-cond-branch.ll b/test/CodeGen/AArch64/optimize-cond-branch.ll
new file mode 100644
index 000000000000..4e3ca6f16e78
--- /dev/null
+++ b/test/CodeGen/AArch64/optimize-cond-branch.ll
@@ -0,0 +1,48 @@
+; RUN: llc -verify-machineinstrs -o - %s | FileCheck %s
+target triple = "arm64--"
+
+; AArch64InstrInfo::optimizeCondBranch() optimizes the
+; "x = and y, 256; cmp x, 0; br" from an "and; cbnz" to a tbnz instruction.
+; It forgot to clear a flag, resulting in a MachineVerifier complaint.
+;
+; Writing a stable/simple test is tricky since most tbz instructions are already
+; formed in SelectionDAG; optimizeCondBranch() only triggers if the and
+; instruction is in a different block than the conditional jump.
+;
+; CHECK-LABEL: func
+; CHECK-NOT: and
+; CHECK: tbnz
+define void @func() {
+ %c0 = icmp sgt i64 0, 0
+ br i1 %c0, label %b1, label %b6
+
+b1:
+ br i1 undef, label %b3, label %b2
+
+b2:
+ %v0 = tail call i32 @extfunc()
+ br label %b5
+
+b3:
+ %v1 = load i32, i32* undef, align 4
+ %v2 = and i32 %v1, 256
+ br label %b5
+
+b5:
+ %v3 = phi i32 [ %v2, %b3 ], [ %v0, %b2 ]
+ %c1 = icmp eq i32 %v3, 0
+ br i1 %c1, label %b8, label %b7
+
+b6:
+ tail call i32 @extfunc()
+ ret void
+
+b7:
+ tail call i32 @extfunc()
+ ret void
+
+b8:
+ ret void
+}
+
+declare i32 @extfunc()
diff --git a/test/CodeGen/AArch64/pie.ll b/test/CodeGen/AArch64/pie.ll
new file mode 100644
index 000000000000..5cd27a8761cc
--- /dev/null
+++ b/test/CodeGen/AArch64/pie.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple aarch64-pc-linux -relocation-model=pic < %s | FileCheck %s
+
+@g1 = global i32 42
+
+define i32* @get_g1() {
+; CHECK: get_g1:
+; CHECK: adrp x0, g1
+; CHECK-NEXT: add x0, x0, :lo12:g1
+ ret i32* @g1
+}
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"PIE Level", i32 2}
diff --git a/test/CodeGen/AArch64/preferred-alignment.ll b/test/CodeGen/AArch64/preferred-alignment.ll
new file mode 100644
index 000000000000..c032e83d268f
--- /dev/null
+++ b/test/CodeGen/AArch64/preferred-alignment.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mtriple=aarch64 -O0 < %s | FileCheck %s
+
+; Function Attrs: nounwind
+define i32 @foo() #0 {
+entry:
+ %c = alloca i8, align 1
+; CHECK: add x0, sp, #12
+ %s = alloca i16, align 2
+; CHECK-NEXT: add x1, sp, #8
+ %i = alloca i32, align 4
+; CHECK-NEXT: add x2, sp, #4
+ %call = call i32 @bar(i8* %c, i16* %s, i32* %i)
+ %0 = load i8, i8* %c, align 1
+ %conv = zext i8 %0 to i32
+ %add = add nsw i32 %call, %conv
+ %1 = load i16, i16* %s, align 2
+ %conv1 = sext i16 %1 to i32
+ %add2 = add nsw i32 %add, %conv1
+ %2 = load i32, i32* %i, align 4
+ %add3 = add nsw i32 %add2, %2
+ ret i32 %add3
+}
+
+declare i32 @bar(i8*, i16*, i32*) #1
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="false" }
+attributes #1 = { "no-frame-pointer-elim"="false" }
+
diff --git a/test/CodeGen/AArch64/preserve_mostcc.ll b/test/CodeGen/AArch64/preserve_mostcc.ll
new file mode 100644
index 000000000000..7f0968c8eb33
--- /dev/null
+++ b/test/CodeGen/AArch64/preserve_mostcc.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck %s
+
+declare void @standard_cc_func()
+declare preserve_mostcc void @preserve_mostcc_func()
+
+; Registers r9-r15 should be saved before the call of a function
+; with a standard calling convention.
+define preserve_mostcc void @preserve_mostcc1() nounwind {
+entry:
+;CHECK-LABEL: preserve_mostcc1
+;CHECK-NOT: stp
+;CHECK-NOT: str
+;CHECK: str x15
+;CHECK-NEXT: stp x14, x13,
+;CHECK-NEXT: stp x12, x11,
+;CHECK-NEXT: stp x10, x9,
+;CHECK: bl _standard_cc_func
+ call void @standard_cc_func()
+;CHECK: ldp x10, x9,
+;CHECK-NEXT: ldp x12, x11,
+;CHECK-NEXT: ldp x14, x13,
+;CHECK-NEXT: ldr x15
+ ret void
+}
+
+; Registers r9-r15 don't need to be saved if one
+; function with preserve_mostcc calling convention calls another
+; function with preserve_mostcc calling convention, because the
+; callee will save these registers anyway.
+define preserve_mostcc void @preserve_mostcc2() nounwind {
+entry:
+;CHECK-LABEL: preserve_mostcc2
+;CHECK-NOT: x14
+;CHECK: stp x29, x30,
+;CHECK-NOT: x14
+;CHECK: bl _preserve_mostcc_func
+ call preserve_mostcc void @preserve_mostcc_func()
+ ret void
+}
+
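+; A C-level sketch of the pattern exercised above (illustrative only, not part
+; of the original test; preserve_most is Clang's source-level spelling of the
+; preserve_mostcc convention):
+; __attribute__((preserve_most)) void pm_callee(void);
+; void std_callee(void);
+; __attribute__((preserve_most)) void pm1(void) { std_callee(); } // must save x9-x15
+; __attribute__((preserve_most)) void pm2(void) { pm_callee(); }  // no extra saves needed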
diff --git a/test/CodeGen/AArch64/recp-fastmath.ll b/test/CodeGen/AArch64/recp-fastmath.ll
new file mode 100644
index 000000000000..710739b2cc5f
--- /dev/null
+++ b/test/CodeGen/AArch64/recp-fastmath.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=!div,!vec-div | FileCheck %s --check-prefix=FAULT
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=div,vec-div | FileCheck %s
+
+define float @frecp(float %x) #0 {
+ %div = fdiv fast float 1.0, %x
+ ret float %div
+
+; FAULT-LABEL: frecp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: frecp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: frecpe
+; CHECK-NEXT: fmov
+}
+
+define <2 x float> @f2recp(<2 x float> %x) #0 {
+ %div = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
+ ret <2 x float> %div
+
+; FAULT-LABEL: f2recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: f2recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+define <4 x float> @f4recp(<4 x float> %x) #0 {
+ %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ ret <4 x float> %div
+
+; FAULT-LABEL: f4recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: f4recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+define double @drecp(double %x) #0 {
+ %div = fdiv fast double 1.0, %x
+ ret double %div
+
+; FAULT-LABEL: drecp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: drecp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: frecpe
+; CHECK-NEXT: fmov
+}
+
+define <2 x double> @d2recp(<2 x double> %x) #0 {
+ %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %x
+ ret <2 x double> %div
+
+; FAULT-LABEL: d2recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: d2recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
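+
+; Source-level shape of these tests (illustrative only, not part of the
+; original test): with fast-math and the reciprocal estimate enabled,
+; 1.0/x is lowered to an FRECPE-based estimate sequence instead of an FDIV.
+; float frecp_c(float x) { return 1.0f / x; }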
diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll
index ba34873eaa5b..4bec512403c4 100644
--- a/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -disable-fp-elim -o - %s | FileCheck %s
; When generating DAG selection tables, TableGen used to only flag an
; instruction as needing a chain on its own account if it had a built-in pattern
diff --git a/test/CodeGen/AArch64/rem_crash.ll b/test/CodeGen/AArch64/rem_crash.ll
new file mode 100644
index 000000000000..71f1a80e24e2
--- /dev/null
+++ b/test/CodeGen/AArch64/rem_crash.ll
@@ -0,0 +1,257 @@
+; RUN: llc < %s -march=aarch64
+
+define i8 @test_minsize_uu8(i8 %x) minsize optsize {
+entry:
+ %0 = udiv i8 %x, 10
+ %1 = urem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_minsize_ss8(i8 %x) minsize optsize {
+entry:
+ %0 = sdiv i8 %x, 10
+ %1 = srem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_minsize_us8(i8 %x) minsize optsize {
+entry:
+ %0 = udiv i8 %x, 10
+ %1 = srem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_minsize_su8(i8 %x) minsize optsize {
+entry:
+ %0 = sdiv i8 %x, 10
+ %1 = urem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i16 @test_minsize_uu16(i16 %x) minsize optsize {
+entry:
+ %0 = udiv i16 %x, 10
+ %1 = urem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_minsize_ss16(i16 %x) minsize optsize {
+entry:
+ %0 = sdiv i16 %x, 10
+ %1 = srem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_minsize_us16(i16 %x) minsize optsize {
+entry:
+ %0 = udiv i16 %x, 10
+ %1 = srem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_minsize_su16(i16 %x) minsize optsize {
+entry:
+ %0 = sdiv i16 %x, 10
+ %1 = urem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i32 @test_minsize_uu32(i32 %x) minsize optsize {
+entry:
+ %0 = udiv i32 %x, 10
+ %1 = urem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_minsize_ss32(i32 %x) minsize optsize {
+entry:
+ %0 = sdiv i32 %x, 10
+ %1 = srem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_minsize_us32(i32 %x) minsize optsize {
+entry:
+ %0 = udiv i32 %x, 10
+ %1 = srem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_minsize_su32(i32 %x) minsize optsize {
+entry:
+ %0 = sdiv i32 %x, 10
+ %1 = urem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i64 @test_minsize_uu64(i64 %x) minsize optsize {
+entry:
+ %0 = udiv i64 %x, 10
+ %1 = urem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_minsize_ss64(i64 %x) minsize optsize {
+entry:
+ %0 = sdiv i64 %x, 10
+ %1 = srem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_minsize_us64(i64 %x) minsize optsize {
+entry:
+ %0 = udiv i64 %x, 10
+ %1 = srem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_minsize_su64(i64 %x) minsize optsize {
+entry:
+ %0 = sdiv i64 %x, 10
+ %1 = urem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i8 @test_uu8(i8 %x) optsize {
+entry:
+ %0 = udiv i8 %x, 10
+ %1 = urem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_ss8(i8 %x) optsize {
+entry:
+ %0 = sdiv i8 %x, 10
+ %1 = srem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_us8(i8 %x) optsize {
+entry:
+ %0 = udiv i8 %x, 10
+ %1 = srem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_su8(i8 %x) optsize {
+entry:
+ %0 = sdiv i8 %x, 10
+ %1 = urem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i16 @test_uu16(i16 %x) optsize {
+entry:
+ %0 = udiv i16 %x, 10
+ %1 = urem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_ss16(i16 %x) optsize {
+entry:
+ %0 = sdiv i16 %x, 10
+ %1 = srem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_us16(i16 %x) optsize {
+entry:
+ %0 = udiv i16 %x, 10
+ %1 = srem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_su16(i16 %x) optsize {
+entry:
+ %0 = sdiv i16 %x, 10
+ %1 = urem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i32 @test_uu32(i32 %x) optsize {
+entry:
+ %0 = udiv i32 %x, 10
+ %1 = urem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_ss32(i32 %x) optsize {
+entry:
+ %0 = sdiv i32 %x, 10
+ %1 = srem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_us32(i32 %x) optsize {
+entry:
+ %0 = udiv i32 %x, 10
+ %1 = srem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_su32(i32 %x) optsize {
+entry:
+ %0 = sdiv i32 %x, 10
+ %1 = urem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i64 @test_uu64(i64 %x) optsize {
+entry:
+ %0 = udiv i64 %x, 10
+ %1 = urem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_ss64(i64 %x) optsize {
+entry:
+ %0 = sdiv i64 %x, 10
+ %1 = srem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_us64(i64 %x) optsize {
+entry:
+ %0 = udiv i64 %x, 10
+ %1 = srem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_su64(i64 %x) optsize {
+entry:
+ %0 = sdiv i64 %x, 10
+ %1 = urem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll
index c2721e70190a..b2ca1cca0812 100644
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@@ -2,7 +2,11 @@
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a73 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m1 -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=vulcan -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mattr=+custom-cheap-as-move -o - %s | FileCheck %s
%X = type { i64, i64, i64 }
declare void @f(%X*)
@@ -11,9 +15,11 @@ entry:
%tmp = alloca %X
call void @f(%X* %tmp)
; CHECK: add x0, sp, #8
-; CHECK-NEXT-NOT: mov
+; CHECK-NOT: mov
+; CHECK-NEXT: bl f
call void @f(%X* %tmp)
; CHECK: add x0, sp, #8
-; CHECK-NEXT-NOT: mov
+; CHECK-NOT: mov
+; CHECK-NEXT: bl f
ret void
}
diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll
index a68fdec4cfbc..925d1881f563 100644
--- a/test/CodeGen/AArch64/sibling-call.ll
+++ b/test/CodeGen/AArch64/sibling-call.ll
@@ -92,6 +92,6 @@ define void @indirect_tail() {
tail call void %fptr(i32 42)
ret void
; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:func]
-; CHECK: movz w0, #{{42|0x2a}}
+; CHECK: mov w0, #{{42|0x2a}}
; CHECK: br [[FPTR]]
}
diff --git a/test/CodeGen/AArch64/special-reg.ll b/test/CodeGen/AArch64/special-reg.ll
index 91c32158d420..4b8c75b70985 100644
--- a/test/CodeGen/AArch64/special-reg.ll
+++ b/test/CodeGen/AArch64/special-reg.ll
@@ -35,7 +35,7 @@ entry:
define void @write_daifset() nounwind {
entry:
; CHECK-LABEL: write_daifset:
-; CHECK: msr DAIFSET, #2
+; CHECK: msr DAIFSet, #2
call void @llvm.write_register.i64(metadata !2, i64 2)
ret void
}
diff --git a/test/CodeGen/AArch64/sqrt-fastmath.ll b/test/CodeGen/AArch64/sqrt-fastmath.ll
new file mode 100644
index 000000000000..0d9533fd27fc
--- /dev/null
+++ b/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -0,0 +1,160 @@
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=!sqrt,!vec-sqrt | FileCheck %s --check-prefix=FAULT
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=sqrt,vec-sqrt | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon,-use-reciprocal-square-root | FileCheck %s --check-prefix=FAULT
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon,+use-reciprocal-square-root | FileCheck %s
+
+declare float @llvm.sqrt.f32(float) #1
+declare double @llvm.sqrt.f64(double) #1
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #1
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #1
+
+define float @fsqrt(float %a) #0 {
+ %1 = tail call fast float @llvm.sqrt.f32(float %a)
+ ret float %1
+
+; FAULT-LABEL: fsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: fsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x float> @f2sqrt(<2 x float> %a) #0 {
+ %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #2
+ ret <2 x float> %1
+
+; FAULT-LABEL: f2sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f2sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define <4 x float> @f4sqrt(<4 x float> %a) #0 {
+ %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #2
+ ret <4 x float> %1
+
+; FAULT-LABEL: f4sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f4sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define double @dsqrt(double %a) #0 {
+ %1 = tail call fast double @llvm.sqrt.f64(double %a)
+ ret double %1
+
+; FAULT-LABEL: dsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: dsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x double> @d2sqrt(<2 x double> %a) #0 {
+ %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #2
+ ret <2 x double> %1
+
+; FAULT-LABEL: d2sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: d2sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define float @frsqrt(float %a) #0 {
+ %1 = tail call fast float @llvm.sqrt.f32(float %a)
+ %2 = fdiv fast float 1.000000e+00, %1
+ ret float %2
+
+; FAULT-LABEL: frsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: frsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
+ %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #2
+ %2 = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %1
+ ret <2 x float> %2
+
+; FAULT-LABEL: f2rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f2rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
+ %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #2
+ %2 = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %1
+ ret <4 x float> %2
+
+; FAULT-LABEL: f4rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f4rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define double @drsqrt(double %a) #0 {
+ %1 = tail call fast double @llvm.sqrt.f64(double %a)
+ %2 = fdiv fast double 1.000000e+00, %1
+ ret double %2
+
+; FAULT-LABEL: drsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: drsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x double> @d2rsqrt(<2 x double> %a) #0 {
+ %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #2
+ %2 = fdiv fast <2 x double> <double 1.000000e+00, double 1.000000e+00>, %1
+ ret <2 x double> %2
+
+; FAULT-LABEL: d2rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: d2rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
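+
+; Source-level shape of these tests (illustrative only, not part of the
+; original test): with fast-math and the estimate enabled, sqrt and 1/sqrt
+; are lowered to an FRSQRTE-based estimate sequence instead of FSQRT/FDIV.
+; #include <math.h>
+; float frsqrt_c(float a) { return 1.0f / sqrtf(a); }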
diff --git a/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll b/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
index eb4937e75f61..6f1515a98264 100644
--- a/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
+++ b/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=pic -disable-fp-elim | FileCheck %s
@__stack_chk_guard = external global i64*
@@ -6,10 +6,14 @@
; CHECK: adrp [[R0:x[0-9]+]], ___stack_chk_guard@GOTPAGE
; CHECK: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], ___stack_chk_guard@GOTPAGEOFF{{\]}}
+; Load the stack guard for the second time, just in case the previous value gets spilled.
+; CHECK: adrp [[GUARD_PAGE:x[0-9]+]], ___stack_chk_guard@GOTPAGE
; CHECK: ldr [[R2:x[0-9]+]], {{\[}}[[R1]]{{\]}}
; CHECK: stur [[R2]], {{\[}}x29, [[SLOT0:[0-9#\-]+]]{{\]}}
; CHECK: ldur [[R3:x[0-9]+]], {{\[}}x29, [[SLOT0]]{{\]}}
-; CHECK: sub [[R4:x[0-9]+]], [[R2]], [[R3]]
+; CHECK: ldr [[GUARD_ADDR:x[0-9]+]], {{\[}}[[GUARD_PAGE]], ___stack_chk_guard@GOTPAGEOFF{{\]}}
+; CHECK: ldr [[GUARD:x[0-9]+]], {{\[}}[[GUARD_ADDR]]{{\]}}
+; CHECK: sub [[R4:x[0-9]+]], [[GUARD]], [[R3]]
; CHECK: cbnz [[R4]], LBB
define i32 @test_stack_guard_remat2() {
diff --git a/test/CodeGen/AArch64/stack-protector-target.ll b/test/CodeGen/AArch64/stack-protector-target.ll
new file mode 100644
index 000000000000..d4d806289bff
--- /dev/null
+++ b/test/CodeGen/AArch64/stack-protector-target.ll
@@ -0,0 +1,19 @@
+; Test target-specific stack cookie location.
+; RUN: llc -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefix=ANDROID-AARCH64 %s
+
+define void @_Z1fv() sspreq {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @_Z7CapturePi(i32* nonnull %x)
+ ret void
+}
+
+declare void @_Z7CapturePi(i32*)
+
+; ANDROID-AARCH64: mrs [[A:.*]], TPIDR_EL0
+; ANDROID-AARCH64: ldr [[B:.*]], {{\[}}[[A]], #40]
+; ANDROID-AARCH64: str [[B]], [sp,
+; ANDROID-AARCH64: ldr [[C:.*]], {{\[}}[[A]], #40]
+; ANDROID-AARCH64: ldr [[D:.*]], [sp,
+; ANDROID-AARCH64: cmp [[C]], [[D]]
diff --git a/test/CodeGen/AArch64/stackmap-frame-setup.ll b/test/CodeGen/AArch64/stackmap-frame-setup.ll
index 4712012b0d25..5646703fa403 100644
--- a/test/CodeGen/AArch64/stackmap-frame-setup.ll
+++ b/test/CodeGen/AArch64/stackmap-frame-setup.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=aarch64-apple-darwin -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
-; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
+; RUN: llc -o - -verify-machineinstrs -mtriple=aarch64-apple-darwin -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
+; RUN: llc -o - -verify-machineinstrs -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
define void @caller_meta_leaf() {
entry:
diff --git a/test/CodeGen/AArch64/stackmap-liveness.ll b/test/CodeGen/AArch64/stackmap-liveness.ll
index 6b37aac16f9e..224a9c418526 100644
--- a/test/CodeGen/AArch64/stackmap-liveness.ll
+++ b/test/CodeGen/AArch64/stackmap-liveness.ll
@@ -37,7 +37,7 @@ define i64 @stackmap_liveness(i1 %c) {
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .byte 8
; Align
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
%1 = select i1 %c, i64 1, i64 2
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 32, i8* null, i32 0)
ret i64 %1
diff --git a/test/CodeGen/AArch64/subs-to-sub-opt.ll b/test/CodeGen/AArch64/subs-to-sub-opt.ll
new file mode 100644
index 000000000000..f33e24e777fe
--- /dev/null
+++ b/test/CodeGen/AArch64/subs-to-sub-opt.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -O3 -o - %s | FileCheck %s
+
+@a = external global i8, align 1
+@b = external global i8, align 1
+
+; Test that SUBS is replaced by SUB if condition flags are not used.
+define i32 @test01() nounwind {
+; CHECK: ldrb {{.*}}
+; CHECK-NEXT: ldrb {{.*}}
+; CHECK-NEXT: sub {{.*}}
+; CHECK-NEXT: cmn {{.*}}
+entry:
+ %0 = load i8, i8* @a, align 1
+ %conv = zext i8 %0 to i32
+ %1 = load i8, i8* @b, align 1
+ %conv1 = zext i8 %1 to i32
+ %s = sub nsw i32 %conv1, %conv
+ %cmp0 = icmp eq i32 %s, -1
+ %cmp1 = sext i1 %cmp0 to i8
+ store i8 %cmp1, i8* @a
+ ret i32 0
+}
+
diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll
new file mode 100644
index 000000000000..a0bfffdef95e
--- /dev/null
+++ b/test/CodeGen/AArch64/swifterror.ll
@@ -0,0 +1,385 @@
+; RUN: llc -verify-machineinstrs -disable-fp-elim < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck --check-prefix=CHECK-APPLE %s
+; RUN: llc -verify-machineinstrs -disable-fp-elim -O0 < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck --check-prefix=CHECK-O0 %s
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+%swift_error = type {i64, i8}
+
+; This tests the basic usage of a swifterror parameter. "foo" is the function
+; that takes a swifterror parameter and "caller" is the caller of "foo".
+define float @foo(%swift_error** swifterror %error_ptr_ref) {
+; CHECK-APPLE-LABEL: foo:
+; CHECK-APPLE: orr w0, wzr, #0x10
+; CHECK-APPLE: malloc
+; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1
+; CHECK-APPLE: strb [[ID]], [x0, #8]
+; CHECK-APPLE: mov x19, x0
+; CHECK-APPLE-NOT: x19
+
+; CHECK-O0-LABEL: foo:
+; CHECK-O0: orr w{{.*}}, wzr, #0x10
+; CHECK-O0: malloc
+; CHECK-O0: mov [[ID2:x[0-9]+]], x0
+; CHECK-O0: orr [[ID:w[0-9]+]], wzr, #0x1
+; CHECK-O0: strb [[ID]], [x0, #8]
+; CHECK-O0: mov x19, [[ID2]]
+; CHECK-O0-NOT: x19
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+}
+
+; "caller" calls "foo" that takes a swifterror parameter.
+define float @caller(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller:
+; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
+; CHECK-APPLE: mov x19, xzr
+; CHECK-APPLE: bl {{.*}}foo
+; CHECK-APPLE: cbnz x19
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8]
+; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-APPLE: mov x0, x19
+; CHECK_APPLE: bl {{.*}}free
+
+; CHECK-O0-LABEL: caller:
+; CHECK-O0: mov x19
+; CHECK-O0: bl {{.*}}foo
+; CHECK-O0: mov [[ID:x[0-9]+]], x19
+; CHECK-O0: cbnz [[ID]]
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "caller2" is the caller of "foo", it calls "foo" inside a loop.
+define float @caller2(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller2:
+; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
+; CHECK-APPLE: fmov [[CMP:s[0-9]+]], #1.0
+; CHECK-APPLE: mov x19, xzr
+; CHECK-APPLE: bl {{.*}}foo
+; CHECK-APPLE: cbnz x19
+; CHECK-APPLE: fcmp s0, [[CMP]]
+; CHECK-APPLE: b.le
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8]
+; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-APPLE: mov x0, x19
+; CHECK_APPLE: bl {{.*}}free
+
+; CHECK-O0-LABEL: caller2:
+; CHECK-O0: mov x19
+; CHECK-O0: bl {{.*}}foo
+; CHECK-O0: mov [[ID:x[0-9]+]], x19
+; CHECK-O0: cbnz [[ID]]
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ br label %bb_loop
+bb_loop:
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %cmp = fcmp ogt float %call, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "foo_if" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition.
+define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) {
+; CHECK-APPLE-LABEL: foo_if:
+; CHECK-APPLE: cbz w0
+; CHECK-APPLE: orr w0, wzr, #0x10
+; CHECK-APPLE: malloc
+; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1
+; CHECK-APPLE: strb [[ID]], [x0, #8]
+; CHECK-APPLE: mov x19, x0
+; CHECK-APPLE-NOT: x19
+; CHECK-APPLE: ret
+
+; CHECK-O0-LABEL: foo_if:
+; spill x19
+; CHECK-O0: str x19
+; CHECK-O0: cbz w0
+; CHECK-O0: orr w{{.*}}, wzr, #0x10
+; CHECK-O0: malloc
+; CHECK-O0: mov [[ID:x[0-9]+]], x0
+; CHECK-O0: orr [[ID2:w[0-9]+]], wzr, #0x1
+; CHECK-O0: strb [[ID2]], [x0, #8]
+; CHECK-O0: mov x19, [[ID]]
+; CHECK-O0: ret
+; reload from stack
+; CHECK-O0: ldr x19
+; CHECK-O0: ret
+entry:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %normal
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+
+normal:
+ ret float 0.0
+}
+
+; "foo_loop" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition inside a loop.
+define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float %cc2) {
+; CHECK-APPLE-LABEL: foo_loop:
+; CHECK-APPLE: mov x0, x19
+; CHECK-APPLE: cbz
+; CHECK-APPLE: orr w0, wzr, #0x10
+; CHECK-APPLE: malloc
+; CHECK-APPLE: strb w{{.*}}, [x0, #8]
+; CHECK-APPLE: fcmp
+; CHECK-APPLE: b.le
+; CHECK-APPLE: mov x19, x0
+; CHECK-APPLE: ret
+
+; CHECK-O0-LABEL: foo_loop:
+; spill x19
+; CHECK-O0: str x19
+; CHECK-O0: cbz
+; CHECK-O0: orr w{{.*}}, wzr, #0x10
+; CHECK-O0: malloc
+; CHECK-O0: mov [[ID:x[0-9]+]], x0
+; CHECK-O0: strb w{{.*}}, [{{.*}}[[ID]], #8]
+; spill x0
+; CHECK-O0: str x0
+; CHECK-O0: fcmp
+; CHECK-O0: b.le
+; reload from stack
+; CHECK-O0: ldr x19
+; CHECK-O0: ret
+entry:
+ br label %bb_loop
+
+bb_loop:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %bb_cont
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ br label %bb_cont
+
+bb_cont:
+ %cmp = fcmp ogt float %cc2, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ ret float 0.0
+}
+
+%struct.S = type { i32, i32, i32, i32, i32, i32 }
+
+; "foo_sret" is a function that takes a swifterror parameter, it also has a sret
+; parameter.
+define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swifterror %error_ptr_ref) {
+; CHECK-APPLE-LABEL: foo_sret:
+; CHECK-APPLE: mov [[SRET:x[0-9]+]], x8
+; CHECK-APPLE: orr w0, wzr, #0x10
+; CHECK-APPLE: malloc
+; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1
+; CHECK-APPLE: strb [[ID]], [x0, #8]
+; CHECK-APPLE: str w{{.*}}, [{{.*}}[[SRET]], #4]
+; CHECK-APPLE: mov x19, x0
+; CHECK-APPLE-NOT: x19
+
+; CHECK-O0-LABEL: foo_sret:
+; CHECK-O0: orr w{{.*}}, wzr, #0x10
+; spill x8
+; CHECK-O0-DAG: str x8
+; spill x19
+; CHECK-O0-DAG: str x19
+; CHECK-O0: malloc
+; CHECK-O0: orr [[ID:w[0-9]+]], wzr, #0x1
+; CHECK-O0: strb [[ID]], [x0, #8]
+; reload from stack
+; CHECK-O0: ldr [[SRET:x[0-9]+]]
+; CHECK-O0: str w{{.*}}, [{{.*}}[[SRET]], #4]
+; CHECK-O0: mov x19
+; CHECK-O0-NOT: x19
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ %v2 = getelementptr inbounds %struct.S, %struct.S* %agg.result, i32 0, i32 1
+ store i32 %val1, i32* %v2
+ ret void
+}
+
+; "caller3" calls "foo_sret" that takes a swifterror parameter.
+define float @caller3(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller3:
+; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
+; CHECK-APPLE: mov x19, xzr
+; CHECK-APPLE: bl {{.*}}foo_sret
+; CHECK-APPLE: cbnz x19
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8]
+; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-APPLE: mov x0, x19
+; CHECK-APPLE: bl {{.*}}free
+
+; CHECK-O0-LABEL: caller3:
+; spill x0
+; CHECK-O0: str x0
+; CHECK-O0: mov x19
+; CHECK-O0: bl {{.*}}foo_sret
+; CHECK-O0: mov [[ID2:x[0-9]+]], x19
+; CHECK-O0: cbnz [[ID2]]
+; Access part of the error object and save it to error_ref
+; reload from stack
+; CHECK-O0: ldrb [[CODE:w[0-9]+]]
+; CHECK-O0: ldr [[ID:x[0-9]+]]
+; CHECK-O0: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-O0: bl {{.*}}free
+entry:
+ %s = alloca %struct.S, align 8
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ call void @foo_sret(%struct.S* sret %s, i32 1, %swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "foo_vararg" is a function that takes a swifterror parameter, it also has
+; variable number of arguments.
+declare void @llvm.va_start(i8*) nounwind
+define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) {
+; CHECK-APPLE-LABEL: foo_vararg:
+; CHECK-APPLE: orr w0, wzr, #0x10
+; CHECK-APPLE: malloc
+; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1
+; CHECK-APPLE: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16
+; CHECK-APPLE: strb [[ID]], [x0, #8]
+
+; First vararg
+; CHECK-APPLE-DAG: orr {{x[0-9]+}}, [[ARGS]], #0x8
+; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #16]
+; CHECK-APPLE: add {{x[0-9]+}}, {{x[0-9]+}}, #8
+; Second vararg
+; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-APPLE: add {{x[0-9]+}}, {{x[0-9]+}}, #8
+; Third vararg
+; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+
+; CHECK-APPLE: mov x19, x0
+; CHECK-APPLE-NOT: x19
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+
+ %args = alloca i8*, align 8
+ %a10 = alloca i32, align 4
+ %a11 = alloca i32, align 4
+ %a12 = alloca i32, align 4
+ %v10 = bitcast i8** %args to i8*
+ call void @llvm.va_start(i8* %v10)
+ %v11 = va_arg i8** %args, i32
+ store i32 %v11, i32* %a10, align 4
+ %v12 = va_arg i8** %args, i32
+ store i32 %v12, i32* %a11, align 4
+ %v13 = va_arg i8** %args, i32
+ store i32 %v13, i32* %a12, align 4
+
+ ret float 1.0
+}
+
+; "caller4" calls "foo_vararg" that takes a swifterror parameter.
+define float @caller4(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller4:
+
+; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
+; CHECK-APPLE: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
+; CHECK-APPLE: str {{x[0-9]+}}, [sp]
+
+; CHECK-APPLE: mov x19, xzr
+; CHECK-APPLE: bl {{.*}}foo_vararg
+; CHECK-APPLE: cbnz x19
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8]
+; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-APPLE: mov x0, x19
+; CHECK-APPLE: bl {{.*}}free
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+
+ %a10 = alloca i32, align 4
+ %a11 = alloca i32, align 4
+ %a12 = alloca i32, align 4
+ store i32 10, i32* %a10, align 4
+ store i32 11, i32* %a11, align 4
+ store i32 12, i32* %a12, align 4
+ %v10 = load i32, i32* %a10, align 4
+ %v11 = load i32, i32* %a11, align 4
+ %v12 = load i32, i32* %a12, align 4
+
+ %call = call float (%swift_error**, ...) @foo_vararg(%swift_error** swifterror %error_ptr_ref, i32 %v10, i32 %v11, i32 %v12)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
diff --git a/test/CodeGen/AArch64/swiftself.ll b/test/CodeGen/AArch64/swiftself.ll
new file mode 100644
index 000000000000..a60aed6b0f2b
--- /dev/null
+++ b/test/CodeGen/AArch64/swiftself.ll
@@ -0,0 +1,67 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-unknown-linux-gnu -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s
+
+; A parameter with the swiftself attribute should be allocated to x20.
+; CHECK-LABEL: swiftself_param:
+; CHECK: mov x0, x20
+; CHECK-NEXT: ret
+define i8* @swiftself_param(i8* swiftself %addr0) {
+ ret i8* %addr0
+}
+
+; Check that x20 is used to pass a swiftself argument.
+; CHECK-LABEL: call_swiftself:
+; CHECK: mov x20, x0
+; CHECK: bl {{_?}}swiftself_param
+; CHECK: ret
+define i8* @call_swiftself(i8* %arg) {
+ %res = call i8* @swiftself_param(i8* swiftself %arg)
+ ret i8* %res
+}
+
+; x20 should be saved by the callee even if used for swiftself
+; CHECK-LABEL: swiftself_clobber:
+; CHECK: {{stp|str}} {{.*}}x20{{.*}}sp
+; ...
+; CHECK: {{ldp|ldr}} {{.*}}x20{{.*}}sp
+; CHECK: ret
+define i8* @swiftself_clobber(i8* swiftself %addr0) {
+ call void asm sideeffect "", "~{x20}"()
+ ret i8* %addr0
+}
+
+; Demonstrate that we do not need any movs when calling multiple functions
+; with a swiftself argument.
+; CHECK-LABEL: swiftself_passthrough:
+; OPT-NOT: mov{{.*}}x20
+; OPT: bl {{_?}}swiftself_param
+; OPT-NOT: mov{{.*}}x20
+; OPT-NEXT: bl {{_?}}swiftself_param
+; OPT: ret
+define void @swiftself_passthrough(i8* swiftself %addr0) {
+ call i8* @swiftself_param(i8* swiftself %addr0)
+ call i8* @swiftself_param(i8* swiftself %addr0)
+ ret void
+}
+
+; We can use a tail call if the callee's swiftself argument is the same as the caller's.
+; CHECK-LABEL: swiftself_tail:
+; OPT: b {{_?}}swiftself_param
+; OPT-NOT: ret
+define i8* @swiftself_tail(i8* swiftself %addr0) {
+ call void asm sideeffect "", "~{x20}"()
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr0)
+ ret i8* %res
+}
+
+; We cannot use a tail call if the callee's swiftself argument is not the same
+; as the caller's.
+; CHECK-LABEL: swiftself_notail:
+; CHECK: mov x20, x0
+; CHECK: bl {{_?}}swiftself_param
+; CHECK: ret
+define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind {
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr1)
+ ret i8* %res
+}
diff --git a/test/CodeGen/AArch64/tailcall-ccmismatch.ll b/test/CodeGen/AArch64/tailcall-ccmismatch.ll
new file mode 100644
index 000000000000..ab96e609dd46
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-ccmismatch.ll
@@ -0,0 +1,24 @@
+; RUN: llc -o - %s | FileCheck %s
+target triple="aarch64--"
+
+declare void @somefunc()
+define preserve_mostcc void @test_ccmismatch_notail() {
+; Ensure that no tail call is used here, as the called function somefunc does
+; not preserve enough registers for preserve_mostcc.
+; CHECK-LABEL: test_ccmismatch_notail:
+; CHECK-NOT: b somefunc
+; CHECK: bl somefunc
+ tail call void @somefunc()
+ ret void
+}
+
+declare preserve_mostcc void @some_preserve_most_func()
+define void @test_ccmismatch_tail() {
+; We can perform a tail call here, because some_preserve_most_func preserves
+; all registers necessary for test_ccmismatch_tail.
+; CHECK-LABEL: test_ccmismatch_tail:
+; CHECK-NOT: bl some_preserve_most_func
+; CHECK: b some_preserve_most_func
+ tail call preserve_mostcc void @some_preserve_most_func()
+ ret void
+}
diff --git a/test/CodeGen/AArch64/tailcall-implicit-sret.ll b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
index 5d6805998d22..3955877b09b7 100644
--- a/test/CodeGen/AArch64/tailcall-implicit-sret.ll
+++ b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -disable-post-ra -asm-verbose=false | FileCheck %s
; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/tailcall_misched_graph.ll b/test/CodeGen/AArch64/tailcall_misched_graph.ll
index 343ffab57e35..59a3be905f17 100644
--- a/test/CodeGen/AArch64/tailcall_misched_graph.ll
+++ b/test/CodeGen/AArch64/tailcall_misched_graph.ll
@@ -37,6 +37,8 @@ declare void @callee2(i8*, i8*, i8*, i8*, i8*,
; CHECK: SU({{.*}}): [[VRB]]<def> = LDRXui <fi#-2>
; CHECK-NOT: SU
; CHECK: Successors:
-; CHECK: ch SU([[DEPSTORE:.*]]): Latency=0
+; CHECK: ch SU([[DEPSTOREB:.*]]): Latency=0
+; CHECK: ch SU([[DEPSTOREA:.*]]): Latency=0
-; CHECK: SU([[DEPSTORE]]): STRXui %vreg0, <fi#-4>
+; CHECK: SU([[DEPSTOREA]]): STRXui %vreg{{.*}}, <fi#-4>
+; CHECK: SU([[DEPSTOREB]]): STRXui %vreg{{.*}}, <fi#-3>
diff --git a/test/CodeGen/AArch64/tailmerging_in_mbp.ll b/test/CodeGen/AArch64/tailmerging_in_mbp.ll
new file mode 100644
index 000000000000..d850801ee54a
--- /dev/null
+++ b/test/CodeGen/AArch64/tailmerging_in_mbp.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=aarch64 -verify-machine-dom-info | FileCheck %s
+
+; CHECK-LABEL: test:
+; CHECK: LBB0_7:
+; CHECK: b.hi
+; CHECK-NEXT: b
+; CHECK-NEXT: LBB0_8:
+; CHECK-NEXT: mov x8, x9
+; CHECK-NEXT: LBB0_9:
+define i64 @test(i64 %n, i64* %a, i64* %b, i64* %c, i64* %d, i64* %e, i64* %f) {
+entry:
+ %cmp28 = icmp sgt i64 %n, 1
+ br i1 %cmp28, label %for.body, label %for.end
+
+for.body: ; preds = %for.body.lr.ph, %if.end
+ %j = phi i64 [ %n, %entry ], [ %div, %if.end ]
+ %div = lshr i64 %j, 1
+ %a.arrayidx = getelementptr inbounds i64, i64* %a, i64 %div
+ %a.j = load i64, i64* %a.arrayidx
+ %b.arrayidx = getelementptr inbounds i64, i64* %b, i64 %div
+ %b.j = load i64, i64* %b.arrayidx
+ %cmp.i = icmp slt i64 %a.j, %b.j
+ br i1 %cmp.i, label %for.end.loopexit, label %cond.false.i
+
+cond.false.i: ; preds = %for.body
+ %cmp4.i = icmp sgt i64 %a.j, %b.j
+ br i1 %cmp4.i, label %if.end, label %cond.false6.i
+
+cond.false6.i: ; preds = %cond.false.i
+ %c.arrayidx = getelementptr inbounds i64, i64* %c, i64 %div
+ %c.j = load i64, i64* %c.arrayidx
+ %d.arrayidx = getelementptr inbounds i64, i64* %d, i64 %div
+ %d.j = load i64, i64* %d.arrayidx
+ %cmp9.i = icmp slt i64 %c.j, %d.j
+ br i1 %cmp9.i, label %for.end.loopexit, label %cond.false11.i
+
+cond.false11.i: ; preds = %cond.false6.i
+ %cmp14.i = icmp sgt i64 %c.j, %d.j
+ br i1 %cmp14.i, label %if.end, label %cond.false12.i
+
+cond.false12.i: ; preds = %cond.false11.i
+ %e.arrayidx = getelementptr inbounds i64, i64* %e, i64 %div
+ %e.j = load i64, i64* %e.arrayidx
+ %f.arrayidx = getelementptr inbounds i64, i64* %f, i64 %div
+ %f.j = load i64, i64* %f.arrayidx
+ %cmp19.i = icmp sgt i64 %e.j, %f.j
+ br i1 %cmp19.i, label %if.end, label %for.end.loopexit
+
+if.end: ; preds = %cond.false12.i, %cond.false11.i, %cond.false.i
+ %cmp = icmp ugt i64 %j, 3
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %cond.false12.i, %cond.false6.i, %for.body, %if.end
+ %j.0.lcssa.ph = phi i64 [ %j, %cond.false12.i ], [ %j, %cond.false6.i ], [ %j, %for.body ], [ %div, %if.end ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %j.0.lcssa = phi i64 [ %n, %entry ], [ %j.0.lcssa.ph, %for.end.loopexit ]
+ %j.2 = add i64 %j.0.lcssa, %n
+ %j.3 = mul i64 %j.2, %n
+ %j.4 = add i64 %j.3, 10
+ ret i64 %j.4
+}
diff --git a/test/CodeGen/AArch64/vcvt-oversize.ll b/test/CodeGen/AArch64/vcvt-oversize.ll
index 066a4b666204..b6e25cfadaa9 100644
--- a/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -2,8 +2,9 @@
define <8 x i8> @float_to_i8(<8 x float>* %in) {
; CHECK-LABEL: float_to_i8:
-; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
-; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
+; CHECK: ldp q1, q0, [x0]
+; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v1.4s, v1.4s
+; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v0.4s, v0.4s
; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB]].4s
diff --git a/test/CodeGen/AArch64/vector-fcopysign.ll b/test/CodeGen/AArch64/vector-fcopysign.ll
index 865a0a5b8580..47d75d5ecc61 100644
--- a/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -7,7 +7,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
; WidenVecRes same
define <1 x float> @test_copysign_v1f32_v1f32(<1 x float> %a, <1 x float> %b) #0 {
; CHECK-LABEL: test_copysign_v1f32_v1f32:
-; CHECK-NEXT: movi.2s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.2s v2, #128, lsl #24
; CHECK-NEXT: bit.8b v0, v1, v2
; CHECK-NEXT: ret
%r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b)
@@ -18,7 +18,7 @@ define <1 x float> @test_copysign_v1f32_v1f32(<1 x float> %a, <1 x float> %b) #0
define <1 x float> @test_copysign_v1f32_v1f64(<1 x float> %a, <1 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v1f32_v1f64:
; CHECK-NEXT: fcvt s1, d1
-; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.4s v2, #128, lsl #24
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <1 x double> %b to <1 x float>
@@ -59,7 +59,7 @@ declare <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) #0
define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_copysign_v2f32_v2f32:
-; CHECK-NEXT: movi.2s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.2s v2, #128, lsl #24
; CHECK-NEXT: bit.8b v0, v1, v2
; CHECK-NEXT: ret
%r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b)
@@ -69,7 +69,7 @@ define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0
define <2 x float> @test_copysign_v2f32_v2f64(<2 x float> %a, <2 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v2f32_v2f64:
; CHECK-NEXT: fcvtn v1.2s, v1.2d
-; CHECK-NEXT: movi.2s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.2s v2, #128, lsl #24
; CHECK-NEXT: bit.8b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <2 x double> %b to <2 x float>
@@ -83,7 +83,7 @@ declare <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) #0
define <4 x float> @test_copysign_v4f32_v4f32(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: test_copysign_v4f32_v4f32:
-; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: movi.4s v2, #128, lsl #24
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: ret
%r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b)
@@ -94,21 +94,21 @@ define <4 x float> @test_copysign_v4f32_v4f32(<4 x float> %a, <4 x float> %b) #0
define <4 x float> @test_copysign_v4f32_v4f64(<4 x float> %a, <4 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v4f32_v4f64:
; CHECK-NEXT: mov s3, v0[1]
-; CHECK-NEXT: mov d4, v1[1]
-; CHECK-NEXT: movi.4s v5, #0x80, lsl #24
-; CHECK-NEXT: fcvt s1, d1
+; CHECK-NEXT: movi.4s v4, #128, lsl #24
+; CHECK-NEXT: fcvt s5, d1
; CHECK-NEXT: mov s6, v0[2]
; CHECK-NEXT: mov s7, v0[3]
-; CHECK-NEXT: fcvt s16, d2
-; CHECK-NEXT: bit.16b v0, v1, v5
-; CHECK-NEXT: bit.16b v6, v16, v5
-; CHECK-NEXT: fcvt s1, d4
-; CHECK-NEXT: bit.16b v3, v1, v5
+; CHECK-NEXT: bit.16b v0, v5, v4
+; CHECK-NEXT: fcvt s5, d2
+; CHECK-NEXT: bit.16b v6, v5, v4
+; CHECK-NEXT: mov d1, v1[1]
+; CHECK-NEXT: fcvt s1, d1
+; CHECK-NEXT: bit.16b v3, v1, v4
; CHECK-NEXT: mov d1, v2[1]
; CHECK-NEXT: fcvt s1, d1
; CHECK-NEXT: ins.s v0[1], v3[0]
; CHECK-NEXT: ins.s v0[2], v6[0]
-; CHECK-NEXT: bit.16b v7, v1, v5
+; CHECK-NEXT: bit.16b v7, v1, v4
; CHECK-NEXT: ins.s v0[3], v7[0]
; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x float>
diff --git a/test/CodeGen/AArch64/vector_merge_dep_check.ll b/test/CodeGen/AArch64/vector_merge_dep_check.ll
new file mode 100644
index 000000000000..9220947e8362
--- /dev/null
+++ b/test/CodeGen/AArch64/vector_merge_dep_check.ll
@@ -0,0 +1,41 @@
+; RUN: llc --combiner-alias-analysis=false < %s | FileCheck %s
+; RUN: llc --combiner-alias-analysis=true < %s | FileCheck %s
+
+; This test checks that we do not merge stores that have dependencies through
+; their non-chain operands (e.g. one store is the chain ancestor of a load
+; whose value is used as the data for the other store). Merging in such cases
+; creates a loop in the DAG.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android"
+
+%"class.std::__1::complex.0.20.56.60.64.72.76.88.92.112.140.248" = type { float, float }
+
+; Function Attrs: noinline norecurse nounwind ssp uwtable
+define void @fn(<2 x i64>* %argA, <2 x i64>* %argB, i64* %a) #0 align 2 {
+ %_p_vec_full = load <2 x i64>, <2 x i64>* %argA, align 4, !alias.scope !1, !noalias !3
+ %x = extractelement <2 x i64> %_p_vec_full, i32 1
+ store i64 %x, i64* %a, align 8, !alias.scope !4, !noalias !9
+ %_p_vec_full155 = load <2 x i64>, <2 x i64>* %argB, align 4, !alias.scope !1, !noalias !3
+ %y = extractelement <2 x i64> %_p_vec_full155, i32 0
+ %scevgep41 = getelementptr i64, i64* %a, i64 -1
+ store i64 %y, i64* %scevgep41, align 8, !alias.scope !4, !noalias !9
+ ret void
+}
+
+; CHECK: ret
+
+attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"Snapdragon LLVM ARM Compiler 3.8.0 (based on LLVM 3.8.0)"}
+!1 = distinct !{!1, !2, !"polly.alias.scope.rhs"}
+!2 = distinct !{!2, !"polly.alias.scope.domain"}
+!3 = !{!4, !5, !6, !7, !8}
+!4 = distinct !{!4, !2, !"polly.alias.scope.blockB"}
+!5 = distinct !{!5, !2, !"polly.alias.scope.add28.lcssa.reg2mem"}
+!6 = distinct !{!6, !2, !"polly.alias.scope.count.0.lcssa.reg2mem"}
+!7 = distinct !{!7, !2, !"polly.alias.scope.mul"}
+!8 = distinct !{!8, !2, !"polly.alias.scope.add28.us.lcssa.reg2mem"}
+!9 = !{!1, !5, !6, !7, !8}
diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index c7bcfd2ddab2..ff8c90457876 100644
--- a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
@@ -91,12 +91,12 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
@ptr = addrspace(3) global i32 addrspace(3)* undef
-@dst = addrspace(3) global [16384 x i32] undef
+@dst = addrspace(3) global [16383 x i32] undef
; FUNC-LABEL: {{^}}global_ptr:
; SI: ds_write_b32
define void @global_ptr() nounwind {
- store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
+ store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
ret void
}
diff --git a/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
new file mode 100644
index 000000000000..62b09dfedf15
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel %s -o - 2>&1 | FileCheck %s
+; REQUIRES: global-isel
+; This file checks that the translation from LLVM IR to generic MachineInstr
+; is correct.
+
+; Tests for add.
+; CHECK: name: addi32
+; CHECK: G_ADD i32
+define i32 @addi32(i32 %arg1, i32 %arg2) {
+ %res = add i32 %arg1, %arg2
+ ret i32 %res
+}
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index 2ddfa9649ac9..f37247361ece 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
;FUNC-LABEL: {{^}}test1:
;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -123,12 +123,11 @@ entry:
; SI: s_add_u32
; SI: s_addc_u32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-DAG: ADDC_UINT
; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
@@ -145,12 +144,11 @@ entry:
; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
; SI-NOT: v_addc_u32_e32 s
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-DAG: ADDC_UINT
; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
entry:
@@ -165,12 +163,11 @@ entry:
; SI: s_add_u32
; SI: s_addc_u32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-DAG: ADDC_UINT
; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll
index 8346add7df97..3d360b7d0b7a 100644
--- a/test/CodeGen/AMDGPU/add_i64.ll
+++ b/test/CodeGen/AMDGPU/add_i64.ll
@@ -1,13 +1,13 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() readnone
+declare i32 @llvm.amdgcn.workitem.id.x() readnone
; SI-LABEL: {{^}}test_i64_vreg:
; SI: v_add_i32
; SI: v_addc_u32
define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
%a = load i64, i64 addrspace(1)* %a_ptr
@@ -59,7 +59,7 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a,
; SI: v_add_i32
; SI: v_addc_u32
define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
%a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
diff --git a/test/CodeGen/AMDGPU/address-space.ll b/test/CodeGen/AMDGPU/address-space.ll
deleted file mode 100644
index 3aa2f653bf9c..000000000000
--- a/test/CodeGen/AMDGPU/address-space.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; Test that codegenprepare understands address space sizes
-
-%struct.foo = type { [3 x float], [3 x float] }
-
-; CHECK-LABEL: {{^}}do_as_ptr_calcs:
-; CHECK: s_load_dword [[SREG1:s[0-9]+]],
-; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20
-define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
-entry:
- %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
- %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
- br label %bb32
-
-bb32:
- %a = load float, float addrspace(3)* %x, align 4
- %b = load float, float addrspace(3)* %y, align 4
- %cmp = fcmp one float %a, %b
- br i1 %cmp, label %bb34, label %bb33
-
-bb33:
- unreachable
-
-bb34:
- unreachable
-}
-
-
diff --git a/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
new file mode 100644
index 000000000000..67a193999204
--- /dev/null
+++ b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -0,0 +1,106 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
+
+declare void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* nocapture, i32 addrspace(4)* nocapture, i32, i32, i1) #0
+
+@lds.i32 = unnamed_addr addrspace(3) global i32 undef, align 4
+@lds.arr = unnamed_addr addrspace(3) global [256 x i32] undef, align 4
+
+@global.i32 = unnamed_addr addrspace(1) global i32 undef, align 4
+@global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4
+
+; HSA: @store_cast_0_flat_to_group_addrspacecast() #1
+define void @store_cast_0_flat_to_group_addrspacecast() #1 {
+ store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
+ ret void
+}
+
+; HSA: @store_cast_0_group_to_flat_addrspacecast() #2
+define void @store_cast_0_group_to_flat_addrspacecast() #1 {
+ store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*)
+ ret void
+}
+
+; HSA: define void @store_constant_cast_group_gv_to_flat() #2
+define void @store_constant_cast_group_gv_to_flat() #1 {
+ store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*)
+ ret void
+}
+
+; HSA: @store_constant_cast_group_gv_gep_to_flat() #2
+define void @store_constant_cast_group_gv_gep_to_flat() #1 {
+ store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
+ ret void
+}
+
+; HSA: @store_constant_cast_global_gv_to_flat() #1
+define void @store_constant_cast_global_gv_to_flat() #1 {
+ store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*)
+ ret void
+}
+
+; HSA: @store_constant_cast_global_gv_gep_to_flat() #1
+define void @store_constant_cast_global_gv_gep_to_flat() #1 {
+ store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
+ ret void
+}
+
+; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
+define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+ %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
+define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+ %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
+define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+ %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, i32 addrspace(1)* %out
+ ret void
+}
+
+; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
+define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+ call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false)
+ ret void
+}
+
+; Can't just search the pointer value
+; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2
+define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
+ store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out
+ ret void
+}
+
+; Can't just search pointer types
+; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2
+define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
+ store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out
+ ret void
+}
+
+; Cast group to flat, do GEP, cast back to group
+; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2
+define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
+ store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
+ ret void
+}
+
+; HSA: @ret_constant_cast_group_gv_gep_to_flat_to_group() #2
+define i32 addrspace(3)* @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 {
+ ret i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
+}
+
+; HSA: attributes #0 = { argmemonly nounwind }
+; HSA: attributes #1 = { nounwind }
+; HSA: attributes #2 = { nounwind "amdgpu-queue-ptr" }
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index 61bcd4b3c093..5a173e954f8d 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,18 +1,208 @@
-; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
-; ERROR: unsupported addrspacecast not implemented
+; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 1
-; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+
+; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+
+; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
+; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
+; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 1
+
+; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+
+; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
+; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
+; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
+ %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; no-op
+; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; no-op
+; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
+define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
+ %ld = load volatile i32, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
+; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
+define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+ store volatile i32 0, i32 addrspace(3)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
+; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
+ store volatile i32 0, i32* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
+; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
+; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
+define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+ load volatile i32, i32 addrspace(2)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_0_group_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: ds_write_b32 [[PTR]], [[K]]
+define void @cast_0_flat_to_group_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
+ store i32 7, i32 addrspace(3)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_neg1_group_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: ds_write_b32 [[PTR]], [[K]]
+define void @cast_neg1_flat_to_group_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
+ store i32 7, i32 addrspace(3)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
+; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_0_private_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32* null to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+define void @cast_0_flat_to_private_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
+ store i32 7, i32* %cast
+ ret void
+}
; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.
-; CHECK-LABEL: {{^}}branch_use_flat_i32:
-; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: s_endpgm
+; HSA-LABEL: {{^}}branch_use_flat_i32:
+; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
+; HSA: s_endpgm
define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
%cmp = icmp ne i32 %c, 0
@@ -34,33 +224,30 @@ end:
ret void
}
-; TODO: This should not be zero when registers are used for small
-; scratch allocations again.
-
; Check for prologue initializing special SGPRs pointing to scratch.
-; CHECK-LABEL: {{^}}store_flat_scratch:
-; CHECK: s_movk_i32 flat_scratch_lo, 0
-; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
-; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
-; CHECK: flat_store_dword
-; CHECK: s_barrier
-; CHECK: flat_load_dword
+; HSA-LABEL: {{^}}store_flat_scratch:
+; HSA-DAG: s_mov_b32 flat_scratch_lo, s9
+; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
+; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+; HSA: flat_store_dword
+; HSA: s_barrier
+; HSA: flat_load_dword
define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
%alloca = alloca i32, i32 9, align 4
- %x = call i32 @llvm.r600.read.tidig.x() #3
+ %x = call i32 @llvm.amdgcn.workitem.id.x() #2
%pptr = getelementptr i32, i32* %alloca, i32 %x
%fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
store i32 %x, i32 addrspace(4)* %fptr
; Dummy call
- call void @llvm.AMDGPU.barrier.local() #1
+ call void @llvm.amdgcn.s.barrier() #1
%reload = load i32, i32 addrspace(4)* %fptr, align 4
store i32 %reload, i32 addrspace(1)* %out, align 4
ret void
}
-declare void @llvm.AMDGPU.barrier.local() #1
-declare i32 @llvm.r600.read.tidig.x() #3
+declare void @llvm.amdgcn.s.barrier() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
-attributes #3 = { nounwind readnone }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
new file mode 100644
index 000000000000..ad6843770fd6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
+; RUN: llc -mattr=+promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE -check-prefix=HSA %s
+; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
+; RUN: llc -mattr=-promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA -check-prefix=HSA %s
+; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
+; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
+
+; Make sure we don't overwrite workitem information with private memory
+
+; GCN-LABEL: {{^}}work_item_info:
+; GCN-NOT: v0
+; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
+; GCN: buffer_store_dword [[RESULT]]
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32, i32* %3
+ %5 = call i32 @llvm.amdgcn.workitem.id.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/amdgcn.work-item-intrinsics.ll b/test/CodeGen/AMDGPU/amdgcn.work-item-intrinsics.ll
new file mode 100644
index 000000000000..b1b3b9930d1f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgcn.work-item-intrinsics.ll
@@ -0,0 +1,114 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}workdim:
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+define void @workdim(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; The workgroup.id values are stored in sgprs offset by the number of user
+; sgprs.
+
+; FUNC-LABEL: {{^}}workgroup_id_x:
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @workgroup_id_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workgroup.id.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_id_y:
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+define void @workgroup_id_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workgroup.id.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}workgroup_id_z:
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @workgroup_id_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workgroup.id.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 132{{$}}
+
+; FUNC-LABEL: {{^}}workitem_id_x:
+; GCN-NOHSA: buffer_store_dword v0
+define void @workitem_id_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 2180{{$}}
+
+; FUNC-LABEL: {{^}}workitem_id_y:
+
+; GCN-NOHSA: buffer_store_dword v1
+define void @workitem_id_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workitem.id.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 4228{{$}}
+
+; FUNC-LABEL: {{^}}workitem_id_z:
+; GCN-NOHSA: buffer_store_dword v2
+define void @workitem_id_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.amdgcn.workitem.id.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+declare i32 @llvm.amdgcn.read.workdim() #0
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
new file mode 100644
index 000000000000..a12132f425d9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
@@ -0,0 +1,8 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s
+; RUN: opt -S -amdgpu-codegenprepare < %s
+; Make sure this doesn't crash with no triple
+
+; CHECK-LABEL: @foo(
+define void @foo() {
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
new file mode 100644
index 000000000000..dd16907b748c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+
+; GCN-LABEL: {{^}}shader_cc:
+; GCN: v_add_i32_e32 v0, vcc, s8, v0
+define amdgpu_cs float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+ %vi = bitcast float %v to i32
+ %x = add i32 %vi, %w
+ %xf = bitcast i32 %x to float
+ ret float %xf
+}
+
+; GCN-LABEL: {{^}}kernel_cc:
+; GCN: s_endpgm
+define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+ %vi = bitcast float %v to i32
+ %x = add i32 %vi, %w
+ %xf = bitcast i32 %x to float
+ ret float %xf
+}
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
new file mode 100644
index 000000000000..7b5158629091
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -0,0 +1,530 @@
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s
+
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+
+
+; HSAOPT: @mova_same_clause.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+; HSAOPT: @high_alignment.stack = internal unnamed_addr addrspace(3) global [256 x [8 x i32]] undef, align 16
+
+
+; FUNC-LABEL: {{^}}mova_same_clause:
+; OPT-LABEL: @mova_same_clause(
+
+; R600: LDS_WRITE
+; R600: LDS_WRITE
+; R600: LDS_READ
+; R600: LDS_READ
+
+; HSA-PROMOTE: .amd_kernel_code_t
+; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
+; HSA-PROMOTE: .end_amd_kernel_code_t
+
+; FIXME: These should be merged
+; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1
+; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+
+; HSA-ALLOCA: .amd_kernel_code_t
+; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
+; by 4 bytes.
+; HSA-ALLOCA: workitem_private_segment_byte_size = 24
+; HSA-ALLOCA: .end_amd_kernel_code_t
+
+; HSA-ALLOCA: s_mov_b32 flat_scratch_lo, s7
+; HSA-ALLOCA: s_add_u32 s6, s6, s9
+; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8
+
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+
+
+; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(2)* [[DISPATCH_PTR]] to i32 addrspace(2)*
+; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 1
+; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP0]], align 4, !invariant.load !0
+; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 2
+; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0
+; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16
+
+; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1
+; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1
+
+; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]]
+; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]]
+; HSAOPT: [[Y_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[WORKITEM_ID_Y]], [[LDZU]]
+; HSAOPT: [[ADD_YZ_X_X_YZ_SIZE:%[0-9]+]] = add i32 [[YZ_X_XID]], [[Y_X_Z_SIZE]]
+; HSAOPT: [[ADD_ZID:%[0-9]+]] = add i32 [[ADD_YZ_X_X_YZ_SIZE]], [[WORKITEM_ID_Z]]
+
+; HSAOPT: [[LOCAL_GEP:%[0-9]+]] = getelementptr inbounds [256 x [5 x i32]], [256 x [5 x i32]] addrspace(3)* @mova_same_clause.stack, i32 0, i32 [[ADD_ZID]]
+; HSAOPT: %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 {{%[0-9]+}}
+; HSAOPT: %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 {{%[0-9]+}}
+; HSAOPT: %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 0
+; HSAOPT: %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(3)* [[LOCAL_GEP]], i32 0, i32 1
+
+
+; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0
+; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; OPT-LABEL: @high_alignment(
+; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
+define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [8 x i32], align 16
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_replace_inbounds_gep:
+; OPT-LABEL: @no_replace_inbounds_gep(
+; OPT: alloca [5 x i32]
+
+; SI-NOT: ds_write
+define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; This test checks that the stack offset is calculated correctly for structs.
+; All register loads/stores should be optimized away, so there shouldn't be
+; any MOVA instructions.
+;
+; XXX: This generated code has unnecessary MOVs; we should be able to optimize
+; this.
+
+; FUNC-LABEL: {{^}}multiple_structs:
+; OPT-LABEL: @multiple_structs(
+
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
+; SI-NOT: v_movrel
+%struct.point = type { i32, i32 }
+
+define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+entry:
+ %a = alloca %struct.point
+ %b = alloca %struct.point
+ %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
+ %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1
+ %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
+ %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1
+ store i32 0, i32* %a.x.ptr
+ store i32 1, i32* %a.y.ptr
+ store i32 2, i32* %b.x.ptr
+ store i32 3, i32* %b.y.ptr
+ %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
+ %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
+ %a.indirect = load i32, i32* %a.indirect.ptr
+ %b.indirect = load i32, i32* %b.indirect.ptr
+ %0 = add i32 %a.indirect, %b.indirect
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test direct access of a private array inside a loop. The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; FUNC-LABEL: {{^}}direct_loop:
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+entry:
+ %prv_array_const = alloca [2 x i32]
+ %prv_array = alloca [2 x i32]
+ %a = load i32, i32 addrspace(1)* %in
+ %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %b = load i32, i32 addrspace(1)* %b_src_ptr
+ %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32* %a_dst_ptr
+ %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32* %b_dst_ptr
+ br label %for.body
+
+for.body:
+ %inc = phi i32 [0, %entry], [%count, %for.body]
+ %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ %x = load i32, i32* %x_ptr
+ %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %y = load i32, i32* %y_ptr
+ %xy = add i32 %x, %y
+ store i32 %xy, i32* %y_ptr
+ %count = add i32 %inc, 1
+ %done = icmp eq i32 %count, 4095
+ br i1 %done, label %for.end, label %for.body
+
+for.end:
+ %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %value = load i32, i32* %value_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}short_array:
+
+; R600: MOVA_INT
+
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
+; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16, i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}char_array:
+
+; R600: MOVA_INT
+
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
+define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8, i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+
+}
+
+; Test that two stack objects are not stored in the same register.
+; The second stack object should be in T3.X
+; FUNC-LABEL: {{^}}no_overlap:
+; R600_CHECK: MOV
+; R600_CHECK: [[CHAN:[XYZW]]]+
+; R600-NOT: [[CHAN]]+
+; SI: v_mov_b32_e32 v3
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %0 = alloca [3 x i8], align 1
+ %1 = alloca [2 x i8], align 1
+ %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0
+ %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1
+ %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2
+ %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0
+ %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1
+ store i8 0, i8* %2
+ store i8 1, i8* %3
+ store i8 2, i8* %4
+ store i8 1, i8* %5
+ store i8 0, i8* %6
+ %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in
+ %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in
+ %9 = load i8, i8* %7
+ %10 = load i8, i8* %8
+ %11 = add i8 %9, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i8]]
+ %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
+ store i8 0, i8* %gep0
+ store i8 1, i8* %gep1
+ %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i8, i8* %gep2
+ %sext = sext i8 %load to i32
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i64]]
+ %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
+ store i64 0, i64* %gep0
+ store i64 1, i64* %gep1
+ %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i64, i64* %gep2
+ store i64 %load, i64 addrspace(1)* %out
+ ret void
+}
+
+%struct.pair32 = type { i32, i32 }
+
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x %struct.pair32]]
+ %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
+ %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x %struct.pair32]
+ %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
+ %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %cmp = icmp eq i32 %in, 0
+ %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
+ %load = load i32, i32* %sel
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
+; finds one, it should stop trying to promote.
+
+; FUNC-LABEL: ptrtoint:
+; SI-NOT: ds_write
+; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32]
+ %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ store i32 5, i32* %tmp0
+ %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+ %tmp2 = add i32 %tmp1, 5
+ %tmp3 = inttoptr i32 %tmp2 to i32*
+ %tmp4 = getelementptr i32, i32* %tmp3, i32 %b
+ %tmp5 = load i32, i32* %tmp4
+ store i32 %tmp5, i32 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @pointer_typed_alloca(
+; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
+; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
+define void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ %ld0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4
+ %ld1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1
+ store i32 2, i32 addrspace(1)* %arrayidx1, align 4
+ %ld2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %ld2, i32 2
+ store i32 3, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+; HSAOPT: !0 = !{}
+; HSAOPT: !1 = !{i32 0, i32 2048}
+
+; NOHSAOPT: !0 = !{i32 0, i32 2048}
+
+
+; FUNC-LABEL: v16i32_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
+ %alloca = alloca [2 x <16 x i32>]
+ %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
+ %tmp5 = load <16 x i32>, <16 x i32>* %tmp0
+ store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: v16float_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
+ %alloca = alloca [2 x <16 x float>]
+ %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
+ %tmp5 = load <16 x float>, <16 x float>* %tmp0
+ store <16 x float> %tmp5, <16 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: v2float_stack:
+
+; R600: MOVA_INT
+; R600: MOVA_INT
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
+ %alloca = alloca [16 x <2 x float>]
+ %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
+ %tmp5 = load <2 x float>, <2 x float>* %tmp0
+ store <2 x float> %tmp5, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }
diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index e7fcd1ff3650..853788b92aae 100644
--- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -1,34 +1,32 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; Legacy intrinsics that just read implicit parameters
-; FUNC-LABEL: {{^}}ngroups_x:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
-
-; HSA: .amd_kernel_code_t
-
-; HSA: enable_sgpr_private_segment_buffer = 1
-; HSA: enable_sgpr_dispatch_ptr = 0
-; HSA: enable_sgpr_queue_ptr = 0
-; HSA: enable_sgpr_kernarg_segment_ptr = 1
-; HSA: enable_sgpr_dispatch_id = 0
-; HSA: enable_sgpr_flat_scratch_init = 0
-; HSA: enable_sgpr_private_segment_size = 0
-; HSA: enable_sgpr_grid_workgroup_count_x = 0
-; HSA: enable_sgpr_grid_workgroup_count_y = 0
-; HSA: enable_sgpr_grid_workgroup_count_z = 0
-
-; HSA: .end_amd_kernel_code_t
+; FUNC-LABEL: {{^}}workdim_legacy:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[2].Z
+define void @workdim_legacy (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
-; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; FUNC-LABEL: {{^}}ngroups_x:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
define void @ngroups_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -37,13 +35,13 @@ entry:
}
; FUNC-LABEL: {{^}}ngroups_y:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
define void @ngroups_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.y() #0
@@ -52,13 +50,13 @@ entry:
}
; FUNC-LABEL: {{^}}ngroups_z:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
define void @ngroups_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.z() #0
@@ -67,13 +65,13 @@ entry:
}
; FUNC-LABEL: {{^}}global_size_x:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
define void @global_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.x() #0
@@ -82,13 +80,13 @@ entry:
}
; FUNC-LABEL: {{^}}global_size_y:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
define void @global_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.y() #0
@@ -97,13 +95,13 @@ entry:
}
; FUNC-LABEL: {{^}}global_size_z:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
-
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
define void @global_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.z() #0
@@ -111,97 +109,94 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}local_size_x:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
+define void @local_size_x (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
+define void @local_size_y (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
+define void @local_size_z (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Legacy use of r600 intrinsics by GCN
+
; The tgid values are stored in sgprs offset by the number of user
; sgprs.
-; FUNC-LABEL: {{^}}tgid_x:
-; HSA: .amd_kernel_code_t
-; HSA: compute_pgm_rsrc2_user_sgpr = 6
-; HSA: compute_pgm_rsrc2_tgid_x_en = 1
-; HSA: compute_pgm_rsrc2_tgid_y_en = 0
-; HSA: compute_pgm_rsrc2_tgid_z_en = 0
-; HSA: compute_pgm_rsrc2_tg_size_en = 0
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
-; HSA: enable_sgpr_grid_workgroup_count_x = 0
-; HSA: enable_sgpr_grid_workgroup_count_y = 0
-; HSA: enable_sgpr_grid_workgroup_count_z = 0
-; HSA: .end_amd_kernel_code_t
-
+; FUNC-LABEL: {{^}}tgid_x_legacy:
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
-; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}}
; GCN-NOHSA: buffer_store_dword [[VVAL]]
-; HSA: flat_store_dword [[VVAL]]
-; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_x(i32 addrspace(1)* %out) {
+define void @tgid_x_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.x() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}tgid_y:
-; HSA: compute_pgm_rsrc2_user_sgpr = 6
-; HSA: compute_pgm_rsrc2_tgid_x_en = 1
-; HSA: compute_pgm_rsrc2_tgid_y_en = 1
-; HSA: compute_pgm_rsrc2_tgid_z_en = 0
-; HSA: compute_pgm_rsrc2_tg_size_en = 0
-; HSA: enable_sgpr_grid_workgroup_count_x = 0
-; HSA: enable_sgpr_grid_workgroup_count_y = 0
-; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; FUNC-LABEL: {{^}}tgid_y_legacy:
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
-; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7
; GCN-NOHSA: buffer_store_dword [[VVAL]]
-; HSA: flat_store_dword [[VVAL]]
-; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
-; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
-; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
-; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
-; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_y(i32 addrspace(1)* %out) {
+define void @tgid_y_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.y() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}tgid_z:
-; HSA: compute_pgm_rsrc2_user_sgpr = 6
-; HSA: compute_pgm_rsrc2_tgid_x_en = 1
-; HSA: compute_pgm_rsrc2_tgid_y_en = 0
-; HSA: compute_pgm_rsrc2_tgid_z_en = 1
-; HSA: compute_pgm_rsrc2_tg_size_en = 0
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
-; HSA: enable_sgpr_private_segment_buffer = 1
-; HSA: enable_sgpr_dispatch_ptr = 0
-; HSA: enable_sgpr_queue_ptr = 0
-; HSA: enable_sgpr_kernarg_segment_ptr = 1
-; HSA: enable_sgpr_dispatch_id = 0
-; HSA: enable_sgpr_flat_scratch_init = 0
-; HSA: enable_sgpr_private_segment_size = 0
-; HSA: enable_sgpr_grid_workgroup_count_x = 0
-; HSA: enable_sgpr_grid_workgroup_count_y = 0
-; HSA: enable_sgpr_grid_workgroup_count_z = 0
-
+; FUNC-LABEL: {{^}}tgid_z_legacy:
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
-; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}}
; GCN-NOHSA: buffer_store_dword [[VVAL]]
-; HSA: flat_store_dword [[VVAL]]
-; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_z(i32 addrspace(1)* %out) {
+define void @tgid_z_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -212,11 +207,9 @@ entry:
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 132{{$}}
-; FUNC-LABEL: {{^}}tidig_x:
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; FUNC-LABEL: {{^}}tidig_x_legacy:
; GCN-NOHSA: buffer_store_dword v0
-; HSA: flat_store_dword v0
-define void @tidig_x(i32 addrspace(1)* %out) {
+define void @tidig_x_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.x() #0
store i32 %0, i32 addrspace(1)* %out
@@ -227,12 +220,10 @@ entry:
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 2180{{$}}
-; FUNC-LABEL: {{^}}tidig_y:
+; FUNC-LABEL: {{^}}tidig_y_legacy:
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
; GCN-NOHSA: buffer_store_dword v1
-; HSA: flat_store_dword v1
-define void @tidig_y(i32 addrspace(1)* %out) {
+define void @tidig_y_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.y() #0
store i32 %0, i32 addrspace(1)* %out
@@ -243,11 +234,9 @@ entry:
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 4228{{$}}
-; FUNC-LABEL: {{^}}tidig_z:
-; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
+; FUNC-LABEL: {{^}}tidig_z_legacy:
; GCN-NOHSA: buffer_store_dword v2
-; HSA: flat_store_dword v2
-define void @tidig_z(i32 addrspace(1)* %out) {
+define void @tidig_z_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.z() #0
store i32 %0, i32 addrspace(1)* %out
@@ -262,6 +251,10 @@ declare i32 @llvm.r600.read.global.size.x() #0
declare i32 @llvm.r600.read.global.size.y() #0
declare i32 @llvm.r600.read.global.size.z() #0
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
declare i32 @llvm.r600.read.tgid.x() #0
declare i32 @llvm.r600.read.tgid.y() #0
declare i32 @llvm.r600.read.tgid.z() #0
diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll
new file mode 100644
index 000000000000..dde5f8c21769
--- /dev/null
+++ b/test/CodeGen/AMDGPU/and-gcn.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}v_and_i64_br:
+; SI: v_and_b32
+; SI: v_and_b32
+define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+entry:
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
+ br i1 %tmp0, label %if, label %endif
+
+if:
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %and = and i64 %a, %b
+ br label %endif
+
+endif:
+ %tmp1 = phi i64 [%and, %if], [0, %entry]
+ store i64 %tmp1, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
index f83fb16101fb..0046bc93826e 100644
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
@@ -177,50 +177,78 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
ret void
}
-; FUNC-LABEL: {{^}}s_and_constant_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: {{^}}s_and_constant_i64:
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
+; SI: buffer_store_dwordx2
define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
- %and = and i64 %a, 281474976710655
+ %and = and i64 %a, 549756338176
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}v_and_i64:
-; SI: v_and_b32
-; SI: v_and_b32
-define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
- %b = load i64, i64 addrspace(1)* %bptr, align 8
- %and = and i64 %a, %b
+; FUNC-LABEL: {{^}}s_and_multi_use_constant_i64:
+; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
+; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
+define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %and0 = and i64 %a, 549756338176
+ %and1 = and i64 %b, 549756338176
+ store volatile i64 %and0, i64 addrspace(1)* %out
+ store volatile i64 %and1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_32_bit_constant_i64:
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
+; SI-NOT: and
+; SI: buffer_store_dwordx2
+define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+ %and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}v_and_i64_br:
+; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64:
+; SI: s_load_dword [[A:s[0-9]+]]
+; SI: s_load_dword [[B:s[0-9]+]]
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_lshl_b32 [[A]], [[A]], 1
+; SI: s_lshl_b32 [[B]], [[B]], 1
+; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
+; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
+; SI-NOT: and
+; SI: buffer_store_dwordx2
+define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
+ %shl.a = shl i64 %a, 1
+ %shl.b = shl i64 %b, 1
+ %and0 = and i64 %shl.a, 62
+ %and1 = and i64 %shl.b, 62
+ %add0 = add i64 %and0, %c
+ %add1 = add i64 %and1, %c
+ store volatile i64 %add0, i64 addrspace(1)* %out
+ store volatile i64 %add1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_i64:
; SI: v_and_b32
; SI: v_and_b32
-define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) {
-entry:
- %tmp0 = icmp eq i32 %cond, 0
- br i1 %tmp0, label %if, label %endif
-
-if:
+define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
%and = and i64 %a, %b
- br label %endif
-
-endif:
- %tmp1 = phi i64 [%and, %if], [0, %entry]
- store i64 %tmp1, i64 addrspace(1)* %out, align 8
+ store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v_and_constant_i64:
-; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207
-; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
-; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], {{v[0-9]+}}
-; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], {{v[0-9]+}}
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
; SI: buffer_store_dwordx2
define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
@@ -229,10 +257,54 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
ret void
}
-; FIXME: Should replace and 0
+; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
+; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
+; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
+; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]]
+; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load volatile i64, i64 addrspace(1)* %aptr
+ %b = load volatile i64, i64 addrspace(1)* %aptr
+ %and0 = and i64 %a, 1231231234567
+ %and1 = and i64 %b, 1231231234567
+ store volatile i64 %and0, i64 addrspace(1)* %out
+ store volatile i64 %and1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_multi_use_inline_imm_i64:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
+; SI-NOT: and
+; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
+; SI-NOT: and
+; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
+; SI-NOT: and
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
+; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
+; SI-NOT: and
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
+define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load volatile i64, i64 addrspace(1)* %aptr
+ %b = load volatile i64, i64 addrspace(1)* %aptr
+ %and0 = and i64 %a, 63
+ %and1 = and i64 %b, 63
+ store volatile i64 %and0, i64 addrspace(1)* %out
+ store volatile i64 %and1, i64 addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
-; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: and
+; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 1234567
@@ -240,10 +312,12 @@ define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)*
ret void
}
-; FIXME: Replace and 0 with mov 0
; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
+; SI: buffer_load_dword v{{[0-9]+}}
+; SI-NOT: and
; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
-; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 64
@@ -252,15 +326,39 @@ define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %apt
}
; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
+; SI: s_load_dword
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
+; SI-NOT: and
+; SI: buffer_store_dword
define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 64
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
+; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
+; SI: s_load_dword [[A:s[0-9]+]]
+; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
+; SI-NOT: and
+; SI: s_add_u32
+; SI-NEXT: s_addc_u32
+define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
+ %shl = shl i64 %a, 1
+ %and = and i64 %shl, 64
+ %add = add i64 %and, %b
+ store i64 %add, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 1
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -268,7 +366,14 @@ define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
}
; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4607182418800017408
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -276,7 +381,14 @@ define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
}
; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13830554455654793216
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -284,47 +396,85 @@ define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(
}
; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4602678819172646912
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13826050856027422720
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0
+; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4611686018427387904
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13835058055282163712
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
+; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4616189618054758400
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64:
+; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
+
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13839561654909534208
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -335,22 +485,26 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; Test with the 64-bit integer bitpattern for a 32-bit float in the
; low 32-bits, which is not a valid 64-bit inline immediate.
-; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 1082130432
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FIXME: Copy of -1 register
-; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
-; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, -1065353216
store i64 %and, i64 addrspace(1)* %out, align 8
@@ -358,20 +512,25 @@ define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp
}
; Shift into upper 32-bits
-; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4647714815446351872
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI-NOT: and
+; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
+; SI-NOT: and
+; SI: buffer_store_dwordx2
define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13871086852301127680
store i64 %and, i64 addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
new file mode 100644
index 000000000000..084a6933da26
--- /dev/null
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -0,0 +1,238 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA %s
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+
+; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.x()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.y()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.z()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val2 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ store volatile i32 %val2, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workitem.id.x()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workitem.id.y()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+ %val = call i32 @llvm.amdgcn.workitem.id.z()
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.x()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.y()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+ %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ store volatile i32 %val2, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+ %val0 = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = call i32 @llvm.amdgcn.workitem.id.y()
+ %val2 = call i32 @llvm.amdgcn.workitem.id.z()
+ %val3 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val4 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val5 = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %val0, i32 addrspace(1)* %ptr
+ store volatile i32 %val1, i32 addrspace(1)* %ptr
+ store volatile i32 %val2, i32 addrspace(1)* %ptr
+ store volatile i32 %val3, i32 addrspace(1)* %ptr
+ store volatile i32 %val4, i32 addrspace(1)* %ptr
+ store volatile i32 %val5, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
+define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+ %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+ %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+ %val = load i32, i32 addrspace(2)* %bc
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
+define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
+ %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+ %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+ %val = load i32, i32 addrspace(2)* %bc
+ store i32 %val, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
+define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
+ %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+ store volatile i32 0, i32 addrspace(3)* %ftos
+ ret void
+}
+
+; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
+ store volatile i32 0, i32* %ftos
+ ret void
+}
+
+; No-op addrspacecast should not use queue ptr
+; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
+ %ld = load volatile i32, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %ftos
+ ret void
+}
+
+; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+ %ld = load volatile i32, i32 addrspace(2)* %ftos
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+; HSA: attributes #0 = { nounwind readnone }
+; HSA: attributes #1 = { nounwind }
+; HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" }
+; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" }
+; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" }
+; HSA: attributes #5 = { nounwind "amdgpu-work-item-id-y" }
+; HSA: attributes #6 = { nounwind "amdgpu-work-item-id-z" }
+; HSA: attributes #7 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" }
+; HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
+; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index b116c72322bb..a4e7bb67d507 100644
--- a/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -1,5 +1,4 @@
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=HSA -check-prefix=ALL %s
-; RUN: opt -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefix=NOHSA -check-prefix=ALL %s
declare i32 @llvm.r600.read.tgid.x() #0
declare i32 @llvm.r600.read.tgid.y() #0
@@ -13,11 +12,6 @@ declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
-declare i32 @llvm.r600.read.global.size.x() #0
-declare i32 @llvm.r600.read.global.size.y() #0
-declare i32 @llvm.r600.read.global.size.z() #0
-
-
; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tgid.x()
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index f8a74222d566..b00fff0a6f9a 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -1,8 +1,9 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-declare i32 @llvm.SI.tid() nounwind readnone
-declare void @llvm.AMDGPU.barrier.local() nounwind convergent
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+declare void @llvm.amdgcn.s.barrier() #2
; The required pointer calculations for the alloca'd actually requires
; an add and won't be folded into the addressing, which fails with a
@@ -14,31 +15,38 @@ declare void @llvm.AMDGPU.barrier.local() nounwind convergent
; FIXME: We end up with zero argument for ADD, because
; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
; with the appropriate offset. We should fold this into the store.
+
; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
-; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}]
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
+; SI-ALLOCA: s_barrier
+; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
;
; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
; alloca to a vector. It currently fails because it does not know how
; to interpret:
-; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
+; getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
-; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16
+; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
; SI-PROMOTE: ds_write_b32 [[PTRREG]]
-define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
- %alloca = alloca [4 x i32], i32 4, align 16
- %tid = call i32 @llvm.SI.tid() readnone
- %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid
- %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
+define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
+ %alloca = alloca [16 x i32], align 16
+ %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
+ %a_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inB, i32 %tid
%a = load i32, i32 addrspace(1)* %a_ptr
%b = load i32, i32 addrspace(1)* %b_ptr
%result = add i32 %a, %b
- %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
+ %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
store i32 %result, i32* %alloca_ptr, align 4
; Dummy call
- call void @llvm.AMDGPU.barrier.local() nounwind convergent
+ call void @llvm.amdgcn.s.barrier()
%reload = load i32, i32* %alloca_ptr, align 4
- %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
ret void
}
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind convergent }
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index a3ae3c3aea16..b914edf2928e 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -1,13 +1,15 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.SI.tid() readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI-LABEL: {{^}}test_array_ptr_calc:
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_i32
; SI: s_endpgm
define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.SI.tid() readnone
+ %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
%b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
%a = load i32, i32 addrspace(1)* %a_ptr
@@ -16,3 +18,5 @@ define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] ad
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index ef2560ef1849..6a2716cc903e 100644
--- a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -3,11 +3,11 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
@@ -21,12 +21,12 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
}
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
+; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
@@ -75,8 +75,8 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll
index 4c6f45525b9e..184d07ffad9c 100644
--- a/test/CodeGen/AMDGPU/atomic_load_sub.ll
+++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}atomic_sub_local:
; R600: LDS_SUB *
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
index abdc4afef472..ff730a085255 100644
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,9 +1,23 @@
-; XFAIL: *
-; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
-; CHECK-LABEL: {{^}}test_branch(
-define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+; GCN-LABEL: {{^}}test_branch:
+; GCNNOOPT: v_writelane_b32
+; GCNNOOPT: v_writelane_b32
+; GCNNOOPT: v_writelane_b32
+; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
+
+; GCN: ; BB#1
+; GCNNOOPT: v_readlane_b32
+; GCNNOOPT: v_readlane_b32
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+
+; GCN: {{^}}[[END]]
+; GCN: s_endpgm
+define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
%cmp = icmp ne i32 %val, 0
br i1 %cmp, label %store, label %end
@@ -14,3 +28,28 @@ store:
end:
ret void
}
+
+; GCN-LABEL: {{^}}test_brcc_i1:
+; GCN: buffer_load_ubyte
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1,
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+
+; GCN: {{^}}[[END]]
+; GCN: s_endpgm
+define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
+ %cmp0 = icmp ne i1 %val, 0
+ br i1 %cmp0, label %store, label %end
+
+store:
+ store i32 222, i32 addrspace(1)* %out
+ ret void
+
+end:
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/bfm.ll b/test/CodeGen/AMDGPU/bfm.ll
new file mode 100644
index 000000000000..73db87d7ae9e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfm.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}bfm_pattern:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+ %a = shl i32 1, %x
+ %b = sub i32 %a, 1
+ %c = shl i32 %b, %y
+ store i32 %c, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfm_pattern_simple:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
+define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+ %a = shl i32 1, %x
+ %b = sub i32 %a, 1
+ store i32 %b, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/big_alu.ll b/test/CodeGen/AMDGPU/big_alu.ll
index 2671c5d102b3..0ab22b350f50 100644
--- a/test/CodeGen/AMDGPU/big_alu.ll
+++ b/test/CodeGen/AMDGPU/big_alu.ll
@@ -1,1173 +1,1312 @@
-;RUN: llc < %s -march=r600 -mcpu=cedar
+; RUN: llc -march=r600 -mcpu=cedar < %s
-;This test ensures that R600 backend can handle ifcvt properly
-;and do not generate ALU clauses with more than 128 instructions.
+; This test ensures that the R600 backend can handle ifcvt properly.
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 {
+define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) {
main_body:
- %0 = extractelement <4 x float> %reg0, i32 0
- %1 = extractelement <4 x float> %reg0, i32 1
- %2 = extractelement <4 x float> %reg0, i32 2
- %3 = extractelement <4 x float> %reg0, i32 3
- %4 = extractelement <4 x float> %reg1, i32 0
- %5 = extractelement <4 x float> %reg9, i32 0
- %6 = extractelement <4 x float> %reg8, i32 0
- %7 = fcmp ugt float %6, 0.000000e+00
- %8 = select i1 %7, float %4, float %5
- %9 = extractelement <4 x float> %reg1, i32 1
- %10 = extractelement <4 x float> %reg9, i32 1
- %11 = extractelement <4 x float> %reg8, i32 0
- %12 = fcmp ugt float %11, 0.000000e+00
- %13 = select i1 %12, float %9, float %10
- %14 = extractelement <4 x float> %reg1, i32 2
- %15 = extractelement <4 x float> %reg9, i32 2
- %16 = extractelement <4 x float> %reg8, i32 0
- %17 = fcmp ugt float %16, 0.000000e+00
- %18 = select i1 %17, float %14, float %15
- %19 = extractelement <4 x float> %reg1, i32 3
- %20 = extractelement <4 x float> %reg9, i32 3
- %21 = extractelement <4 x float> %reg8, i32 0
- %22 = extractelement <4 x float> %reg2, i32 0
- %23 = extractelement <4 x float> %reg2, i32 1
- %24 = extractelement <4 x float> %reg2, i32 2
- %25 = extractelement <4 x float> %reg2, i32 3
- %26 = extractelement <4 x float> %reg3, i32 0
- %27 = extractelement <4 x float> %reg3, i32 1
- %28 = extractelement <4 x float> %reg3, i32 2
- %29 = extractelement <4 x float> %reg3, i32 3
- %30 = extractelement <4 x float> %reg4, i32 0
- %31 = extractelement <4 x float> %reg4, i32 1
- %32 = extractelement <4 x float> %reg4, i32 2
- %33 = extractelement <4 x float> %reg4, i32 3
- %34 = extractelement <4 x float> %reg5, i32 0
- %35 = extractelement <4 x float> %reg5, i32 1
- %36 = extractelement <4 x float> %reg5, i32 2
- %37 = extractelement <4 x float> %reg5, i32 3
- %38 = extractelement <4 x float> %reg6, i32 0
- %39 = extractelement <4 x float> %reg6, i32 1
- %40 = extractelement <4 x float> %reg6, i32 2
- %41 = extractelement <4 x float> %reg6, i32 3
- %42 = extractelement <4 x float> %reg7, i32 0
- %43 = extractelement <4 x float> %reg7, i32 1
- %44 = extractelement <4 x float> %reg7, i32 2
- %45 = extractelement <4 x float> %reg7, i32 3
- %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
- %47 = extractelement <4 x float> %46, i32 0
- %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
- %49 = extractelement <4 x float> %48, i32 1
- %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
- %51 = extractelement <4 x float> %50, i32 2
- %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
- %53 = extractelement <4 x float> %52, i32 0
- %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
- %55 = extractelement <4 x float> %54, i32 0
- %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
- %57 = extractelement <4 x float> %56, i32 1
- %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
- %59 = extractelement <4 x float> %58, i32 2
- %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
- %61 = extractelement <4 x float> %60, i32 3
- %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
- %63 = extractelement <4 x float> %62, i32 0
- %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
- %65 = extractelement <4 x float> %64, i32 1
- %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
- %67 = extractelement <4 x float> %66, i32 2
- %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %69 = extractelement <4 x float> %68, i32 0
- %70 = fcmp oge float %69, 3.500000e+00
- %71 = sext i1 %70 to i32
- %72 = bitcast i32 %71 to float
- %73 = bitcast float %72 to i32
- %74 = icmp ne i32 %73, 0
- %. = select i1 %74, float 0.000000e+00, float 0.000000e+00
- %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %76 = extractelement <4 x float> %75, i32 0
- %77 = fcmp oge float %76, 2.000000e+00
- %78 = sext i1 %77 to i32
- %79 = bitcast i32 %78 to float
- %80 = bitcast float %79 to i32
- %81 = icmp ne i32 %80, 0
- br i1 %81, label %IF137, label %ENDIF136
+ %tmp = extractelement <4 x float> %reg0, i32 0
+ %tmp1 = extractelement <4 x float> %reg0, i32 1
+ %tmp2 = extractelement <4 x float> %reg0, i32 2
+ %tmp3 = extractelement <4 x float> %reg0, i32 3
+ %tmp4 = extractelement <4 x float> %reg1, i32 0
+ %tmp5 = extractelement <4 x float> %reg9, i32 0
+ %tmp6 = extractelement <4 x float> %reg8, i32 0
+ %tmp7 = fcmp ugt float %tmp6, 0.000000e+00
+ %tmp8 = select i1 %tmp7, float %tmp4, float %tmp5
+ %tmp9 = extractelement <4 x float> %reg1, i32 1
+ %tmp10 = extractelement <4 x float> %reg9, i32 1
+ %tmp11 = extractelement <4 x float> %reg8, i32 0
+ %tmp12 = fcmp ugt float %tmp11, 0.000000e+00
+ %tmp13 = select i1 %tmp12, float %tmp9, float %tmp10
+ %tmp14 = extractelement <4 x float> %reg1, i32 2
+ %tmp15 = extractelement <4 x float> %reg9, i32 2
+ %tmp16 = extractelement <4 x float> %reg8, i32 0
+ %tmp17 = fcmp ugt float %tmp16, 0.000000e+00
+ %tmp18 = select i1 %tmp17, float %tmp14, float %tmp15
+ %tmp19 = extractelement <4 x float> %reg1, i32 3
+ %tmp20 = extractelement <4 x float> %reg9, i32 3
+ %tmp21 = extractelement <4 x float> %reg8, i32 0
+ %tmp22 = extractelement <4 x float> %reg2, i32 0
+ %tmp23 = extractelement <4 x float> %reg2, i32 1
+ %tmp24 = extractelement <4 x float> %reg2, i32 2
+ %tmp25 = extractelement <4 x float> %reg2, i32 3
+ %tmp26 = extractelement <4 x float> %reg3, i32 0
+ %tmp27 = extractelement <4 x float> %reg3, i32 1
+ %tmp28 = extractelement <4 x float> %reg3, i32 2
+ %tmp29 = extractelement <4 x float> %reg3, i32 3
+ %tmp30 = extractelement <4 x float> %reg4, i32 0
+ %tmp31 = extractelement <4 x float> %reg4, i32 1
+ %tmp32 = extractelement <4 x float> %reg4, i32 2
+ %tmp33 = extractelement <4 x float> %reg4, i32 3
+ %tmp34 = extractelement <4 x float> %reg5, i32 0
+ %tmp35 = extractelement <4 x float> %reg5, i32 1
+ %tmp36 = extractelement <4 x float> %reg5, i32 2
+ %tmp37 = extractelement <4 x float> %reg5, i32 3
+ %tmp38 = extractelement <4 x float> %reg6, i32 0
+ %tmp39 = extractelement <4 x float> %reg6, i32 1
+ %tmp40 = extractelement <4 x float> %reg6, i32 2
+ %tmp41 = extractelement <4 x float> %reg6, i32 3
+ %tmp42 = extractelement <4 x float> %reg7, i32 0
+ %tmp43 = extractelement <4 x float> %reg7, i32 1
+ %tmp44 = extractelement <4 x float> %reg7, i32 2
+ %tmp45 = extractelement <4 x float> %reg7, i32 3
+ %tmp46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %tmp47 = extractelement <4 x float> %tmp46, i32 0
+ %tmp48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %tmp49 = extractelement <4 x float> %tmp48, i32 1
+ %tmp50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %tmp51 = extractelement <4 x float> %tmp50, i32 2
+ %tmp52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+ %tmp53 = extractelement <4 x float> %tmp52, i32 0
+ %tmp54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %tmp55 = extractelement <4 x float> %tmp54, i32 0
+ %tmp56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %tmp57 = extractelement <4 x float> %tmp56, i32 1
+ %tmp58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %tmp59 = extractelement <4 x float> %tmp58, i32 2
+ %tmp60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %tmp61 = extractelement <4 x float> %tmp60, i32 3
+ %tmp62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %tmp63 = extractelement <4 x float> %tmp62, i32 0
+ %tmp64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %tmp65 = extractelement <4 x float> %tmp64, i32 1
+ %tmp66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %tmp67 = extractelement <4 x float> %tmp66, i32 2
+ %tmp68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp69 = extractelement <4 x float> %tmp68, i32 0
+ %tmp70 = fcmp oge float %tmp69, 3.500000e+00
+ %tmp71 = sext i1 %tmp70 to i32
+ %tmp72 = bitcast i32 %tmp71 to float
+ %tmp73 = bitcast float %tmp72 to i32
+ %tmp74 = icmp ne i32 %tmp73, 0
+ %. = select i1 %tmp74, float 0.000000e+00, float 0.000000e+00
+ %tmp75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp76 = extractelement <4 x float> %tmp75, i32 0
+ %tmp77 = fcmp oge float %tmp76, 2.000000e+00
+ %tmp78 = sext i1 %tmp77 to i32
+ %tmp79 = bitcast i32 %tmp78 to float
+ %tmp80 = bitcast float %tmp79 to i32
+ %tmp81 = icmp ne i32 %tmp80, 0
+ br i1 %tmp81, label %IF137, label %ENDIF136
IF137: ; preds = %main_body
- %82 = insertelement <4 x float> undef, float %30, i32 0
- %83 = insertelement <4 x float> %82, float %31, i32 1
- %84 = insertelement <4 x float> %83, float %32, i32 2
- %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
- %86 = insertelement <4 x float> undef, float %30, i32 0
- %87 = insertelement <4 x float> %86, float %31, i32 1
- %88 = insertelement <4 x float> %87, float %32, i32 2
- %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
- %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89)
- %91 = call float @llvm.AMDGPU.rsq.f32(float %90)
- %92 = fmul float %30, %91
- %93 = fmul float %31, %91
- %94 = fmul float %32, %91
- %95 = insertelement <4 x float> undef, float %92, i32 0
- %96 = insertelement <4 x float> %95, float %93, i32 1
- %97 = insertelement <4 x float> %96, float %94, i32 2
- %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3
- %99 = insertelement <4 x float> undef, float %37, i32 0
- %100 = insertelement <4 x float> %99, float %38, i32 1
- %101 = insertelement <4 x float> %100, float %39, i32 2
- %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3
- %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102)
- %104 = insertelement <4 x float> undef, float %92, i32 0
- %105 = insertelement <4 x float> %104, float %93, i32 1
- %106 = insertelement <4 x float> %105, float %94, i32 2
- %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3
- %108 = insertelement <4 x float> undef, float %40, i32 0
- %109 = insertelement <4 x float> %108, float %41, i32 1
- %110 = insertelement <4 x float> %109, float %42, i32 2
- %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3
- %112 = call float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111)
- %113 = fsub float -0.000000e+00, %92
- %114 = fsub float -0.000000e+00, %93
- %115 = fsub float -0.000000e+00, %94
- %116 = insertelement <4 x float> undef, float %34, i32 0
- %117 = insertelement <4 x float> %116, float %35, i32 1
- %118 = insertelement <4 x float> %117, float %36, i32 2
- %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3
- %120 = insertelement <4 x float> undef, float %113, i32 0
- %121 = insertelement <4 x float> %120, float %114, i32 1
- %122 = insertelement <4 x float> %121, float %115, i32 2
- %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3
- %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123)
- %125 = fdiv float 1.000000e+00, %124
- %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
- %127 = extractelement <4 x float> %126, i32 0
- %128 = fmul float %127, %125
- %129 = fmul float %103, %128
- %130 = fmul float %112, %128
- %131 = bitcast float %. to i32
- %132 = sitofp i32 %131 to float
- %133 = fdiv float 1.000000e+00, %132
- %134 = bitcast float %. to i32
- %135 = add i32 %134, -1
- %136 = bitcast i32 %135 to float
- %137 = bitcast float %136 to i32
+ %tmp82 = insertelement <4 x float> undef, float %tmp30, i32 0
+ %tmp83 = insertelement <4 x float> %tmp82, float %tmp31, i32 1
+ %tmp84 = insertelement <4 x float> %tmp83, float %tmp32, i32 2
+ %tmp85 = insertelement <4 x float> %tmp84, float 0.000000e+00, i32 3
+ %tmp86 = insertelement <4 x float> undef, float %tmp30, i32 0
+ %tmp87 = insertelement <4 x float> %tmp86, float %tmp31, i32 1
+ %tmp88 = insertelement <4 x float> %tmp87, float %tmp32, i32 2
+ %tmp89 = insertelement <4 x float> %tmp88, float 0.000000e+00, i32 3
+ %tmp90 = call float @llvm.r600.dot4(<4 x float> %tmp85, <4 x float> %tmp89)
+ %tmp91 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp90)
+ %tmp92 = fmul float %tmp30, %tmp91
+ %tmp93 = fmul float %tmp31, %tmp91
+ %tmp94 = fmul float %tmp32, %tmp91
+ %tmp95 = insertelement <4 x float> undef, float %tmp92, i32 0
+ %tmp96 = insertelement <4 x float> %tmp95, float %tmp93, i32 1
+ %tmp97 = insertelement <4 x float> %tmp96, float %tmp94, i32 2
+ %tmp98 = insertelement <4 x float> %tmp97, float 0.000000e+00, i32 3
+ %tmp99 = insertelement <4 x float> undef, float %tmp37, i32 0
+ %tmp100 = insertelement <4 x float> %tmp99, float %tmp38, i32 1
+ %tmp101 = insertelement <4 x float> %tmp100, float %tmp39, i32 2
+ %tmp102 = insertelement <4 x float> %tmp101, float 0.000000e+00, i32 3
+ %tmp103 = call float @llvm.r600.dot4(<4 x float> %tmp98, <4 x float> %tmp102)
+ %tmp104 = insertelement <4 x float> undef, float %tmp92, i32 0
+ %tmp105 = insertelement <4 x float> %tmp104, float %tmp93, i32 1
+ %tmp106 = insertelement <4 x float> %tmp105, float %tmp94, i32 2
+ %tmp107 = insertelement <4 x float> %tmp106, float 0.000000e+00, i32 3
+ %tmp108 = insertelement <4 x float> undef, float %tmp40, i32 0
+ %tmp109 = insertelement <4 x float> %tmp108, float %tmp41, i32 1
+ %tmp110 = insertelement <4 x float> %tmp109, float %tmp42, i32 2
+ %tmp111 = insertelement <4 x float> %tmp110, float 0.000000e+00, i32 3
+ %tmp112 = call float @llvm.r600.dot4(<4 x float> %tmp107, <4 x float> %tmp111)
+ %tmp113 = fsub float -0.000000e+00, %tmp92
+ %tmp114 = fsub float -0.000000e+00, %tmp93
+ %tmp115 = fsub float -0.000000e+00, %tmp94
+ %tmp116 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp117 = insertelement <4 x float> %tmp116, float %tmp35, i32 1
+ %tmp118 = insertelement <4 x float> %tmp117, float %tmp36, i32 2
+ %tmp119 = insertelement <4 x float> %tmp118, float 0.000000e+00, i32 3
+ %tmp120 = insertelement <4 x float> undef, float %tmp113, i32 0
+ %tmp121 = insertelement <4 x float> %tmp120, float %tmp114, i32 1
+ %tmp122 = insertelement <4 x float> %tmp121, float %tmp115, i32 2
+ %tmp123 = insertelement <4 x float> %tmp122, float 0.000000e+00, i32 3
+ %tmp124 = call float @llvm.r600.dot4(<4 x float> %tmp119, <4 x float> %tmp123)
+ %tmp125 = fdiv float 1.000000e+00, %tmp124
+ %tmp126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %tmp127 = extractelement <4 x float> %tmp126, i32 0
+ %tmp128 = fmul float %tmp127, %tmp125
+ %tmp129 = fmul float %tmp103, %tmp128
+ %tmp130 = fmul float %tmp112, %tmp128
+ %tmp131 = bitcast float %. to i32
+ %tmp132 = sitofp i32 %tmp131 to float
+ %tmp133 = fdiv float 1.000000e+00, %tmp132
+ %tmp134 = bitcast float %. to i32
+ %tmp135 = add i32 %tmp134, -1
+ %tmp136 = bitcast i32 %tmp135 to float
+ %tmp137 = bitcast float %tmp136 to i32
br label %LOOP
-ENDIF136: ; preds = %main_body, %ENDIF154
- %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ]
- %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ]
- %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ]
- %138 = fmul float %26, 0x3F847AE140000000
- %139 = fmul float %27, 0x3F847AE140000000
- %140 = fmul float %28, 0x3F847AE140000000
- %141 = insertelement <4 x float> undef, float %138, i32 0
- %142 = insertelement <4 x float> %141, float %139, i32 1
- %143 = insertelement <4 x float> %142, float %140, i32 2
- %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3
- %145 = extractelement <4 x float> %144, i32 0
- %146 = extractelement <4 x float> %144, i32 1
- %147 = extractelement <4 x float> %144, i32 2
- %148 = extractelement <4 x float> %144, i32 3
- %149 = insertelement <4 x float> undef, float %145, i32 0
- %150 = insertelement <4 x float> %149, float %146, i32 1
- %151 = insertelement <4 x float> %150, float %147, i32 2
- %152 = insertelement <4 x float> %151, float %148, i32 3
- %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3)
- %154 = extractelement <4 x float> %153, i32 0
- %155 = extractelement <4 x float> %153, i32 1
- %156 = extractelement <4 x float> %153, i32 2
- %157 = extractelement <4 x float> %153, i32 3
- %158 = fmul float %26, 0x3F45A07B40000000
- %159 = fmul float %27, 0x3F45A07B40000000
- %160 = fmul float %28, 0x3F45A07B40000000
- %161 = insertelement <4 x float> undef, float %158, i32 0
- %162 = insertelement <4 x float> %161, float %159, i32 1
- %163 = insertelement <4 x float> %162, float %160, i32 2
- %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3
- %165 = extractelement <4 x float> %164, i32 0
- %166 = extractelement <4 x float> %164, i32 1
- %167 = extractelement <4 x float> %164, i32 2
- %168 = extractelement <4 x float> %164, i32 3
- %169 = insertelement <4 x float> undef, float %165, i32 0
- %170 = insertelement <4 x float> %169, float %166, i32 1
- %171 = insertelement <4 x float> %170, float %167, i32 2
- %172 = insertelement <4 x float> %171, float %168, i32 3
- %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3)
- %174 = extractelement <4 x float> %173, i32 0
- %175 = extractelement <4 x float> %173, i32 1
- %176 = extractelement <4 x float> %173, i32 2
- %177 = extractelement <4 x float> %173, i32 3
- %178 = fmul float %176, 3.000000e+03
- %179 = fadd float %178, %28
- %180 = fdiv float 1.000000e+00, %33
- %181 = fmul float %32, %180
- %182 = call float @fabs(float %181)
- %183 = fmul float %174, 0x3FD99999A0000000
- %184 = fadd float %183, 0x3FAEB851E0000000
- %185 = fmul float %175, 0x3FE3333340000000
- %186 = fadd float %185, %184
- %187 = fmul float %176, 2.000000e+00
- %188 = fadd float %187, %186
- %189 = fmul float %177, 4.000000e+00
- %190 = fadd float %189, %188
- %191 = fmul float %154, 0x3FB99999A0000000
- %192 = fadd float %191, %190
- %193 = fmul float %155, 0x3FD99999A0000000
- %194 = fadd float %193, %192
- %195 = fmul float %156, 0x3FE99999A0000000
- %196 = fadd float %195, %194
- %197 = fmul float %157, 0x4000CCCCC0000000
- %198 = fadd float %197, %196
- %199 = fmul float 0xBE5EFB4CC0000000, %182
- %200 = fmul float %199, %182
- %201 = call float @llvm.AMDIL.exp.(float %200)
- %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000)
- %203 = fadd float %202, 0x3FF4CCCCC0000000
- %204 = fmul float %203, 0x3FE1C71C80000000
- %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00)
- %206 = fadd float %202, 0x3FF4CCCCC0000000
- %207 = fmul float %206, 0x3FE1C71C80000000
- %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00)
- %209 = fadd float %202, 2.000000e+00
- %210 = fmul float %209, 0x3FD611A7A0000000
- %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00)
- %212 = fmul float 2.000000e+00, %205
- %213 = fsub float -0.000000e+00, %212
- %214 = fadd float 3.000000e+00, %213
- %215 = fmul float %205, %214
- %216 = fmul float %205, %215
- %217 = fmul float 2.000000e+00, %208
- %218 = fsub float -0.000000e+00, %217
- %219 = fadd float 3.000000e+00, %218
- %220 = fmul float %208, %219
- %221 = fmul float %208, %220
- %222 = fmul float 2.000000e+00, %211
- %223 = fsub float -0.000000e+00, %222
- %224 = fadd float 3.000000e+00, %223
- %225 = fmul float %211, %224
- %226 = fmul float %211, %225
- %227 = fmul float %26, 0x3F368B5CC0000000
- %228 = fmul float %27, 0x3F368B5CC0000000
- %229 = insertelement <4 x float> undef, float %227, i32 0
- %230 = insertelement <4 x float> %229, float %228, i32 1
- %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2
- %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3
- %233 = extractelement <4 x float> %232, i32 0
- %234 = extractelement <4 x float> %232, i32 1
- %235 = insertelement <4 x float> undef, float %233, i32 0
- %236 = insertelement <4 x float> %235, float %234, i32 1
- %237 = insertelement <4 x float> %236, float undef, i32 2
- %238 = insertelement <4 x float> %237, float undef, i32 3
- %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2)
- %240 = extractelement <4 x float> %239, i32 0
- %241 = insertelement <4 x float> undef, float %240, i32 0
- %242 = insertelement <4 x float> %241, float %228, i32 1
- %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2
- %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3
- %245 = extractelement <4 x float> %244, i32 0
- %246 = insertelement <4 x float> undef, float %245, i32 0
- %247 = insertelement <4 x float> %246, float undef, i32 1
- %248 = insertelement <4 x float> %247, float undef, i32 2
- %249 = insertelement <4 x float> %248, float undef, i32 3
- %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1)
- %251 = extractelement <4 x float> %250, i32 0
- %252 = extractelement <4 x float> %250, i32 1
- %253 = extractelement <4 x float> %250, i32 2
- %254 = extractelement <4 x float> %250, i32 3
- %255 = fmul float %251, %216
- %256 = fmul float %252, %221
- %257 = fmul float %253, %226
- %258 = fmul float %254, 0.000000e+00
- %259 = fadd float %202, 0x3FF4CCCCC0000000
- %260 = fmul float %259, 0x3FE1C71C80000000
- %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00)
- %262 = fadd float %202, 0x3FF4CCCCC0000000
- %263 = fmul float %262, 0x3FE1C71C80000000
- %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00)
- %265 = fadd float %202, 2.000000e+00
- %266 = fmul float %265, 0x3FD611A7A0000000
- %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00)
- %268 = fmul float 2.000000e+00, %261
- %269 = fsub float -0.000000e+00, %268
- %270 = fadd float 3.000000e+00, %269
- %271 = fmul float %261, %270
- %272 = fmul float %261, %271
- %273 = fmul float 2.000000e+00, %264
- %274 = fsub float -0.000000e+00, %273
- %275 = fadd float 3.000000e+00, %274
- %276 = fmul float %264, %275
- %277 = fmul float %264, %276
- %278 = fmul float 2.000000e+00, %267
- %279 = fsub float -0.000000e+00, %278
- %280 = fadd float 3.000000e+00, %279
- %281 = fmul float %267, %280
- %282 = fmul float %267, %281
- %283 = fmul float %26, 0x3F22DFD6A0000000
- %284 = fmul float %27, 0x3F22DFD6A0000000
- %285 = insertelement <4 x float> undef, float %283, i32 0
- %286 = insertelement <4 x float> %285, float %284, i32 1
- %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2
- %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3
- %289 = extractelement <4 x float> %288, i32 0
- %290 = extractelement <4 x float> %288, i32 1
- %291 = insertelement <4 x float> undef, float %289, i32 0
- %292 = insertelement <4 x float> %291, float %290, i32 1
- %293 = insertelement <4 x float> %292, float undef, i32 2
- %294 = insertelement <4 x float> %293, float undef, i32 3
- %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2)
- %296 = extractelement <4 x float> %295, i32 0
- %297 = extractelement <4 x float> %295, i32 1
- %298 = extractelement <4 x float> %295, i32 2
- %299 = extractelement <4 x float> %295, i32 3
- %300 = fmul float %296, %272
- %301 = fmul float %297, %277
- %302 = fmul float %298, %282
- %303 = fmul float %299, 0.000000e+00
- %304 = fmul float %temp68.1, %37
- %305 = fmul float %temp68.1, %38
- %306 = fmul float %temp68.1, %39
- %307 = fmul float %temp69.0, %40
- %308 = fadd float %307, %304
- %309 = fmul float %temp69.0, %41
- %310 = fadd float %309, %305
- %311 = fmul float %temp69.0, %42
- %312 = fadd float %311, %306
- %313 = fmul float %temp70.0, %34
- %314 = fadd float %313, %308
- %315 = fmul float %temp70.0, %35
- %316 = fadd float %315, %310
- %317 = fmul float %temp70.0, %36
- %318 = fadd float %317, %312
- %319 = insertelement <4 x float> undef, float %314, i32 0
- %320 = insertelement <4 x float> %319, float %316, i32 1
- %321 = insertelement <4 x float> %320, float %318, i32 2
- %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3
- %323 = insertelement <4 x float> undef, float %314, i32 0
- %324 = insertelement <4 x float> %323, float %316, i32 1
- %325 = insertelement <4 x float> %324, float %318, i32 2
- %326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3
- %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326)
- %328 = call float @llvm.AMDGPU.rsq.f32(float %327)
- %329 = fmul float %314, %328
- %330 = fmul float %316, %328
- %331 = fmul float %318, %328
- %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
- %333 = extractelement <4 x float> %332, i32 0
- %334 = fsub float -0.000000e+00, %333
- %335 = fadd float 1.000000e+00, %334
- %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
- %337 = extractelement <4 x float> %336, i32 0
- %338 = fsub float -0.000000e+00, %337
- %339 = fadd float 1.000000e+00, %338
- %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
- %341 = extractelement <4 x float> %340, i32 0
- %342 = fsub float -0.000000e+00, %341
- %343 = fadd float 1.000000e+00, %342
- %344 = fsub float -0.000000e+00, %335
- %345 = fadd float %202, %344
- %346 = fsub float -0.000000e+00, %339
- %347 = fadd float %202, %346
- %348 = fadd float %347, 0xBFE3333340000000
- %349 = fsub float -0.000000e+00, %202
- %350 = fsub float -0.000000e+00, %343
- %351 = fadd float %349, %350
- %352 = insertelement <4 x float> undef, float %43, i32 0
- %353 = insertelement <4 x float> %352, float %44, i32 1
- %354 = insertelement <4 x float> %353, float %45, i32 2
- %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3
- %356 = insertelement <4 x float> undef, float %43, i32 0
- %357 = insertelement <4 x float> %356, float %44, i32 1
- %358 = insertelement <4 x float> %357, float %45, i32 2
- %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3
- %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359)
- %361 = call float @llvm.AMDGPU.rsq.f32(float %360)
- %362 = fmul float %45, %361
- %363 = call float @fabs(float %362)
- %364 = fmul float %176, 0x3FECCCCCC0000000
- %365 = fadd float %364, %363
- %366 = fadd float %365, 0xBFEFAE1480000000
- %367 = fmul float %366, 0xC023FFFFC0000000
- %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00)
- %369 = fsub float -0.000000e+00, %335
- %370 = fadd float %202, %369
- %371 = fadd float %370, 0x3FBEB851E0000000
- %372 = fsub float -0.000000e+00, %339
- %373 = fadd float %202, %372
- %374 = fadd float %373, 0xBFE0A3D700000000
- %375 = fsub float -0.000000e+00, %202
- %376 = fsub float -0.000000e+00, %343
- %377 = fadd float %375, %376
- %378 = insertelement <4 x float> undef, float %43, i32 0
- %379 = insertelement <4 x float> %378, float %44, i32 1
- %380 = insertelement <4 x float> %379, float %45, i32 2
- %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3
- %382 = insertelement <4 x float> undef, float %43, i32 0
- %383 = insertelement <4 x float> %382, float %44, i32 1
- %384 = insertelement <4 x float> %383, float %45, i32 2
- %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3
- %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385)
- %387 = call float @llvm.AMDGPU.rsq.f32(float %386)
- %388 = fmul float %45, %387
- %389 = call float @fabs(float %388)
- %390 = fmul float %176, 0x3FF51EB860000000
- %391 = fadd float %390, %389
- %392 = fadd float %391, 0xBFEFAE1480000000
- %393 = fmul float %392, 0xC0490001A0000000
- %394 = call float @llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00)
- %395 = fmul float 2.000000e+00, %368
- %396 = fsub float -0.000000e+00, %395
- %397 = fadd float 3.000000e+00, %396
- %398 = fmul float %368, %397
- %399 = fmul float %368, %398
- %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345)
- %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348)
- %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351)
- %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00)
- %404 = fmul float 2.000000e+00, %394
- %405 = fsub float -0.000000e+00, %404
- %406 = fadd float 3.000000e+00, %405
- %407 = fmul float %394, %406
- %408 = fmul float %394, %407
- %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371)
- %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374)
- %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377)
- %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000)
- %413 = fcmp oge float 2.200000e+03, %179
- %414 = sext i1 %413 to i32
- %415 = bitcast i32 %414 to float
- %416 = bitcast float %415 to i32
- %417 = icmp ne i32 %416, 0
- br i1 %417, label %IF161, label %ENDIF160
+ENDIF136: ; preds = %ENDIF154, %main_body
+ %temp68.1 = phi float [ %tmp603, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+ %temp69.0 = phi float [ %tmp605, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+ %temp70.0 = phi float [ %tmp607, %ENDIF154 ], [ 1.000000e+00, %main_body ]
+ %tmp138 = fmul float %tmp26, 0x3F847AE140000000
+ %tmp139 = fmul float %tmp27, 0x3F847AE140000000
+ %tmp140 = fmul float %tmp28, 0x3F847AE140000000
+ %tmp141 = insertelement <4 x float> undef, float %tmp138, i32 0
+ %tmp142 = insertelement <4 x float> %tmp141, float %tmp139, i32 1
+ %tmp143 = insertelement <4 x float> %tmp142, float %tmp140, i32 2
+ %tmp144 = insertelement <4 x float> %tmp143, float 0.000000e+00, i32 3
+ %tmp145 = extractelement <4 x float> %tmp144, i32 0
+ %tmp146 = extractelement <4 x float> %tmp144, i32 1
+ %tmp147 = extractelement <4 x float> %tmp144, i32 2
+ %tmp148 = extractelement <4 x float> %tmp144, i32 3
+ %tmp149 = insertelement <4 x float> undef, float %tmp145, i32 0
+ %tmp150 = insertelement <4 x float> %tmp149, float %tmp146, i32 1
+ %tmp151 = insertelement <4 x float> %tmp150, float %tmp147, i32 2
+ %tmp152 = insertelement <4 x float> %tmp151, float %tmp148, i32 3
+ %tmp153 = shufflevector <4 x float> %tmp152, <4 x float> %tmp152, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp154 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp153, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp155 = extractelement <4 x float> %tmp154, i32 0
+ %tmp156 = extractelement <4 x float> %tmp154, i32 1
+ %tmp157 = extractelement <4 x float> %tmp154, i32 2
+ %tmp158 = extractelement <4 x float> %tmp154, i32 3
+ %tmp159 = fmul float %tmp26, 0x3F45A07B40000000
+ %tmp160 = fmul float %tmp27, 0x3F45A07B40000000
+ %tmp161 = fmul float %tmp28, 0x3F45A07B40000000
+ %tmp162 = insertelement <4 x float> undef, float %tmp159, i32 0
+ %tmp163 = insertelement <4 x float> %tmp162, float %tmp160, i32 1
+ %tmp164 = insertelement <4 x float> %tmp163, float %tmp161, i32 2
+ %tmp165 = insertelement <4 x float> %tmp164, float 0.000000e+00, i32 3
+ %tmp166 = extractelement <4 x float> %tmp165, i32 0
+ %tmp167 = extractelement <4 x float> %tmp165, i32 1
+ %tmp168 = extractelement <4 x float> %tmp165, i32 2
+ %tmp169 = extractelement <4 x float> %tmp165, i32 3
+ %tmp170 = insertelement <4 x float> undef, float %tmp166, i32 0
+ %tmp171 = insertelement <4 x float> %tmp170, float %tmp167, i32 1
+ %tmp172 = insertelement <4 x float> %tmp171, float %tmp168, i32 2
+ %tmp173 = insertelement <4 x float> %tmp172, float %tmp169, i32 3
+ %tmp174 = shufflevector <4 x float> %tmp173, <4 x float> %tmp173, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp175 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp174, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp176 = extractelement <4 x float> %tmp175, i32 0
+ %tmp177 = extractelement <4 x float> %tmp175, i32 1
+ %tmp178 = extractelement <4 x float> %tmp175, i32 2
+ %tmp179 = extractelement <4 x float> %tmp175, i32 3
+ %tmp180 = fmul float %tmp178, 3.000000e+03
+ %tmp181 = fadd float %tmp180, %tmp28
+ %tmp182 = fdiv float 1.000000e+00, %tmp33
+ %tmp183 = fmul float %tmp32, %tmp182
+ %tmp184 = call float @llvm.fabs.f32(float %tmp183)
+ %tmp185 = fmul float %tmp176, 0x3FD99999A0000000
+ %tmp186 = fadd float %tmp185, 0x3FAEB851E0000000
+ %tmp187 = fmul float %tmp177, 0x3FE3333340000000
+ %tmp188 = fadd float %tmp187, %tmp186
+ %tmp189 = fmul float %tmp178, 2.000000e+00
+ %tmp190 = fadd float %tmp189, %tmp188
+ %tmp191 = fmul float %tmp179, 4.000000e+00
+ %tmp192 = fadd float %tmp191, %tmp190
+ %tmp193 = fmul float %tmp155, 0x3FB99999A0000000
+ %tmp194 = fadd float %tmp193, %tmp192
+ %tmp195 = fmul float %tmp156, 0x3FD99999A0000000
+ %tmp196 = fadd float %tmp195, %tmp194
+ %tmp197 = fmul float %tmp157, 0x3FE99999A0000000
+ %tmp198 = fadd float %tmp197, %tmp196
+ %tmp199 = fmul float %tmp158, 0x4000CCCCC0000000
+ %tmp200 = fadd float %tmp199, %tmp198
+ %tmp201 = fmul float 0xBE5EFB4CC0000000, %tmp184
+ %tmp202 = fmul float %tmp201, %tmp184
+ %tmp203 = call float @llvm.exp2.f32(float %tmp202)
+ %one.sub.a.i = fsub float 1.000000e+00, %tmp203
+ %one.sub.ac.i = fmul float %one.sub.a.i, 0x3FA99999A0000000
+ %mul.i = fmul float %tmp200, 0x3FA99999A0000000
+ %result.i = fadd float %mul.i, %one.sub.ac.i
+ %tmp204 = fadd float %result.i, 0x3FF4CCCCC0000000
+ %tmp205 = fmul float %tmp204, 0x3FE1C71C80000000
+ %tmp206 = call float @llvm.AMDGPU.clamp.f32(float %tmp205, float 0.000000e+00, float 1.000000e+00)
+ %tmp207 = fadd float %result.i, 0x3FF4CCCCC0000000
+ %tmp208 = fmul float %tmp207, 0x3FE1C71C80000000
+ %tmp209 = call float @llvm.AMDGPU.clamp.f32(float %tmp208, float 0.000000e+00, float 1.000000e+00)
+ %tmp210 = fadd float %result.i, 2.000000e+00
+ %tmp211 = fmul float %tmp210, 0x3FD611A7A0000000
+ %tmp212 = call float @llvm.AMDGPU.clamp.f32(float %tmp211, float 0.000000e+00, float 1.000000e+00)
+ %tmp213 = fmul float 2.000000e+00, %tmp206
+ %tmp214 = fsub float -0.000000e+00, %tmp213
+ %tmp215 = fadd float 3.000000e+00, %tmp214
+ %tmp216 = fmul float %tmp206, %tmp215
+ %tmp217 = fmul float %tmp206, %tmp216
+ %tmp218 = fmul float 2.000000e+00, %tmp209
+ %tmp219 = fsub float -0.000000e+00, %tmp218
+ %tmp220 = fadd float 3.000000e+00, %tmp219
+ %tmp221 = fmul float %tmp209, %tmp220
+ %tmp222 = fmul float %tmp209, %tmp221
+ %tmp223 = fmul float 2.000000e+00, %tmp212
+ %tmp224 = fsub float -0.000000e+00, %tmp223
+ %tmp225 = fadd float 3.000000e+00, %tmp224
+ %tmp226 = fmul float %tmp212, %tmp225
+ %tmp227 = fmul float %tmp212, %tmp226
+ %tmp228 = fmul float %tmp26, 0x3F368B5CC0000000
+ %tmp229 = fmul float %tmp27, 0x3F368B5CC0000000
+ %tmp230 = insertelement <4 x float> undef, float %tmp228, i32 0
+ %tmp231 = insertelement <4 x float> %tmp230, float %tmp229, i32 1
+ %tmp232 = insertelement <4 x float> %tmp231, float 0.000000e+00, i32 2
+ %tmp233 = insertelement <4 x float> %tmp232, float 0.000000e+00, i32 3
+ %tmp234 = extractelement <4 x float> %tmp233, i32 0
+ %tmp235 = extractelement <4 x float> %tmp233, i32 1
+ %tmp236 = insertelement <4 x float> undef, float %tmp234, i32 0
+ %tmp237 = insertelement <4 x float> %tmp236, float %tmp235, i32 1
+ %tmp238 = insertelement <4 x float> %tmp237, float undef, i32 2
+ %tmp239 = insertelement <4 x float> %tmp238, float undef, i32 3
+ %tmp240 = shufflevector <4 x float> %tmp239, <4 x float> %tmp239, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp241 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp240, i32 0, i32 0, i32 0, i32 17, i32 1, i32 1, i32 1, i32 1, i32 1)
+ %tmp242 = extractelement <4 x float> %tmp241, i32 0
+ %tmp243 = insertelement <4 x float> undef, float %tmp242, i32 0
+ %tmp244 = insertelement <4 x float> %tmp243, float %tmp229, i32 1
+ %tmp245 = insertelement <4 x float> %tmp244, float 0.000000e+00, i32 2
+ %tmp246 = insertelement <4 x float> %tmp245, float 0.000000e+00, i32 3
+ %tmp247 = extractelement <4 x float> %tmp246, i32 0
+ %tmp248 = insertelement <4 x float> undef, float %tmp247, i32 0
+ %tmp249 = insertelement <4 x float> %tmp248, float undef, i32 1
+ %tmp250 = insertelement <4 x float> %tmp249, float undef, i32 2
+ %tmp251 = insertelement <4 x float> %tmp250, float undef, i32 3
+ %tmp252 = shufflevector <4 x float> %tmp251, <4 x float> %tmp251, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp253 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp252, i32 0, i32 0, i32 0, i32 18, i32 2, i32 1, i32 1, i32 1, i32 1)
+ %tmp254 = extractelement <4 x float> %tmp253, i32 0
+ %tmp255 = extractelement <4 x float> %tmp253, i32 1
+ %tmp256 = extractelement <4 x float> %tmp253, i32 2
+ %tmp257 = extractelement <4 x float> %tmp253, i32 3
+ %tmp258 = fmul float %tmp254, %tmp217
+ %tmp259 = fmul float %tmp255, %tmp222
+ %tmp260 = fmul float %tmp256, %tmp227
+ %tmp261 = fmul float %tmp257, 0.000000e+00
+ %tmp262 = fadd float %result.i, 0x3FF4CCCCC0000000
+ %tmp263 = fmul float %tmp262, 0x3FE1C71C80000000
+ %tmp264 = call float @llvm.AMDGPU.clamp.f32(float %tmp263, float 0.000000e+00, float 1.000000e+00)
+ %tmp265 = fadd float %result.i, 0x3FF4CCCCC0000000
+ %tmp266 = fmul float %tmp265, 0x3FE1C71C80000000
+ %tmp267 = call float @llvm.AMDGPU.clamp.f32(float %tmp266, float 0.000000e+00, float 1.000000e+00)
+ %tmp268 = fadd float %result.i, 2.000000e+00
+ %tmp269 = fmul float %tmp268, 0x3FD611A7A0000000
+ %tmp270 = call float @llvm.AMDGPU.clamp.f32(float %tmp269, float 0.000000e+00, float 1.000000e+00)
+ %tmp271 = fmul float 2.000000e+00, %tmp264
+ %tmp272 = fsub float -0.000000e+00, %tmp271
+ %tmp273 = fadd float 3.000000e+00, %tmp272
+ %tmp274 = fmul float %tmp264, %tmp273
+ %tmp275 = fmul float %tmp264, %tmp274
+ %tmp276 = fmul float 2.000000e+00, %tmp267
+ %tmp277 = fsub float -0.000000e+00, %tmp276
+ %tmp278 = fadd float 3.000000e+00, %tmp277
+ %tmp279 = fmul float %tmp267, %tmp278
+ %tmp280 = fmul float %tmp267, %tmp279
+ %tmp281 = fmul float 2.000000e+00, %tmp270
+ %tmp282 = fsub float -0.000000e+00, %tmp281
+ %tmp283 = fadd float 3.000000e+00, %tmp282
+ %tmp284 = fmul float %tmp270, %tmp283
+ %tmp285 = fmul float %tmp270, %tmp284
+ %tmp286 = fmul float %tmp26, 0x3F22DFD6A0000000
+ %tmp287 = fmul float %tmp27, 0x3F22DFD6A0000000
+ %tmp288 = insertelement <4 x float> undef, float %tmp286, i32 0
+ %tmp289 = insertelement <4 x float> %tmp288, float %tmp287, i32 1
+ %tmp290 = insertelement <4 x float> %tmp289, float 0.000000e+00, i32 2
+ %tmp291 = insertelement <4 x float> %tmp290, float 0.000000e+00, i32 3
+ %tmp292 = extractelement <4 x float> %tmp291, i32 0
+ %tmp293 = extractelement <4 x float> %tmp291, i32 1
+ %tmp294 = insertelement <4 x float> undef, float %tmp292, i32 0
+ %tmp295 = insertelement <4 x float> %tmp294, float %tmp293, i32 1
+ %tmp296 = insertelement <4 x float> %tmp295, float undef, i32 2
+ %tmp297 = insertelement <4 x float> %tmp296, float undef, i32 3
+ %tmp298 = shufflevector <4 x float> %tmp297, <4 x float> %tmp297, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp299 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp298, i32 0, i32 0, i32 0, i32 19, i32 3, i32 1, i32 1, i32 1, i32 1)
+ %tmp300 = extractelement <4 x float> %tmp299, i32 0
+ %tmp301 = extractelement <4 x float> %tmp299, i32 1
+ %tmp302 = extractelement <4 x float> %tmp299, i32 2
+ %tmp303 = extractelement <4 x float> %tmp299, i32 3
+ %tmp304 = fmul float %tmp300, %tmp275
+ %tmp305 = fmul float %tmp301, %tmp280
+ %tmp306 = fmul float %tmp302, %tmp285
+ %tmp307 = fmul float %tmp303, 0.000000e+00
+ %tmp308 = fmul float %temp68.1, %tmp37
+ %tmp309 = fmul float %temp68.1, %tmp38
+ %tmp310 = fmul float %temp68.1, %tmp39
+ %tmp311 = fmul float %temp69.0, %tmp40
+ %tmp312 = fadd float %tmp311, %tmp308
+ %tmp313 = fmul float %temp69.0, %tmp41
+ %tmp314 = fadd float %tmp313, %tmp309
+ %tmp315 = fmul float %temp69.0, %tmp42
+ %tmp316 = fadd float %tmp315, %tmp310
+ %tmp317 = fmul float %temp70.0, %tmp34
+ %tmp318 = fadd float %tmp317, %tmp312
+ %tmp319 = fmul float %temp70.0, %tmp35
+ %tmp320 = fadd float %tmp319, %tmp314
+ %tmp321 = fmul float %temp70.0, %tmp36
+ %tmp322 = fadd float %tmp321, %tmp316
+ %tmp323 = insertelement <4 x float> undef, float %tmp318, i32 0
+ %tmp324 = insertelement <4 x float> %tmp323, float %tmp320, i32 1
+ %tmp325 = insertelement <4 x float> %tmp324, float %tmp322, i32 2
+ %tmp326 = insertelement <4 x float> %tmp325, float 0.000000e+00, i32 3
+ %tmp327 = insertelement <4 x float> undef, float %tmp318, i32 0
+ %tmp328 = insertelement <4 x float> %tmp327, float %tmp320, i32 1
+ %tmp329 = insertelement <4 x float> %tmp328, float %tmp322, i32 2
+ %tmp330 = insertelement <4 x float> %tmp329, float 0.000000e+00, i32 3
+ %tmp331 = call float @llvm.r600.dot4(<4 x float> %tmp326, <4 x float> %tmp330)
+ %tmp332 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp331)
+ %tmp333 = fmul float %tmp318, %tmp332
+ %tmp334 = fmul float %tmp320, %tmp332
+ %tmp335 = fmul float %tmp322, %tmp332
+ %tmp336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %tmp337 = extractelement <4 x float> %tmp336, i32 0
+ %tmp338 = fsub float -0.000000e+00, %tmp337
+ %tmp339 = fadd float 1.000000e+00, %tmp338
+ %tmp340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %tmp341 = extractelement <4 x float> %tmp340, i32 0
+ %tmp342 = fsub float -0.000000e+00, %tmp341
+ %tmp343 = fadd float 1.000000e+00, %tmp342
+ %tmp344 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %tmp345 = extractelement <4 x float> %tmp344, i32 0
+ %tmp346 = fsub float -0.000000e+00, %tmp345
+ %tmp347 = fadd float 1.000000e+00, %tmp346
+ %tmp348 = fsub float -0.000000e+00, %tmp339
+ %tmp349 = fadd float %result.i, %tmp348
+ %tmp350 = fsub float -0.000000e+00, %tmp343
+ %tmp351 = fadd float %result.i, %tmp350
+ %tmp352 = fadd float %tmp351, 0xBFE3333340000000
+ %tmp353 = fsub float -0.000000e+00, %result.i
+ %tmp354 = fsub float -0.000000e+00, %tmp347
+ %tmp355 = fadd float %tmp353, %tmp354
+ %tmp356 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp357 = insertelement <4 x float> %tmp356, float %tmp44, i32 1
+ %tmp358 = insertelement <4 x float> %tmp357, float %tmp45, i32 2
+ %tmp359 = insertelement <4 x float> %tmp358, float 0.000000e+00, i32 3
+ %tmp360 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp361 = insertelement <4 x float> %tmp360, float %tmp44, i32 1
+ %tmp362 = insertelement <4 x float> %tmp361, float %tmp45, i32 2
+ %tmp363 = insertelement <4 x float> %tmp362, float 0.000000e+00, i32 3
+ %tmp364 = call float @llvm.r600.dot4(<4 x float> %tmp359, <4 x float> %tmp363)
+ %tmp365 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp364)
+ %tmp366 = fmul float %tmp45, %tmp365
+ %tmp367 = call float @llvm.fabs.f32(float %tmp366)
+ %tmp368 = fmul float %tmp178, 0x3FECCCCCC0000000
+ %tmp369 = fadd float %tmp368, %tmp367
+ %tmp370 = fadd float %tmp369, 0xBFEFAE1480000000
+ %tmp371 = fmul float %tmp370, 0xC023FFFFC0000000
+ %tmp372 = call float @llvm.AMDGPU.clamp.f32(float %tmp371, float 0.000000e+00, float 1.000000e+00)
+ %tmp373 = fsub float -0.000000e+00, %tmp339
+ %tmp374 = fadd float %result.i, %tmp373
+ %tmp375 = fadd float %tmp374, 0x3FBEB851E0000000
+ %tmp376 = fsub float -0.000000e+00, %tmp343
+ %tmp377 = fadd float %result.i, %tmp376
+ %tmp378 = fadd float %tmp377, 0xBFE0A3D700000000
+ %tmp379 = fsub float -0.000000e+00, %result.i
+ %tmp380 = fsub float -0.000000e+00, %tmp347
+ %tmp381 = fadd float %tmp379, %tmp380
+ %tmp382 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp383 = insertelement <4 x float> %tmp382, float %tmp44, i32 1
+ %tmp384 = insertelement <4 x float> %tmp383, float %tmp45, i32 2
+ %tmp385 = insertelement <4 x float> %tmp384, float 0.000000e+00, i32 3
+ %tmp386 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp387 = insertelement <4 x float> %tmp386, float %tmp44, i32 1
+ %tmp388 = insertelement <4 x float> %tmp387, float %tmp45, i32 2
+ %tmp389 = insertelement <4 x float> %tmp388, float 0.000000e+00, i32 3
+ %tmp390 = call float @llvm.r600.dot4(<4 x float> %tmp385, <4 x float> %tmp389)
+ %tmp391 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp390)
+ %tmp392 = fmul float %tmp45, %tmp391
+ %tmp393 = call float @llvm.fabs.f32(float %tmp392)
+ %tmp394 = fmul float %tmp178, 0x3FF51EB860000000
+ %tmp395 = fadd float %tmp394, %tmp393
+ %tmp396 = fadd float %tmp395, 0xBFEFAE1480000000
+ %tmp397 = fmul float %tmp396, 0xC0490001A0000000
+ %tmp398 = call float @llvm.AMDGPU.clamp.f32(float %tmp397, float 0.000000e+00, float 1.000000e+00)
+ %tmp399 = fmul float 2.000000e+00, %tmp372
+ %tmp400 = fsub float -0.000000e+00, %tmp399
+ %tmp401 = fadd float 3.000000e+00, %tmp400
+ %tmp402 = fmul float %tmp372, %tmp401
+ %tmp403 = fmul float %tmp372, %tmp402
+ %one.sub.a.i169 = fsub float 1.000000e+00, %tmp403
+ %one.sub.ac.i170 = fmul float %one.sub.a.i169, %tmp349
+ %mul.i171 = fmul float %tmp258, %tmp349
+ %result.i172 = fadd float %mul.i171, %one.sub.ac.i170
+ %one.sub.a.i165 = fsub float 1.000000e+00, %tmp403
+ %one.sub.ac.i166 = fmul float %one.sub.a.i165, %tmp352
+ %mul.i167 = fmul float %tmp259, %tmp352
+ %result.i168 = fadd float %mul.i167, %one.sub.ac.i166
+ %one.sub.a.i161 = fsub float 1.000000e+00, %tmp403
+ %one.sub.ac.i162 = fmul float %one.sub.a.i161, %tmp355
+ %mul.i163 = fmul float %tmp260, %tmp355
+ %result.i164 = fadd float %mul.i163, %one.sub.ac.i162
+ %one.sub.a.i157 = fsub float 1.000000e+00, %tmp403
+ %one.sub.ac.i158 = fmul float %one.sub.a.i157, 0.000000e+00
+ %mul.i159 = fmul float %tmp261, 0.000000e+00
+ %result.i160 = fadd float %mul.i159, %one.sub.ac.i158
+ %tmp404 = fmul float 2.000000e+00, %tmp398
+ %tmp405 = fsub float -0.000000e+00, %tmp404
+ %tmp406 = fadd float 3.000000e+00, %tmp405
+ %tmp407 = fmul float %tmp398, %tmp406
+ %tmp408 = fmul float %tmp398, %tmp407
+ %one.sub.a.i153 = fsub float 1.000000e+00, %tmp408
+ %one.sub.ac.i154 = fmul float %one.sub.a.i153, %tmp375
+ %mul.i155 = fmul float %tmp258, %tmp375
+ %result.i156 = fadd float %mul.i155, %one.sub.ac.i154
+ %one.sub.a.i149 = fsub float 1.000000e+00, %tmp408
+ %one.sub.ac.i150 = fmul float %one.sub.a.i149, %tmp378
+ %mul.i151 = fmul float %tmp259, %tmp378
+ %result.i152 = fadd float %mul.i151, %one.sub.ac.i150
+ %one.sub.a.i145 = fsub float 1.000000e+00, %tmp408
+ %one.sub.ac.i146 = fmul float %one.sub.a.i145, %tmp381
+ %mul.i147 = fmul float %tmp260, %tmp381
+ %result.i148 = fadd float %mul.i147, %one.sub.ac.i146
+ %one.sub.a.i141 = fsub float 1.000000e+00, %tmp408
+ %one.sub.ac.i142 = fmul float %one.sub.a.i141, 0x3FD3333340000000
+ %mul.i143 = fmul float %tmp261, 0x3FD3333340000000
+ %result.i144 = fadd float %mul.i143, %one.sub.ac.i142
+ %tmp409 = fcmp oge float 2.200000e+03, %tmp181
+ %tmp410 = sext i1 %tmp409 to i32
+ %tmp411 = bitcast i32 %tmp410 to float
+ %tmp412 = bitcast float %tmp411 to i32
+ %tmp413 = icmp ne i32 %tmp412, 0
+ br i1 %tmp413, label %IF161, label %ENDIF160
LOOP: ; preds = %ENDIF139, %IF137
- %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ]
+ %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %tmp443, %ENDIF139 ]
%temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ]
- %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ]
- %418 = bitcast float %temp96.0 to i32
- %419 = icmp sge i32 %418, %137
- %420 = sext i1 %419 to i32
- %421 = bitcast i32 %420 to float
- %422 = bitcast float %421 to i32
- %423 = icmp ne i32 %422, 0
- br i1 %423, label %IF140, label %ENDIF139
+ %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %tmp475, %ENDIF139 ]
+ %tmp414 = bitcast float %temp96.0 to i32
+ %tmp415 = icmp sge i32 %tmp414, %tmp137
+ %tmp416 = sext i1 %tmp415 to i32
+ %tmp417 = bitcast i32 %tmp416 to float
+ %tmp418 = bitcast float %tmp417 to i32
+ %tmp419 = icmp ne i32 %tmp418, 0
+ br i1 %tmp419, label %IF140, label %ENDIF139
IF140: ; preds = %LOOP
- %424 = fmul float %133, 5.000000e-01
- %425 = fmul float %129, %temp92.0
- %426 = fadd float %425, %22
- %427 = fmul float %130, %temp92.0
- %428 = fadd float %427, %23
- %429 = insertelement <4 x float> undef, float %426, i32 0
- %430 = insertelement <4 x float> %429, float %428, i32 1
- %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2
- %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3
- %433 = extractelement <4 x float> %432, i32 0
- %434 = extractelement <4 x float> %432, i32 1
- %435 = insertelement <4 x float> undef, float %433, i32 0
- %436 = insertelement <4 x float> %435, float %434, i32 1
- %437 = insertelement <4 x float> %436, float undef, i32 2
- %438 = insertelement <4 x float> %437, float undef, i32 3
- %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2)
- %440 = extractelement <4 x float> %439, i32 3
- %441 = fcmp oge float %temp92.0, %440
- %442 = sext i1 %441 to i32
- %443 = bitcast i32 %442 to float
- %444 = bitcast float %443 to i32
- %445 = icmp ne i32 %444, 0
- br i1 %445, label %IF146, label %ENDIF145
+ %tmp420 = fmul float %tmp133, 5.000000e-01
+ %tmp421 = fmul float %tmp129, %temp92.0
+ %tmp422 = fadd float %tmp421, %tmp22
+ %tmp423 = fmul float %tmp130, %temp92.0
+ %tmp424 = fadd float %tmp423, %tmp23
+ %tmp425 = insertelement <4 x float> undef, float %tmp422, i32 0
+ %tmp426 = insertelement <4 x float> %tmp425, float %tmp424, i32 1
+ %tmp427 = insertelement <4 x float> %tmp426, float 0.000000e+00, i32 2
+ %tmp428 = insertelement <4 x float> %tmp427, float 0.000000e+00, i32 3
+ %tmp429 = extractelement <4 x float> %tmp428, i32 0
+ %tmp430 = extractelement <4 x float> %tmp428, i32 1
+ %tmp431 = insertelement <4 x float> undef, float %tmp429, i32 0
+ %tmp432 = insertelement <4 x float> %tmp431, float %tmp430, i32 1
+ %tmp433 = insertelement <4 x float> %tmp432, float undef, i32 2
+ %tmp434 = insertelement <4 x float> %tmp433, float undef, i32 3
+ %tmp435 = shufflevector <4 x float> %tmp434, <4 x float> %tmp434, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp436 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp435, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp437 = extractelement <4 x float> %tmp436, i32 3
+ %tmp438 = fcmp oge float %temp92.0, %tmp437
+ %tmp439 = sext i1 %tmp438 to i32
+ %tmp440 = bitcast i32 %tmp439 to float
+ %tmp441 = bitcast float %tmp440 to i32
+ %tmp442 = icmp ne i32 %tmp441, 0
+ br i1 %tmp442, label %IF146, label %ENDIF145
ENDIF139: ; preds = %LOOP
- %446 = fadd float %temp88.0, %133
- %447 = fmul float %129, %446
- %448 = fadd float %447, %22
- %449 = fmul float %130, %446
- %450 = fadd float %449, %23
- %451 = insertelement <4 x float> undef, float %448, i32 0
- %452 = insertelement <4 x float> %451, float %450, i32 1
- %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2
- %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3
- %455 = extractelement <4 x float> %454, i32 0
- %456 = extractelement <4 x float> %454, i32 1
- %457 = insertelement <4 x float> undef, float %455, i32 0
- %458 = insertelement <4 x float> %457, float %456, i32 1
- %459 = insertelement <4 x float> %458, float undef, i32 2
- %460 = insertelement <4 x float> %459, float undef, i32 3
- %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2)
- %462 = extractelement <4 x float> %461, i32 3
- %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0
- %464 = sext i1 %463 to i32
- %465 = bitcast i32 %464 to float
- %466 = fcmp oge float %446, %462
- %467 = sext i1 %466 to i32
- %468 = bitcast i32 %467 to float
- %469 = bitcast float %465 to i32
- %470 = bitcast float %468 to i32
- %471 = and i32 %469, %470
- %472 = bitcast i32 %471 to float
- %473 = bitcast float %472 to i32
- %474 = icmp ne i32 %473, 0
- %.temp92.0 = select i1 %474, float %446, float %temp92.0
- %475 = bitcast float %temp96.0 to i32
- %476 = add i32 %475, 1
- %477 = bitcast i32 %476 to float
+ %tmp443 = fadd float %temp88.0, %tmp133
+ %tmp444 = fmul float %tmp129, %tmp443
+ %tmp445 = fadd float %tmp444, %tmp22
+ %tmp446 = fmul float %tmp130, %tmp443
+ %tmp447 = fadd float %tmp446, %tmp23
+ %tmp448 = insertelement <4 x float> undef, float %tmp445, i32 0
+ %tmp449 = insertelement <4 x float> %tmp448, float %tmp447, i32 1
+ %tmp450 = insertelement <4 x float> %tmp449, float 0.000000e+00, i32 2
+ %tmp451 = insertelement <4 x float> %tmp450, float 0.000000e+00, i32 3
+ %tmp452 = extractelement <4 x float> %tmp451, i32 0
+ %tmp453 = extractelement <4 x float> %tmp451, i32 1
+ %tmp454 = insertelement <4 x float> undef, float %tmp452, i32 0
+ %tmp455 = insertelement <4 x float> %tmp454, float %tmp453, i32 1
+ %tmp456 = insertelement <4 x float> %tmp455, float undef, i32 2
+ %tmp457 = insertelement <4 x float> %tmp456, float undef, i32 3
+ %tmp458 = shufflevector <4 x float> %tmp457, <4 x float> %tmp457, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp459 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp458, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp460 = extractelement <4 x float> %tmp459, i32 3
+ %tmp461 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0
+ %tmp462 = sext i1 %tmp461 to i32
+ %tmp463 = bitcast i32 %tmp462 to float
+ %tmp464 = fcmp oge float %tmp443, %tmp460
+ %tmp465 = sext i1 %tmp464 to i32
+ %tmp466 = bitcast i32 %tmp465 to float
+ %tmp467 = bitcast float %tmp463 to i32
+ %tmp468 = bitcast float %tmp466 to i32
+ %tmp469 = and i32 %tmp467, %tmp468
+ %tmp470 = bitcast i32 %tmp469 to float
+ %tmp471 = bitcast float %tmp470 to i32
+ %tmp472 = icmp ne i32 %tmp471, 0
+ %.temp92.0 = select i1 %tmp472, float %tmp443, float %temp92.0
+ %tmp473 = bitcast float %temp96.0 to i32
+ %tmp474 = add i32 %tmp473, 1
+ %tmp475 = bitcast i32 %tmp474 to float
br label %LOOP
IF146: ; preds = %IF140
- %478 = fmul float 2.000000e+00, %424
- %479 = fsub float -0.000000e+00, %478
- %480 = fadd float %temp92.0, %479
+ %tmp476 = fmul float 2.000000e+00, %tmp420
+ %tmp477 = fsub float -0.000000e+00, %tmp476
+ %tmp478 = fadd float %temp92.0, %tmp477
br label %ENDIF145
-ENDIF145: ; preds = %IF140, %IF146
- %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ]
- %481 = fadd float %temp88.1, %424
- %482 = fmul float %424, 5.000000e-01
- %483 = fmul float %129, %481
- %484 = fadd float %483, %22
- %485 = fmul float %130, %481
- %486 = fadd float %485, %23
- %487 = insertelement <4 x float> undef, float %484, i32 0
- %488 = insertelement <4 x float> %487, float %486, i32 1
- %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2
- %490 = insertelement <4 x float> %489, float %440, i32 3
- %491 = extractelement <4 x float> %490, i32 0
- %492 = extractelement <4 x float> %490, i32 1
- %493 = insertelement <4 x float> undef, float %491, i32 0
- %494 = insertelement <4 x float> %493, float %492, i32 1
- %495 = insertelement <4 x float> %494, float undef, i32 2
- %496 = insertelement <4 x float> %495, float undef, i32 3
- %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2)
- %498 = extractelement <4 x float> %497, i32 3
- %499 = fcmp oge float %481, %498
- %500 = sext i1 %499 to i32
- %501 = bitcast i32 %500 to float
- %502 = bitcast float %501 to i32
- %503 = icmp ne i32 %502, 0
- br i1 %503, label %IF149, label %ENDIF148
+ENDIF145: ; preds = %IF146, %IF140
+ %temp88.1 = phi float [ %tmp478, %IF146 ], [ %temp92.0, %IF140 ]
+ %tmp479 = fadd float %temp88.1, %tmp420
+ %tmp480 = fmul float %tmp420, 5.000000e-01
+ %tmp481 = fmul float %tmp129, %tmp479
+ %tmp482 = fadd float %tmp481, %tmp22
+ %tmp483 = fmul float %tmp130, %tmp479
+ %tmp484 = fadd float %tmp483, %tmp23
+ %tmp485 = insertelement <4 x float> undef, float %tmp482, i32 0
+ %tmp486 = insertelement <4 x float> %tmp485, float %tmp484, i32 1
+ %tmp487 = insertelement <4 x float> %tmp486, float 0.000000e+00, i32 2
+ %tmp488 = insertelement <4 x float> %tmp487, float %tmp437, i32 3
+ %tmp489 = extractelement <4 x float> %tmp488, i32 0
+ %tmp490 = extractelement <4 x float> %tmp488, i32 1
+ %tmp491 = insertelement <4 x float> undef, float %tmp489, i32 0
+ %tmp492 = insertelement <4 x float> %tmp491, float %tmp490, i32 1
+ %tmp493 = insertelement <4 x float> %tmp492, float undef, i32 2
+ %tmp494 = insertelement <4 x float> %tmp493, float undef, i32 3
+ %tmp495 = shufflevector <4 x float> %tmp494, <4 x float> %tmp494, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp496 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp495, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp497 = extractelement <4 x float> %tmp496, i32 3
+ %tmp498 = fcmp oge float %tmp479, %tmp497
+ %tmp499 = sext i1 %tmp498 to i32
+ %tmp500 = bitcast i32 %tmp499 to float
+ %tmp501 = bitcast float %tmp500 to i32
+ %tmp502 = icmp ne i32 %tmp501, 0
+ br i1 %tmp502, label %IF149, label %ENDIF148
IF149: ; preds = %ENDIF145
- %504 = fmul float 2.000000e+00, %482
- %505 = fsub float -0.000000e+00, %504
- %506 = fadd float %481, %505
+ %tmp503 = fmul float 2.000000e+00, %tmp480
+ %tmp504 = fsub float -0.000000e+00, %tmp503
+ %tmp505 = fadd float %tmp479, %tmp504
br label %ENDIF148
-ENDIF148: ; preds = %ENDIF145, %IF149
- %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ]
- %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ]
- %507 = fadd float %temp88.2, %482
- %508 = fmul float %482, 5.000000e-01
- %509 = fmul float %129, %507
- %510 = fadd float %509, %22
- %511 = fmul float %130, %507
- %512 = fadd float %511, %23
- %513 = insertelement <4 x float> undef, float %510, i32 0
- %514 = insertelement <4 x float> %513, float %512, i32 1
- %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2
- %516 = insertelement <4 x float> %515, float %498, i32 3
- %517 = extractelement <4 x float> %516, i32 0
- %518 = extractelement <4 x float> %516, i32 1
- %519 = insertelement <4 x float> undef, float %517, i32 0
- %520 = insertelement <4 x float> %519, float %518, i32 1
- %521 = insertelement <4 x float> %520, float undef, i32 2
- %522 = insertelement <4 x float> %521, float undef, i32 3
- %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2)
- %524 = extractelement <4 x float> %523, i32 3
- %525 = fcmp oge float %507, %524
- %526 = sext i1 %525 to i32
- %527 = bitcast i32 %526 to float
- %528 = bitcast float %527 to i32
- %529 = icmp ne i32 %528, 0
- br i1 %529, label %IF152, label %ENDIF151
+ENDIF148: ; preds = %IF149, %ENDIF145
+ %temp88.2 = phi float [ %tmp505, %IF149 ], [ %tmp479, %ENDIF145 ]
+ %temp92.2 = phi float [ %tmp479, %IF149 ], [ %temp92.0, %ENDIF145 ]
+ %tmp506 = fadd float %temp88.2, %tmp480
+ %tmp507 = fmul float %tmp480, 5.000000e-01
+ %tmp508 = fmul float %tmp129, %tmp506
+ %tmp509 = fadd float %tmp508, %tmp22
+ %tmp510 = fmul float %tmp130, %tmp506
+ %tmp511 = fadd float %tmp510, %tmp23
+ %tmp512 = insertelement <4 x float> undef, float %tmp509, i32 0
+ %tmp513 = insertelement <4 x float> %tmp512, float %tmp511, i32 1
+ %tmp514 = insertelement <4 x float> %tmp513, float 0.000000e+00, i32 2
+ %tmp515 = insertelement <4 x float> %tmp514, float %tmp497, i32 3
+ %tmp516 = extractelement <4 x float> %tmp515, i32 0
+ %tmp517 = extractelement <4 x float> %tmp515, i32 1
+ %tmp518 = insertelement <4 x float> undef, float %tmp516, i32 0
+ %tmp519 = insertelement <4 x float> %tmp518, float %tmp517, i32 1
+ %tmp520 = insertelement <4 x float> %tmp519, float undef, i32 2
+ %tmp521 = insertelement <4 x float> %tmp520, float undef, i32 3
+ %tmp522 = shufflevector <4 x float> %tmp521, <4 x float> %tmp521, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp523 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp522, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp524 = extractelement <4 x float> %tmp523, i32 3
+ %tmp525 = fcmp oge float %tmp506, %tmp524
+ %tmp526 = sext i1 %tmp525 to i32
+ %tmp527 = bitcast i32 %tmp526 to float
+ %tmp528 = bitcast float %tmp527 to i32
+ %tmp529 = icmp ne i32 %tmp528, 0
+ br i1 %tmp529, label %IF152, label %ENDIF151
IF152: ; preds = %ENDIF148
- %530 = fmul float 2.000000e+00, %508
- %531 = fsub float -0.000000e+00, %530
- %532 = fadd float %507, %531
+ %tmp530 = fmul float 2.000000e+00, %tmp507
+ %tmp531 = fsub float -0.000000e+00, %tmp530
+ %tmp532 = fadd float %tmp506, %tmp531
br label %ENDIF151
-ENDIF151: ; preds = %ENDIF148, %IF152
- %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ]
- %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ]
- %533 = fadd float %temp88.3, %508
- %534 = fmul float %508, 5.000000e-01
- %535 = fmul float %129, %533
- %536 = fadd float %535, %22
- %537 = fmul float %130, %533
- %538 = fadd float %537, %23
- %539 = insertelement <4 x float> undef, float %536, i32 0
- %540 = insertelement <4 x float> %539, float %538, i32 1
- %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2
- %542 = insertelement <4 x float> %541, float %524, i32 3
- %543 = extractelement <4 x float> %542, i32 0
- %544 = extractelement <4 x float> %542, i32 1
- %545 = insertelement <4 x float> undef, float %543, i32 0
- %546 = insertelement <4 x float> %545, float %544, i32 1
- %547 = insertelement <4 x float> %546, float undef, i32 2
- %548 = insertelement <4 x float> %547, float undef, i32 3
- %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2)
- %550 = extractelement <4 x float> %549, i32 3
- %551 = fcmp oge float %533, %550
- %552 = sext i1 %551 to i32
- %553 = bitcast i32 %552 to float
- %554 = bitcast float %553 to i32
- %555 = icmp ne i32 %554, 0
- br i1 %555, label %IF155, label %ENDIF154
+ENDIF151: ; preds = %IF152, %ENDIF148
+ %temp88.3 = phi float [ %tmp532, %IF152 ], [ %tmp506, %ENDIF148 ]
+ %temp92.3 = phi float [ %tmp506, %IF152 ], [ %temp92.2, %ENDIF148 ]
+ %tmp533 = fadd float %temp88.3, %tmp507
+ %tmp534 = fmul float %tmp507, 5.000000e-01
+ %tmp535 = fmul float %tmp129, %tmp533
+ %tmp536 = fadd float %tmp535, %tmp22
+ %tmp537 = fmul float %tmp130, %tmp533
+ %tmp538 = fadd float %tmp537, %tmp23
+ %tmp539 = insertelement <4 x float> undef, float %tmp536, i32 0
+ %tmp540 = insertelement <4 x float> %tmp539, float %tmp538, i32 1
+ %tmp541 = insertelement <4 x float> %tmp540, float 0.000000e+00, i32 2
+ %tmp542 = insertelement <4 x float> %tmp541, float %tmp524, i32 3
+ %tmp543 = extractelement <4 x float> %tmp542, i32 0
+ %tmp544 = extractelement <4 x float> %tmp542, i32 1
+ %tmp545 = insertelement <4 x float> undef, float %tmp543, i32 0
+ %tmp546 = insertelement <4 x float> %tmp545, float %tmp544, i32 1
+ %tmp547 = insertelement <4 x float> %tmp546, float undef, i32 2
+ %tmp548 = insertelement <4 x float> %tmp547, float undef, i32 3
+ %tmp549 = shufflevector <4 x float> %tmp548, <4 x float> %tmp548, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp550 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp549, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp551 = extractelement <4 x float> %tmp550, i32 3
+ %tmp552 = fcmp oge float %tmp533, %tmp551
+ %tmp553 = sext i1 %tmp552 to i32
+ %tmp554 = bitcast i32 %tmp553 to float
+ %tmp555 = bitcast float %tmp554 to i32
+ %tmp556 = icmp ne i32 %tmp555, 0
+ br i1 %tmp556, label %IF155, label %ENDIF154
IF155: ; preds = %ENDIF151
- %556 = fmul float 2.000000e+00, %534
- %557 = fsub float -0.000000e+00, %556
- %558 = fadd float %533, %557
+ %tmp557 = fmul float 2.000000e+00, %tmp534
+ %tmp558 = fsub float -0.000000e+00, %tmp557
+ %tmp559 = fadd float %tmp533, %tmp558
br label %ENDIF154
-ENDIF154: ; preds = %ENDIF151, %IF155
- %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ]
- %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ]
- %559 = fadd float %temp88.4, %534
- %560 = fmul float %129, %559
- %561 = fadd float %560, %22
- %562 = fmul float %130, %559
- %563 = fadd float %562, %23
- %564 = insertelement <4 x float> undef, float %561, i32 0
- %565 = insertelement <4 x float> %564, float %563, i32 1
- %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2
- %567 = insertelement <4 x float> %566, float %550, i32 3
- %568 = extractelement <4 x float> %567, i32 0
- %569 = extractelement <4 x float> %567, i32 1
- %570 = insertelement <4 x float> undef, float %568, i32 0
- %571 = insertelement <4 x float> %570, float %569, i32 1
- %572 = insertelement <4 x float> %571, float undef, i32 2
- %573 = insertelement <4 x float> %572, float undef, i32 3
- %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2)
- %575 = extractelement <4 x float> %574, i32 3
- %576 = fcmp oge float %559, %575
- %577 = sext i1 %576 to i32
- %578 = bitcast i32 %577 to float
- %579 = bitcast float %578 to i32
- %580 = icmp ne i32 %579, 0
- %.temp92.4 = select i1 %580, float %559, float %temp92.4
- %581 = fmul float %129, %.temp92.4
- %582 = fadd float %581, %22
- %583 = fmul float %130, %.temp92.4
- %584 = fadd float %583, %23
- %585 = insertelement <4 x float> undef, float %582, i32 0
- %586 = insertelement <4 x float> %585, float %584, i32 1
- %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2
- %588 = insertelement <4 x float> %587, float %575, i32 3
- %589 = extractelement <4 x float> %588, i32 0
- %590 = extractelement <4 x float> %588, i32 1
- %591 = insertelement <4 x float> undef, float %589, i32 0
- %592 = insertelement <4 x float> %591, float %590, i32 1
- %593 = insertelement <4 x float> %592, float undef, i32 2
- %594 = insertelement <4 x float> %593, float undef, i32 3
- %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2)
- %596 = extractelement <4 x float> %595, i32 0
- %597 = extractelement <4 x float> %595, i32 1
- %598 = extractelement <4 x float> %595, i32 2
- %599 = fmul float %596, 2.000000e+00
- %600 = fadd float %599, -1.000000e+00
- %601 = fmul float %597, 2.000000e+00
- %602 = fadd float %601, -1.000000e+00
- %603 = fmul float %598, 2.000000e+00
- %604 = fadd float %603, -1.000000e+00
+ENDIF154: ; preds = %IF155, %ENDIF151
+ %temp88.4 = phi float [ %tmp559, %IF155 ], [ %tmp533, %ENDIF151 ]
+ %temp92.4 = phi float [ %tmp533, %IF155 ], [ %temp92.3, %ENDIF151 ]
+ %tmp560 = fadd float %temp88.4, %tmp534
+ %tmp561 = fmul float %tmp129, %tmp560
+ %tmp562 = fadd float %tmp561, %tmp22
+ %tmp563 = fmul float %tmp130, %tmp560
+ %tmp564 = fadd float %tmp563, %tmp23
+ %tmp565 = insertelement <4 x float> undef, float %tmp562, i32 0
+ %tmp566 = insertelement <4 x float> %tmp565, float %tmp564, i32 1
+ %tmp567 = insertelement <4 x float> %tmp566, float 0.000000e+00, i32 2
+ %tmp568 = insertelement <4 x float> %tmp567, float %tmp551, i32 3
+ %tmp569 = extractelement <4 x float> %tmp568, i32 0
+ %tmp570 = extractelement <4 x float> %tmp568, i32 1
+ %tmp571 = insertelement <4 x float> undef, float %tmp569, i32 0
+ %tmp572 = insertelement <4 x float> %tmp571, float %tmp570, i32 1
+ %tmp573 = insertelement <4 x float> %tmp572, float undef, i32 2
+ %tmp574 = insertelement <4 x float> %tmp573, float undef, i32 3
+ %tmp575 = shufflevector <4 x float> %tmp574, <4 x float> %tmp574, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp576 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp575, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp577 = extractelement <4 x float> %tmp576, i32 3
+ %tmp578 = fcmp oge float %tmp560, %tmp577
+ %tmp579 = sext i1 %tmp578 to i32
+ %tmp580 = bitcast i32 %tmp579 to float
+ %tmp581 = bitcast float %tmp580 to i32
+ %tmp582 = icmp ne i32 %tmp581, 0
+ %.temp92.4 = select i1 %tmp582, float %tmp560, float %temp92.4
+ %tmp583 = fmul float %tmp129, %.temp92.4
+ %tmp584 = fadd float %tmp583, %tmp22
+ %tmp585 = fmul float %tmp130, %.temp92.4
+ %tmp586 = fadd float %tmp585, %tmp23
+ %tmp587 = insertelement <4 x float> undef, float %tmp584, i32 0
+ %tmp588 = insertelement <4 x float> %tmp587, float %tmp586, i32 1
+ %tmp589 = insertelement <4 x float> %tmp588, float 0.000000e+00, i32 2
+ %tmp590 = insertelement <4 x float> %tmp589, float %tmp577, i32 3
+ %tmp591 = extractelement <4 x float> %tmp590, i32 0
+ %tmp592 = extractelement <4 x float> %tmp590, i32 1
+ %tmp593 = insertelement <4 x float> undef, float %tmp591, i32 0
+ %tmp594 = insertelement <4 x float> %tmp593, float %tmp592, i32 1
+ %tmp595 = insertelement <4 x float> %tmp594, float undef, i32 2
+ %tmp596 = insertelement <4 x float> %tmp595, float undef, i32 3
+ %tmp597 = shufflevector <4 x float> %tmp596, <4 x float> %tmp596, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp598 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp597, i32 0, i32 0, i32 0, i32 20, i32 4, i32 1, i32 1, i32 1, i32 1)
+ %tmp599 = extractelement <4 x float> %tmp598, i32 0
+ %tmp600 = extractelement <4 x float> %tmp598, i32 1
+ %tmp601 = extractelement <4 x float> %tmp598, i32 2
+ %tmp602 = fmul float %tmp599, 2.000000e+00
+ %tmp603 = fadd float %tmp602, -1.000000e+00
+ %tmp604 = fmul float %tmp600, 2.000000e+00
+ %tmp605 = fadd float %tmp604, -1.000000e+00
+ %tmp606 = fmul float %tmp601, 2.000000e+00
+ %tmp607 = fadd float %tmp606, -1.000000e+00
br label %ENDIF136
IF161: ; preds = %ENDIF136
- %605 = fmul float %202, 0x3FB99999A0000000
- %606 = fcmp uge float 0x3FE4CCCCC0000000, %605
- %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605
- %608 = fcmp uge float %607, 5.000000e-01
- %609 = select i1 %608, float 5.000000e-01, float %607
- %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300)
- %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301)
- %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302)
- %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303)
- %614 = insertelement <4 x float> undef, float %329, i32 0
- %615 = insertelement <4 x float> %614, float %330, i32 1
- %616 = insertelement <4 x float> %615, float %331, i32 2
- %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3
- %618 = insertelement <4 x float> undef, float %63, i32 0
- %619 = insertelement <4 x float> %618, float %65, i32 1
- %620 = insertelement <4 x float> %619, float %67, i32 2
- %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3
- %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621)
- %623 = fcmp uge float 0x3FE6666660000000, %622
- %624 = select i1 %623, float 0x3FE6666660000000, float %622
- %625 = fmul float %8, %624
- %626 = fmul float %13, %624
- %627 = fmul float %18, %624
- %628 = insertelement <4 x float> undef, float %34, i32 0
- %629 = insertelement <4 x float> %628, float %35, i32 1
- %630 = insertelement <4 x float> %629, float %36, i32 2
- %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3
- %632 = insertelement <4 x float> undef, float %63, i32 0
- %633 = insertelement <4 x float> %632, float %65, i32 1
- %634 = insertelement <4 x float> %633, float %67, i32 2
- %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3
- %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635)
- %637 = fcmp uge float 0x3FECCCCCC0000000, %636
- %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636
- %639 = fmul float %625, %638
- %640 = fmul float %626, %638
- %641 = fmul float %627, %638
+ %tmp608 = fmul float %result.i, 0x3FB99999A0000000
+ %tmp609 = fcmp uge float 0x3FE4CCCCC0000000, %tmp608
+ %tmp610 = select i1 %tmp609, float 0x3FE4CCCCC0000000, float %tmp608
+ %tmp611 = fcmp uge float %tmp610, 5.000000e-01
+ %tmp612 = select i1 %tmp611, float 5.000000e-01, float %tmp610
+ %one.sub.a.i137 = fsub float 1.000000e+00, %tmp612
+ %one.sub.ac.i138 = fmul float %one.sub.a.i137, %tmp304
+ %mul.i139 = fmul float %result.i172, %tmp304
+ %result.i140 = fadd float %mul.i139, %one.sub.ac.i138
+ %one.sub.a.i133 = fsub float 1.000000e+00, %tmp612
+ %one.sub.ac.i134 = fmul float %one.sub.a.i133, %tmp305
+ %mul.i135 = fmul float %result.i168, %tmp305
+ %result.i136 = fadd float %mul.i135, %one.sub.ac.i134
+ %one.sub.a.i129 = fsub float 1.000000e+00, %tmp612
+ %one.sub.ac.i130 = fmul float %one.sub.a.i129, %tmp306
+ %mul.i131 = fmul float %result.i164, %tmp306
+ %result.i132 = fadd float %mul.i131, %one.sub.ac.i130
+ %one.sub.a.i125 = fsub float 1.000000e+00, %tmp612
+ %one.sub.ac.i126 = fmul float %one.sub.a.i125, %tmp307
+ %mul.i127 = fmul float %result.i160, %tmp307
+ %result.i128 = fadd float %mul.i127, %one.sub.ac.i126
+ %tmp613 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp614 = insertelement <4 x float> %tmp613, float %tmp334, i32 1
+ %tmp615 = insertelement <4 x float> %tmp614, float %tmp335, i32 2
+ %tmp616 = insertelement <4 x float> %tmp615, float 0.000000e+00, i32 3
+ %tmp617 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp618 = insertelement <4 x float> %tmp617, float %tmp65, i32 1
+ %tmp619 = insertelement <4 x float> %tmp618, float %tmp67, i32 2
+ %tmp620 = insertelement <4 x float> %tmp619, float 0.000000e+00, i32 3
+ %tmp621 = call float @llvm.r600.dot4(<4 x float> %tmp616, <4 x float> %tmp620)
+ %tmp622 = fcmp uge float 0x3FE6666660000000, %tmp621
+ %tmp623 = select i1 %tmp622, float 0x3FE6666660000000, float %tmp621
+ %tmp624 = fmul float %tmp8, %tmp623
+ %tmp625 = fmul float %tmp13, %tmp623
+ %tmp626 = fmul float %tmp18, %tmp623
+ %tmp627 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp628 = insertelement <4 x float> %tmp627, float %tmp35, i32 1
+ %tmp629 = insertelement <4 x float> %tmp628, float %tmp36, i32 2
+ %tmp630 = insertelement <4 x float> %tmp629, float 0.000000e+00, i32 3
+ %tmp631 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp632 = insertelement <4 x float> %tmp631, float %tmp65, i32 1
+ %tmp633 = insertelement <4 x float> %tmp632, float %tmp67, i32 2
+ %tmp634 = insertelement <4 x float> %tmp633, float 0.000000e+00, i32 3
+ %tmp635 = call float @llvm.r600.dot4(<4 x float> %tmp630, <4 x float> %tmp634)
+ %tmp636 = fcmp uge float 0x3FECCCCCC0000000, %tmp635
+ %tmp637 = select i1 %tmp636, float 0x3FECCCCCC0000000, float %tmp635
+ %tmp638 = fmul float %tmp624, %tmp637
+ %tmp639 = fmul float %tmp625, %tmp637
+ %tmp640 = fmul float %tmp626, %tmp637
br label %ENDIF160
-ENDIF160: ; preds = %ENDIF136, %IF161
- %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ]
- %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ]
- %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ]
- %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ]
- %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ]
- %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
- %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
- %642 = fcmp olt float 2.200000e+03, %179
- %643 = sext i1 %642 to i32
- %644 = bitcast i32 %643 to float
- %645 = fcmp olt float %179, 2.300000e+03
- %646 = sext i1 %645 to i32
- %647 = bitcast i32 %646 to float
- %648 = bitcast float %644 to i32
- %649 = bitcast float %647 to i32
- %650 = and i32 %648, %649
- %651 = bitcast i32 %650 to float
- %652 = bitcast float %651 to i32
- %653 = icmp ne i32 %652, 0
- br i1 %653, label %IF164, label %ENDIF163
+ENDIF160: ; preds = %IF161, %ENDIF136
+ %temp84.0 = phi float [ %result.i140, %IF161 ], [ %tmp258, %ENDIF136 ]
+ %temp85.0 = phi float [ %result.i136, %IF161 ], [ %tmp259, %ENDIF136 ]
+ %temp86.0 = phi float [ %result.i132, %IF161 ], [ %tmp260, %ENDIF136 ]
+ %temp87.0 = phi float [ %result.i128, %IF161 ], [ %tmp261, %ENDIF136 ]
+ %temp92.6 = phi float [ %tmp638, %IF161 ], [ %tmp411, %ENDIF136 ]
+ %temp93.0 = phi float [ %tmp639, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+ %temp94.0 = phi float [ %tmp640, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+ %tmp641 = fcmp olt float 2.200000e+03, %tmp181
+ %tmp642 = sext i1 %tmp641 to i32
+ %tmp643 = bitcast i32 %tmp642 to float
+ %tmp644 = fcmp olt float %tmp181, 2.300000e+03
+ %tmp645 = sext i1 %tmp644 to i32
+ %tmp646 = bitcast i32 %tmp645 to float
+ %tmp647 = bitcast float %tmp643 to i32
+ %tmp648 = bitcast float %tmp646 to i32
+ %tmp649 = and i32 %tmp647, %tmp648
+ %tmp650 = bitcast i32 %tmp649 to float
+ %tmp651 = bitcast float %tmp650 to i32
+ %tmp652 = icmp ne i32 %tmp651, 0
+ br i1 %tmp652, label %IF164, label %ENDIF163
IF164: ; preds = %ENDIF160
- %654 = fmul float %202, 5.000000e-01
- %655 = fcmp uge float 0x3FE4CCCCC0000000, %654
- %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654
- %657 = fcmp uge float %656, 0x3FD6666660000000
- %658 = select i1 %657, float 0x3FD6666660000000, float %656
- %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300)
- %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301)
- %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302)
- %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303)
- %663 = insertelement <4 x float> undef, float %329, i32 0
- %664 = insertelement <4 x float> %663, float %330, i32 1
- %665 = insertelement <4 x float> %664, float %331, i32 2
- %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3
- %667 = insertelement <4 x float> undef, float %63, i32 0
- %668 = insertelement <4 x float> %667, float %65, i32 1
- %669 = insertelement <4 x float> %668, float %67, i32 2
- %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3
- %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670)
- %672 = fcmp uge float 0x3FE6666660000000, %671
- %673 = select i1 %672, float 0x3FE6666660000000, float %671
- %674 = fmul float %8, %673
- %675 = fmul float %13, %673
- %676 = fmul float %18, %673
- %677 = insertelement <4 x float> undef, float %34, i32 0
- %678 = insertelement <4 x float> %677, float %35, i32 1
- %679 = insertelement <4 x float> %678, float %36, i32 2
- %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3
- %681 = insertelement <4 x float> undef, float %63, i32 0
- %682 = insertelement <4 x float> %681, float %65, i32 1
- %683 = insertelement <4 x float> %682, float %67, i32 2
- %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3
- %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684)
- %686 = fcmp uge float 0x3FECCCCCC0000000, %685
- %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685
- %688 = fmul float %674, %687
- %689 = fmul float %675, %687
- %690 = fmul float %676, %687
+ %tmp653 = fmul float %result.i, 5.000000e-01
+ %tmp654 = fcmp uge float 0x3FE4CCCCC0000000, %tmp653
+ %tmp655 = select i1 %tmp654, float 0x3FE4CCCCC0000000, float %tmp653
+ %tmp656 = fcmp uge float %tmp655, 0x3FD6666660000000
+ %tmp657 = select i1 %tmp656, float 0x3FD6666660000000, float %tmp655
+ %one.sub.a.i121 = fsub float 1.000000e+00, %tmp657
+ %one.sub.ac.i122 = fmul float %one.sub.a.i121, %tmp304
+ %mul.i123 = fmul float %result.i172, %tmp304
+ %result.i124 = fadd float %mul.i123, %one.sub.ac.i122
+ %one.sub.a.i117 = fsub float 1.000000e+00, %tmp657
+ %one.sub.ac.i118 = fmul float %one.sub.a.i117, %tmp305
+ %mul.i119 = fmul float %result.i168, %tmp305
+ %result.i120 = fadd float %mul.i119, %one.sub.ac.i118
+ %one.sub.a.i113 = fsub float 1.000000e+00, %tmp657
+ %one.sub.ac.i114 = fmul float %one.sub.a.i113, %tmp306
+ %mul.i115 = fmul float %result.i164, %tmp306
+ %result.i116 = fadd float %mul.i115, %one.sub.ac.i114
+ %one.sub.a.i109 = fsub float 1.000000e+00, %tmp657
+ %one.sub.ac.i110 = fmul float %one.sub.a.i109, %tmp307
+ %mul.i111 = fmul float %result.i160, %tmp307
+ %result.i112 = fadd float %mul.i111, %one.sub.ac.i110
+ %tmp658 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp659 = insertelement <4 x float> %tmp658, float %tmp334, i32 1
+ %tmp660 = insertelement <4 x float> %tmp659, float %tmp335, i32 2
+ %tmp661 = insertelement <4 x float> %tmp660, float 0.000000e+00, i32 3
+ %tmp662 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp663 = insertelement <4 x float> %tmp662, float %tmp65, i32 1
+ %tmp664 = insertelement <4 x float> %tmp663, float %tmp67, i32 2
+ %tmp665 = insertelement <4 x float> %tmp664, float 0.000000e+00, i32 3
+ %tmp666 = call float @llvm.r600.dot4(<4 x float> %tmp661, <4 x float> %tmp665)
+ %tmp667 = fcmp uge float 0x3FE6666660000000, %tmp666
+ %tmp668 = select i1 %tmp667, float 0x3FE6666660000000, float %tmp666
+ %tmp669 = fmul float %tmp8, %tmp668
+ %tmp670 = fmul float %tmp13, %tmp668
+ %tmp671 = fmul float %tmp18, %tmp668
+ %tmp672 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp673 = insertelement <4 x float> %tmp672, float %tmp35, i32 1
+ %tmp674 = insertelement <4 x float> %tmp673, float %tmp36, i32 2
+ %tmp675 = insertelement <4 x float> %tmp674, float 0.000000e+00, i32 3
+ %tmp676 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp677 = insertelement <4 x float> %tmp676, float %tmp65, i32 1
+ %tmp678 = insertelement <4 x float> %tmp677, float %tmp67, i32 2
+ %tmp679 = insertelement <4 x float> %tmp678, float 0.000000e+00, i32 3
+ %tmp680 = call float @llvm.r600.dot4(<4 x float> %tmp675, <4 x float> %tmp679)
+ %tmp681 = fcmp uge float 0x3FECCCCCC0000000, %tmp680
+ %tmp682 = select i1 %tmp681, float 0x3FECCCCCC0000000, float %tmp680
+ %tmp683 = fmul float %tmp669, %tmp682
+ %tmp684 = fmul float %tmp670, %tmp682
+ %tmp685 = fmul float %tmp671, %tmp682
br label %ENDIF163
-ENDIF163: ; preds = %ENDIF160, %IF164
- %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ]
- %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ]
- %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ]
- %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ]
- %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ]
- %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ]
- %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ]
- %691 = fcmp oge float %179, 2.300000e+03
- %692 = sext i1 %691 to i32
- %693 = bitcast i32 %692 to float
- %694 = fcmp olt float %179, 2.480000e+03
- %695 = sext i1 %694 to i32
- %696 = bitcast i32 %695 to float
- %697 = bitcast float %693 to i32
- %698 = bitcast float %696 to i32
- %699 = and i32 %697, %698
- %700 = bitcast i32 %699 to float
- %701 = bitcast float %700 to i32
- %702 = icmp ne i32 %701, 0
- br i1 %702, label %IF167, label %ENDIF166
+ENDIF163: ; preds = %IF164, %ENDIF160
+ %temp84.1 = phi float [ %result.i124, %IF164 ], [ %temp84.0, %ENDIF160 ]
+ %temp85.1 = phi float [ %result.i120, %IF164 ], [ %temp85.0, %ENDIF160 ]
+ %temp86.1 = phi float [ %result.i116, %IF164 ], [ %temp86.0, %ENDIF160 ]
+ %temp87.1 = phi float [ %result.i112, %IF164 ], [ %temp87.0, %ENDIF160 ]
+ %temp92.7 = phi float [ %tmp683, %IF164 ], [ %temp92.6, %ENDIF160 ]
+ %temp93.1 = phi float [ %tmp684, %IF164 ], [ %temp93.0, %ENDIF160 ]
+ %temp94.1 = phi float [ %tmp685, %IF164 ], [ %temp94.0, %ENDIF160 ]
+ %tmp686 = fcmp oge float %tmp181, 2.300000e+03
+ %tmp687 = sext i1 %tmp686 to i32
+ %tmp688 = bitcast i32 %tmp687 to float
+ %tmp689 = fcmp olt float %tmp181, 2.480000e+03
+ %tmp690 = sext i1 %tmp689 to i32
+ %tmp691 = bitcast i32 %tmp690 to float
+ %tmp692 = bitcast float %tmp688 to i32
+ %tmp693 = bitcast float %tmp691 to i32
+ %tmp694 = and i32 %tmp692, %tmp693
+ %tmp695 = bitcast i32 %tmp694 to float
+ %tmp696 = bitcast float %tmp695 to i32
+ %tmp697 = icmp ne i32 %tmp696, 0
+ br i1 %tmp697, label %IF167, label %ENDIF166
IF167: ; preds = %ENDIF163
- %703 = fmul float %202, 5.000000e-01
- %704 = fcmp uge float 0x3FE4CCCCC0000000, %703
- %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703
- %706 = fcmp uge float %705, 0x3FD3333340000000
- %707 = select i1 %706, float 0x3FD3333340000000, float %705
- %708 = call float @llvm.AMDGPU.lrp(float %707, float %409, float %300)
- %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301)
- %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302)
- %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303)
- %712 = insertelement <4 x float> undef, float %329, i32 0
- %713 = insertelement <4 x float> %712, float %330, i32 1
- %714 = insertelement <4 x float> %713, float %331, i32 2
- %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3
- %716 = insertelement <4 x float> undef, float %63, i32 0
- %717 = insertelement <4 x float> %716, float %65, i32 1
- %718 = insertelement <4 x float> %717, float %67, i32 2
- %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3
- %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719)
- %721 = fcmp uge float 0x3FEB333340000000, %720
- %722 = select i1 %721, float 0x3FEB333340000000, float %720
- %723 = fmul float %8, %722
- %724 = fmul float %13, %722
- %725 = fmul float %18, %722
- %726 = insertelement <4 x float> undef, float %34, i32 0
- %727 = insertelement <4 x float> %726, float %35, i32 1
- %728 = insertelement <4 x float> %727, float %36, i32 2
- %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3
- %730 = insertelement <4 x float> undef, float %63, i32 0
- %731 = insertelement <4 x float> %730, float %65, i32 1
- %732 = insertelement <4 x float> %731, float %67, i32 2
- %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3
- %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733)
- %735 = fcmp uge float 0x3FECCCCCC0000000, %734
- %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734
- %737 = fmul float %723, %736
- %738 = fmul float %724, %736
- %739 = fmul float %725, %736
+ %tmp698 = fmul float %result.i, 5.000000e-01
+ %tmp699 = fcmp uge float 0x3FE4CCCCC0000000, %tmp698
+ %tmp700 = select i1 %tmp699, float 0x3FE4CCCCC0000000, float %tmp698
+ %tmp701 = fcmp uge float %tmp700, 0x3FD3333340000000
+ %tmp702 = select i1 %tmp701, float 0x3FD3333340000000, float %tmp700
+ %one.sub.a.i105 = fsub float 1.000000e+00, %tmp702
+ %one.sub.ac.i106 = fmul float %one.sub.a.i105, %tmp304
+ %mul.i107 = fmul float %result.i156, %tmp304
+ %result.i108 = fadd float %mul.i107, %one.sub.ac.i106
+ %one.sub.a.i101 = fsub float 1.000000e+00, %tmp702
+ %one.sub.ac.i102 = fmul float %one.sub.a.i101, %tmp305
+ %mul.i103 = fmul float %result.i152, %tmp305
+ %result.i104 = fadd float %mul.i103, %one.sub.ac.i102
+ %one.sub.a.i97 = fsub float 1.000000e+00, %tmp702
+ %one.sub.ac.i98 = fmul float %one.sub.a.i97, %tmp306
+ %mul.i99 = fmul float %result.i148, %tmp306
+ %result.i100 = fadd float %mul.i99, %one.sub.ac.i98
+ %one.sub.a.i93 = fsub float 1.000000e+00, %tmp702
+ %one.sub.ac.i94 = fmul float %one.sub.a.i93, %tmp307
+ %mul.i95 = fmul float %result.i144, %tmp307
+ %result.i96 = fadd float %mul.i95, %one.sub.ac.i94
+ %tmp703 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp704 = insertelement <4 x float> %tmp703, float %tmp334, i32 1
+ %tmp705 = insertelement <4 x float> %tmp704, float %tmp335, i32 2
+ %tmp706 = insertelement <4 x float> %tmp705, float 0.000000e+00, i32 3
+ %tmp707 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp708 = insertelement <4 x float> %tmp707, float %tmp65, i32 1
+ %tmp709 = insertelement <4 x float> %tmp708, float %tmp67, i32 2
+ %tmp710 = insertelement <4 x float> %tmp709, float 0.000000e+00, i32 3
+ %tmp711 = call float @llvm.r600.dot4(<4 x float> %tmp706, <4 x float> %tmp710)
+ %tmp712 = fcmp uge float 0x3FEB333340000000, %tmp711
+ %tmp713 = select i1 %tmp712, float 0x3FEB333340000000, float %tmp711
+ %tmp714 = fmul float %tmp8, %tmp713
+ %tmp715 = fmul float %tmp13, %tmp713
+ %tmp716 = fmul float %tmp18, %tmp713
+ %tmp717 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp718 = insertelement <4 x float> %tmp717, float %tmp35, i32 1
+ %tmp719 = insertelement <4 x float> %tmp718, float %tmp36, i32 2
+ %tmp720 = insertelement <4 x float> %tmp719, float 0.000000e+00, i32 3
+ %tmp721 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp722 = insertelement <4 x float> %tmp721, float %tmp65, i32 1
+ %tmp723 = insertelement <4 x float> %tmp722, float %tmp67, i32 2
+ %tmp724 = insertelement <4 x float> %tmp723, float 0.000000e+00, i32 3
+ %tmp725 = call float @llvm.r600.dot4(<4 x float> %tmp720, <4 x float> %tmp724)
+ %tmp726 = fcmp uge float 0x3FECCCCCC0000000, %tmp725
+ %tmp727 = select i1 %tmp726, float 0x3FECCCCCC0000000, float %tmp725
+ %tmp728 = fmul float %tmp714, %tmp727
+ %tmp729 = fmul float %tmp715, %tmp727
+ %tmp730 = fmul float %tmp716, %tmp727
br label %ENDIF166
-ENDIF166: ; preds = %ENDIF163, %IF167
- %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ]
- %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ]
- %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ]
- %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ]
- %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ]
- %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ]
- %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ]
- %740 = fcmp oge float %179, 2.480000e+03
- %741 = sext i1 %740 to i32
- %742 = bitcast i32 %741 to float
- %743 = fcmp olt float %179, 2.530000e+03
- %744 = sext i1 %743 to i32
- %745 = bitcast i32 %744 to float
- %746 = bitcast float %742 to i32
- %747 = bitcast float %745 to i32
- %748 = and i32 %746, %747
- %749 = bitcast i32 %748 to float
- %750 = bitcast float %749 to i32
- %751 = icmp ne i32 %750, 0
- br i1 %751, label %IF170, label %ENDIF169
+ENDIF166: ; preds = %IF167, %ENDIF163
+ %temp84.2 = phi float [ %result.i108, %IF167 ], [ %temp84.1, %ENDIF163 ]
+ %temp85.2 = phi float [ %result.i104, %IF167 ], [ %temp85.1, %ENDIF163 ]
+ %temp86.2 = phi float [ %result.i100, %IF167 ], [ %temp86.1, %ENDIF163 ]
+ %temp87.2 = phi float [ %result.i96, %IF167 ], [ %temp87.1, %ENDIF163 ]
+ %temp92.8 = phi float [ %tmp728, %IF167 ], [ %temp92.7, %ENDIF163 ]
+ %temp93.2 = phi float [ %tmp729, %IF167 ], [ %temp93.1, %ENDIF163 ]
+ %temp94.2 = phi float [ %tmp730, %IF167 ], [ %temp94.1, %ENDIF163 ]
+ %tmp731 = fcmp oge float %tmp181, 2.480000e+03
+ %tmp732 = sext i1 %tmp731 to i32
+ %tmp733 = bitcast i32 %tmp732 to float
+ %tmp734 = fcmp olt float %tmp181, 2.530000e+03
+ %tmp735 = sext i1 %tmp734 to i32
+ %tmp736 = bitcast i32 %tmp735 to float
+ %tmp737 = bitcast float %tmp733 to i32
+ %tmp738 = bitcast float %tmp736 to i32
+ %tmp739 = and i32 %tmp737, %tmp738
+ %tmp740 = bitcast i32 %tmp739 to float
+ %tmp741 = bitcast float %tmp740 to i32
+ %tmp742 = icmp ne i32 %tmp741, 0
+ br i1 %tmp742, label %IF170, label %ENDIF169
IF170: ; preds = %ENDIF166
- %752 = fmul float %202, 5.000000e-01
- %753 = fcmp uge float 0x3FE4CCCCC0000000, %752
- %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752
- %755 = fcmp uge float %754, 0x3FC99999A0000000
- %756 = select i1 %755, float 0x3FC99999A0000000, float %754
- %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300)
- %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301)
- %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302)
- %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303)
- %761 = insertelement <4 x float> undef, float %329, i32 0
- %762 = insertelement <4 x float> %761, float %330, i32 1
- %763 = insertelement <4 x float> %762, float %331, i32 2
- %764 = insertelement <4 x float> %763, float 0.000000e+00, i32 3
- %765 = insertelement <4 x float> undef, float %63, i32 0
- %766 = insertelement <4 x float> %765, float %65, i32 1
- %767 = insertelement <4 x float> %766, float %67, i32 2
- %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3
- %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768)
- %770 = fcmp uge float 0x3FEB333340000000, %769
- %771 = select i1 %770, float 0x3FEB333340000000, float %769
- %772 = fmul float %8, %771
- %773 = fmul float %13, %771
- %774 = fmul float %18, %771
- %775 = insertelement <4 x float> undef, float %34, i32 0
- %776 = insertelement <4 x float> %775, float %35, i32 1
- %777 = insertelement <4 x float> %776, float %36, i32 2
- %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3
- %779 = insertelement <4 x float> undef, float %63, i32 0
- %780 = insertelement <4 x float> %779, float %65, i32 1
- %781 = insertelement <4 x float> %780, float %67, i32 2
- %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3
- %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782)
- %784 = fcmp uge float 0x3FECCCCCC0000000, %783
- %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783
- %786 = fmul float %772, %785
- %787 = fmul float %773, %785
- %788 = fmul float %774, %785
+ %tmp743 = fmul float %result.i, 5.000000e-01
+ %tmp744 = fcmp uge float 0x3FE4CCCCC0000000, %tmp743
+ %tmp745 = select i1 %tmp744, float 0x3FE4CCCCC0000000, float %tmp743
+ %tmp746 = fcmp uge float %tmp745, 0x3FC99999A0000000
+ %tmp747 = select i1 %tmp746, float 0x3FC99999A0000000, float %tmp745
+ %one.sub.a.i89 = fsub float 1.000000e+00, %tmp747
+ %one.sub.ac.i90 = fmul float %one.sub.a.i89, %tmp304
+ %mul.i91 = fmul float %result.i156, %tmp304
+ %result.i92 = fadd float %mul.i91, %one.sub.ac.i90
+ %one.sub.a.i85 = fsub float 1.000000e+00, %tmp747
+ %one.sub.ac.i86 = fmul float %one.sub.a.i85, %tmp305
+ %mul.i87 = fmul float %result.i152, %tmp305
+ %result.i88 = fadd float %mul.i87, %one.sub.ac.i86
+ %one.sub.a.i81 = fsub float 1.000000e+00, %tmp747
+ %one.sub.ac.i82 = fmul float %one.sub.a.i81, %tmp306
+ %mul.i83 = fmul float %result.i148, %tmp306
+ %result.i84 = fadd float %mul.i83, %one.sub.ac.i82
+ %one.sub.a.i77 = fsub float 1.000000e+00, %tmp747
+ %one.sub.ac.i78 = fmul float %one.sub.a.i77, %tmp307
+ %mul.i79 = fmul float %result.i144, %tmp307
+ %result.i80 = fadd float %mul.i79, %one.sub.ac.i78
+ %tmp748 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp749 = insertelement <4 x float> %tmp748, float %tmp334, i32 1
+ %tmp750 = insertelement <4 x float> %tmp749, float %tmp335, i32 2
+ %tmp751 = insertelement <4 x float> %tmp750, float 0.000000e+00, i32 3
+ %tmp752 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp753 = insertelement <4 x float> %tmp752, float %tmp65, i32 1
+ %tmp754 = insertelement <4 x float> %tmp753, float %tmp67, i32 2
+ %tmp755 = insertelement <4 x float> %tmp754, float 0.000000e+00, i32 3
+ %tmp756 = call float @llvm.r600.dot4(<4 x float> %tmp751, <4 x float> %tmp755)
+ %tmp757 = fcmp uge float 0x3FEB333340000000, %tmp756
+ %tmp758 = select i1 %tmp757, float 0x3FEB333340000000, float %tmp756
+ %tmp759 = fmul float %tmp8, %tmp758
+ %tmp760 = fmul float %tmp13, %tmp758
+ %tmp761 = fmul float %tmp18, %tmp758
+ %tmp762 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp763 = insertelement <4 x float> %tmp762, float %tmp35, i32 1
+ %tmp764 = insertelement <4 x float> %tmp763, float %tmp36, i32 2
+ %tmp765 = insertelement <4 x float> %tmp764, float 0.000000e+00, i32 3
+ %tmp766 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp767 = insertelement <4 x float> %tmp766, float %tmp65, i32 1
+ %tmp768 = insertelement <4 x float> %tmp767, float %tmp67, i32 2
+ %tmp769 = insertelement <4 x float> %tmp768, float 0.000000e+00, i32 3
+ %tmp770 = call float @llvm.r600.dot4(<4 x float> %tmp765, <4 x float> %tmp769)
+ %tmp771 = fcmp uge float 0x3FECCCCCC0000000, %tmp770
+ %tmp772 = select i1 %tmp771, float 0x3FECCCCCC0000000, float %tmp770
+ %tmp773 = fmul float %tmp759, %tmp772
+ %tmp774 = fmul float %tmp760, %tmp772
+ %tmp775 = fmul float %tmp761, %tmp772
br label %ENDIF169
-ENDIF169: ; preds = %ENDIF166, %IF170
- %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ]
- %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ]
- %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ]
- %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ]
- %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ]
- %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ]
- %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ]
- %789 = fcmp oge float %179, 2.530000e+03
- %790 = sext i1 %789 to i32
- %791 = bitcast i32 %790 to float
- %792 = fcmp olt float %179, 2.670000e+03
- %793 = sext i1 %792 to i32
- %794 = bitcast i32 %793 to float
- %795 = bitcast float %791 to i32
- %796 = bitcast float %794 to i32
- %797 = and i32 %795, %796
- %798 = bitcast i32 %797 to float
- %799 = bitcast float %798 to i32
- %800 = icmp ne i32 %799, 0
- br i1 %800, label %IF173, label %ENDIF172
+ENDIF169: ; preds = %IF170, %ENDIF166
+ %temp84.3 = phi float [ %result.i92, %IF170 ], [ %temp84.2, %ENDIF166 ]
+ %temp85.3 = phi float [ %result.i88, %IF170 ], [ %temp85.2, %ENDIF166 ]
+ %temp86.3 = phi float [ %result.i84, %IF170 ], [ %temp86.2, %ENDIF166 ]
+ %temp87.3 = phi float [ %result.i80, %IF170 ], [ %temp87.2, %ENDIF166 ]
+ %temp92.9 = phi float [ %tmp773, %IF170 ], [ %temp92.8, %ENDIF166 ]
+ %temp93.3 = phi float [ %tmp774, %IF170 ], [ %temp93.2, %ENDIF166 ]
+ %temp94.3 = phi float [ %tmp775, %IF170 ], [ %temp94.2, %ENDIF166 ]
+ %tmp776 = fcmp oge float %tmp181, 2.530000e+03
+ %tmp777 = sext i1 %tmp776 to i32
+ %tmp778 = bitcast i32 %tmp777 to float
+ %tmp779 = fcmp olt float %tmp181, 2.670000e+03
+ %tmp780 = sext i1 %tmp779 to i32
+ %tmp781 = bitcast i32 %tmp780 to float
+ %tmp782 = bitcast float %tmp778 to i32
+ %tmp783 = bitcast float %tmp781 to i32
+ %tmp784 = and i32 %tmp782, %tmp783
+ %tmp785 = bitcast i32 %tmp784 to float
+ %tmp786 = bitcast float %tmp785 to i32
+ %tmp787 = icmp ne i32 %tmp786, 0
+ br i1 %tmp787, label %IF173, label %ENDIF172
IF173: ; preds = %ENDIF169
- %801 = fmul float %202, 5.000000e-01
- %802 = fcmp uge float 0x3FE4CCCCC0000000, %801
- %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801
- %804 = fcmp uge float %803, 0x3FB99999A0000000
- %805 = select i1 %804, float 0x3FB99999A0000000, float %803
- %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300)
- %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301)
- %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302)
- %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303)
- %810 = insertelement <4 x float> undef, float %329, i32 0
- %811 = insertelement <4 x float> %810, float %330, i32 1
- %812 = insertelement <4 x float> %811, float %331, i32 2
- %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3
- %814 = insertelement <4 x float> undef, float %63, i32 0
- %815 = insertelement <4 x float> %814, float %65, i32 1
- %816 = insertelement <4 x float> %815, float %67, i32 2
- %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3
- %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817)
- %819 = fcmp uge float 0x3FEB333340000000, %818
- %820 = select i1 %819, float 0x3FEB333340000000, float %818
- %821 = fmul float %8, %820
- %822 = fmul float %13, %820
- %823 = fmul float %18, %820
- %824 = insertelement <4 x float> undef, float %34, i32 0
- %825 = insertelement <4 x float> %824, float %35, i32 1
- %826 = insertelement <4 x float> %825, float %36, i32 2
- %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3
- %828 = insertelement <4 x float> undef, float %63, i32 0
- %829 = insertelement <4 x float> %828, float %65, i32 1
- %830 = insertelement <4 x float> %829, float %67, i32 2
- %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3
- %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831)
- %833 = fcmp uge float 0x3FECCCCCC0000000, %832
- %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832
- %835 = fmul float %821, %834
- %836 = fmul float %822, %834
- %837 = fmul float %823, %834
+ %tmp788 = fmul float %result.i, 5.000000e-01
+ %tmp789 = fcmp uge float 0x3FE4CCCCC0000000, %tmp788
+ %tmp790 = select i1 %tmp789, float 0x3FE4CCCCC0000000, float %tmp788
+ %tmp791 = fcmp uge float %tmp790, 0x3FB99999A0000000
+ %tmp792 = select i1 %tmp791, float 0x3FB99999A0000000, float %tmp790
+ %one.sub.a.i73 = fsub float 1.000000e+00, %tmp792
+ %one.sub.ac.i74 = fmul float %one.sub.a.i73, %tmp304
+ %mul.i75 = fmul float %result.i172, %tmp304
+ %result.i76 = fadd float %mul.i75, %one.sub.ac.i74
+ %one.sub.a.i69 = fsub float 1.000000e+00, %tmp792
+ %one.sub.ac.i70 = fmul float %one.sub.a.i69, %tmp305
+ %mul.i71 = fmul float %result.i168, %tmp305
+ %result.i72 = fadd float %mul.i71, %one.sub.ac.i70
+ %one.sub.a.i65 = fsub float 1.000000e+00, %tmp792
+ %one.sub.ac.i66 = fmul float %one.sub.a.i65, %tmp306
+ %mul.i67 = fmul float %result.i164, %tmp306
+ %result.i68 = fadd float %mul.i67, %one.sub.ac.i66
+ %one.sub.a.i61 = fsub float 1.000000e+00, %tmp792
+ %one.sub.ac.i62 = fmul float %one.sub.a.i61, %tmp307
+ %mul.i63 = fmul float %result.i160, %tmp307
+ %result.i64 = fadd float %mul.i63, %one.sub.ac.i62
+ %tmp793 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp794 = insertelement <4 x float> %tmp793, float %tmp334, i32 1
+ %tmp795 = insertelement <4 x float> %tmp794, float %tmp335, i32 2
+ %tmp796 = insertelement <4 x float> %tmp795, float 0.000000e+00, i32 3
+ %tmp797 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp798 = insertelement <4 x float> %tmp797, float %tmp65, i32 1
+ %tmp799 = insertelement <4 x float> %tmp798, float %tmp67, i32 2
+ %tmp800 = insertelement <4 x float> %tmp799, float 0.000000e+00, i32 3
+ %tmp801 = call float @llvm.r600.dot4(<4 x float> %tmp796, <4 x float> %tmp800)
+ %tmp802 = fcmp uge float 0x3FEB333340000000, %tmp801
+ %tmp803 = select i1 %tmp802, float 0x3FEB333340000000, float %tmp801
+ %tmp804 = fmul float %tmp8, %tmp803
+ %tmp805 = fmul float %tmp13, %tmp803
+ %tmp806 = fmul float %tmp18, %tmp803
+ %tmp807 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp808 = insertelement <4 x float> %tmp807, float %tmp35, i32 1
+ %tmp809 = insertelement <4 x float> %tmp808, float %tmp36, i32 2
+ %tmp810 = insertelement <4 x float> %tmp809, float 0.000000e+00, i32 3
+ %tmp811 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp812 = insertelement <4 x float> %tmp811, float %tmp65, i32 1
+ %tmp813 = insertelement <4 x float> %tmp812, float %tmp67, i32 2
+ %tmp814 = insertelement <4 x float> %tmp813, float 0.000000e+00, i32 3
+ %tmp815 = call float @llvm.r600.dot4(<4 x float> %tmp810, <4 x float> %tmp814)
+ %tmp816 = fcmp uge float 0x3FECCCCCC0000000, %tmp815
+ %tmp817 = select i1 %tmp816, float 0x3FECCCCCC0000000, float %tmp815
+ %tmp818 = fmul float %tmp804, %tmp817
+ %tmp819 = fmul float %tmp805, %tmp817
+ %tmp820 = fmul float %tmp806, %tmp817
br label %ENDIF172
-ENDIF172: ; preds = %ENDIF169, %IF173
- %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ]
- %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ]
- %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ]
- %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ]
- %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ]
- %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ]
- %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ]
- %838 = fcmp oge float %179, 2.670000e+03
- %839 = sext i1 %838 to i32
- %840 = bitcast i32 %839 to float
- %841 = bitcast float %840 to i32
- %842 = icmp ne i32 %841, 0
- br i1 %842, label %IF176, label %ENDIF175
+ENDIF172: ; preds = %IF173, %ENDIF169
+ %temp84.4 = phi float [ %result.i76, %IF173 ], [ %temp84.3, %ENDIF169 ]
+ %temp85.4 = phi float [ %result.i72, %IF173 ], [ %temp85.3, %ENDIF169 ]
+ %temp86.4 = phi float [ %result.i68, %IF173 ], [ %temp86.3, %ENDIF169 ]
+ %temp87.4 = phi float [ %result.i64, %IF173 ], [ %temp87.3, %ENDIF169 ]
+ %temp92.10 = phi float [ %tmp818, %IF173 ], [ %temp92.9, %ENDIF169 ]
+ %temp93.4 = phi float [ %tmp819, %IF173 ], [ %temp93.3, %ENDIF169 ]
+ %temp94.4 = phi float [ %tmp820, %IF173 ], [ %temp94.3, %ENDIF169 ]
+ %tmp821 = fcmp oge float %tmp181, 2.670000e+03
+ %tmp822 = sext i1 %tmp821 to i32
+ %tmp823 = bitcast i32 %tmp822 to float
+ %tmp824 = bitcast float %tmp823 to i32
+ %tmp825 = icmp ne i32 %tmp824, 0
+ br i1 %tmp825, label %IF176, label %ENDIF175
IF176: ; preds = %ENDIF172
- %843 = fmul float %202, 0x3FB99999A0000000
- %844 = fcmp uge float 0.000000e+00, %843
- %845 = select i1 %844, float 0.000000e+00, float %843
- %846 = fcmp uge float %845, 0x3FD99999A0000000
- %847 = select i1 %846, float 0x3FD99999A0000000, float %845
- %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300)
- %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301)
- %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302)
- %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303)
- %852 = insertelement <4 x float> undef, float %329, i32 0
- %853 = insertelement <4 x float> %852, float %330, i32 1
- %854 = insertelement <4 x float> %853, float %331, i32 2
- %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3
- %856 = insertelement <4 x float> undef, float %63, i32 0
- %857 = insertelement <4 x float> %856, float %65, i32 1
- %858 = insertelement <4 x float> %857, float %67, i32 2
- %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3
- %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859)
- %861 = fcmp uge float 0x3FEB333340000000, %860
- %862 = select i1 %861, float 0x3FEB333340000000, float %860
- %863 = fmul float %8, %862
- %864 = fmul float %13, %862
- %865 = fmul float %18, %862
- %866 = insertelement <4 x float> undef, float %34, i32 0
- %867 = insertelement <4 x float> %866, float %35, i32 1
- %868 = insertelement <4 x float> %867, float %36, i32 2
- %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3
- %870 = insertelement <4 x float> undef, float %63, i32 0
- %871 = insertelement <4 x float> %870, float %65, i32 1
- %872 = insertelement <4 x float> %871, float %67, i32 2
- %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3
- %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873)
- %875 = fcmp uge float 0x3FECCCCCC0000000, %874
- %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874
- %877 = fmul float %863, %876
- %878 = fmul float %864, %876
- %879 = fmul float %865, %876
+ %tmp826 = fmul float %result.i, 0x3FB99999A0000000
+ %tmp827 = fcmp uge float 0.000000e+00, %tmp826
+ %tmp828 = select i1 %tmp827, float 0.000000e+00, float %tmp826
+ %tmp829 = fcmp uge float %tmp828, 0x3FD99999A0000000
+ %tmp830 = select i1 %tmp829, float 0x3FD99999A0000000, float %tmp828
+ %one.sub.a.i57 = fsub float 1.000000e+00, %tmp830
+ %one.sub.ac.i58 = fmul float %one.sub.a.i57, %tmp304
+ %mul.i59 = fmul float %result.i172, %tmp304
+ %result.i60 = fadd float %mul.i59, %one.sub.ac.i58
+ %one.sub.a.i53 = fsub float 1.000000e+00, %tmp830
+ %one.sub.ac.i54 = fmul float %one.sub.a.i53, %tmp305
+ %mul.i55 = fmul float %result.i168, %tmp305
+ %result.i56 = fadd float %mul.i55, %one.sub.ac.i54
+ %one.sub.a.i49 = fsub float 1.000000e+00, %tmp830
+ %one.sub.ac.i50 = fmul float %one.sub.a.i49, %tmp306
+ %mul.i51 = fmul float %result.i164, %tmp306
+ %result.i52 = fadd float %mul.i51, %one.sub.ac.i50
+ %one.sub.a.i45 = fsub float 1.000000e+00, %tmp830
+ %one.sub.ac.i46 = fmul float %one.sub.a.i45, %tmp307
+ %mul.i47 = fmul float %result.i160, %tmp307
+ %result.i48 = fadd float %mul.i47, %one.sub.ac.i46
+ %tmp831 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp832 = insertelement <4 x float> %tmp831, float %tmp334, i32 1
+ %tmp833 = insertelement <4 x float> %tmp832, float %tmp335, i32 2
+ %tmp834 = insertelement <4 x float> %tmp833, float 0.000000e+00, i32 3
+ %tmp835 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp836 = insertelement <4 x float> %tmp835, float %tmp65, i32 1
+ %tmp837 = insertelement <4 x float> %tmp836, float %tmp67, i32 2
+ %tmp838 = insertelement <4 x float> %tmp837, float 0.000000e+00, i32 3
+ %tmp839 = call float @llvm.r600.dot4(<4 x float> %tmp834, <4 x float> %tmp838)
+ %tmp840 = fcmp uge float 0x3FEB333340000000, %tmp839
+ %tmp841 = select i1 %tmp840, float 0x3FEB333340000000, float %tmp839
+ %tmp842 = fmul float %tmp8, %tmp841
+ %tmp843 = fmul float %tmp13, %tmp841
+ %tmp844 = fmul float %tmp18, %tmp841
+ %tmp845 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp846 = insertelement <4 x float> %tmp845, float %tmp35, i32 1
+ %tmp847 = insertelement <4 x float> %tmp846, float %tmp36, i32 2
+ %tmp848 = insertelement <4 x float> %tmp847, float 0.000000e+00, i32 3
+ %tmp849 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp850 = insertelement <4 x float> %tmp849, float %tmp65, i32 1
+ %tmp851 = insertelement <4 x float> %tmp850, float %tmp67, i32 2
+ %tmp852 = insertelement <4 x float> %tmp851, float 0.000000e+00, i32 3
+ %tmp853 = call float @llvm.r600.dot4(<4 x float> %tmp848, <4 x float> %tmp852)
+ %tmp854 = fcmp uge float 0x3FECCCCCC0000000, %tmp853
+ %tmp855 = select i1 %tmp854, float 0x3FECCCCCC0000000, float %tmp853
+ %tmp856 = fmul float %tmp842, %tmp855
+ %tmp857 = fmul float %tmp843, %tmp855
+ %tmp858 = fmul float %tmp844, %tmp855
br label %ENDIF175
-ENDIF175: ; preds = %ENDIF172, %IF176
- %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ]
- %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ]
- %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ]
- %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ]
- %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ]
- %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ]
- %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ]
- %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
- %881 = extractelement <4 x float> %880, i32 0
- %882 = fcmp olt float %881, %179
- %883 = sext i1 %882 to i32
- %884 = bitcast i32 %883 to float
- %885 = bitcast float %884 to i32
- %886 = icmp ne i32 %885, 0
- br i1 %886, label %IF179, label %ENDIF178
+ENDIF175: ; preds = %IF176, %ENDIF172
+ %temp84.5 = phi float [ %result.i60, %IF176 ], [ %temp84.4, %ENDIF172 ]
+ %temp85.5 = phi float [ %result.i56, %IF176 ], [ %temp85.4, %ENDIF172 ]
+ %temp86.5 = phi float [ %result.i52, %IF176 ], [ %temp86.4, %ENDIF172 ]
+ %temp87.5 = phi float [ %result.i48, %IF176 ], [ %temp87.4, %ENDIF172 ]
+ %temp92.11 = phi float [ %tmp856, %IF176 ], [ %temp92.10, %ENDIF172 ]
+ %temp93.5 = phi float [ %tmp857, %IF176 ], [ %temp93.4, %ENDIF172 ]
+ %temp94.5 = phi float [ %tmp858, %IF176 ], [ %temp94.4, %ENDIF172 ]
+ %tmp859 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %tmp860 = extractelement <4 x float> %tmp859, i32 0
+ %tmp861 = fcmp olt float %tmp860, %tmp181
+ %tmp862 = sext i1 %tmp861 to i32
+ %tmp863 = bitcast i32 %tmp862 to float
+ %tmp864 = bitcast float %tmp863 to i32
+ %tmp865 = icmp ne i32 %tmp864, 0
+ br i1 %tmp865, label %IF179, label %ENDIF178
IF179: ; preds = %ENDIF175
- %887 = fadd float %202, 1.000000e+00
- %888 = fadd float %202, 1.000000e+00
- %889 = fadd float %202, 1.000000e+00
- %890 = insertelement <4 x float> undef, float %43, i32 0
- %891 = insertelement <4 x float> %890, float %44, i32 1
- %892 = insertelement <4 x float> %891, float %45, i32 2
- %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3
- %894 = insertelement <4 x float> undef, float %43, i32 0
- %895 = insertelement <4 x float> %894, float %44, i32 1
- %896 = insertelement <4 x float> %895, float %45, i32 2
- %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3
- %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897)
- %899 = call float @llvm.AMDGPU.rsq.f32(float %898)
- %900 = fmul float %45, %899
- %901 = call float @fabs(float %900)
- %902 = fmul float %176, 0x3FECCCCCC0000000
- %903 = fadd float %902, %901
- %904 = fadd float %903, 0xBFEFAE1480000000
- %905 = fmul float %904, 0xC043FFFE20000000
- %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00)
- %907 = fmul float 2.000000e+00, %906
- %908 = fsub float -0.000000e+00, %907
- %909 = fadd float 3.000000e+00, %908
- %910 = fmul float %906, %909
- %911 = fmul float %906, %910
- %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887)
- %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888)
- %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889)
- %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00)
- %916 = fmul float %202, 5.000000e-01
- %917 = fcmp uge float 0x3FE4CCCCC0000000, %916
- %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916
- %919 = fcmp uge float %918, 0x3FE3333340000000
- %920 = select i1 %919, float 0x3FE3333340000000, float %918
- %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5)
- %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5)
- %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5)
- %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5)
- %925 = insertelement <4 x float> undef, float %329, i32 0
- %926 = insertelement <4 x float> %925, float %330, i32 1
- %927 = insertelement <4 x float> %926, float %331, i32 2
- %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3
- %929 = insertelement <4 x float> undef, float %63, i32 0
- %930 = insertelement <4 x float> %929, float %65, i32 1
- %931 = insertelement <4 x float> %930, float %67, i32 2
- %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3
- %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932)
- %934 = fcmp uge float 0x3FE99999A0000000, %933
- %935 = select i1 %934, float 0x3FE99999A0000000, float %933
- %936 = fmul float %8, %935
- %937 = fmul float %13, %935
- %938 = fmul float %18, %935
- %939 = insertelement <4 x float> undef, float %34, i32 0
- %940 = insertelement <4 x float> %939, float %35, i32 1
- %941 = insertelement <4 x float> %940, float %36, i32 2
- %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3
- %943 = insertelement <4 x float> undef, float %63, i32 0
- %944 = insertelement <4 x float> %943, float %65, i32 1
- %945 = insertelement <4 x float> %944, float %67, i32 2
- %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3
- %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946)
- %948 = fcmp uge float 0x3FECCCCCC0000000, %947
- %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947
- %950 = fmul float %936, %949
- %951 = fmul float %937, %949
- %952 = fmul float %938, %949
+ %tmp866 = fadd float %result.i, 1.000000e+00
+ %tmp867 = fadd float %result.i, 1.000000e+00
+ %tmp868 = fadd float %result.i, 1.000000e+00
+ %tmp869 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp870 = insertelement <4 x float> %tmp869, float %tmp44, i32 1
+ %tmp871 = insertelement <4 x float> %tmp870, float %tmp45, i32 2
+ %tmp872 = insertelement <4 x float> %tmp871, float 0.000000e+00, i32 3
+ %tmp873 = insertelement <4 x float> undef, float %tmp43, i32 0
+ %tmp874 = insertelement <4 x float> %tmp873, float %tmp44, i32 1
+ %tmp875 = insertelement <4 x float> %tmp874, float %tmp45, i32 2
+ %tmp876 = insertelement <4 x float> %tmp875, float 0.000000e+00, i32 3
+ %tmp877 = call float @llvm.r600.dot4(<4 x float> %tmp872, <4 x float> %tmp876)
+ %tmp878 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp877)
+ %tmp879 = fmul float %tmp45, %tmp878
+ %tmp880 = call float @llvm.fabs.f32(float %tmp879)
+ %tmp881 = fmul float %tmp178, 0x3FECCCCCC0000000
+ %tmp882 = fadd float %tmp881, %tmp880
+ %tmp883 = fadd float %tmp882, 0xBFEFAE1480000000
+ %tmp884 = fmul float %tmp883, 0xC043FFFE20000000
+ %tmp885 = call float @llvm.AMDGPU.clamp.f32(float %tmp884, float 0.000000e+00, float 1.000000e+00)
+ %tmp886 = fmul float 2.000000e+00, %tmp885
+ %tmp887 = fsub float -0.000000e+00, %tmp886
+ %tmp888 = fadd float 3.000000e+00, %tmp887
+ %tmp889 = fmul float %tmp885, %tmp888
+ %tmp890 = fmul float %tmp885, %tmp889
+ %one.sub.a.i41 = fsub float 1.000000e+00, %tmp890
+ %one.sub.ac.i42 = fmul float %one.sub.a.i41, %tmp866
+ %mul.i43 = fmul float %temp84.5, %tmp866
+ %result.i44 = fadd float %mul.i43, %one.sub.ac.i42
+ %one.sub.a.i37 = fsub float 1.000000e+00, %tmp890
+ %one.sub.ac.i38 = fmul float %one.sub.a.i37, %tmp867
+ %mul.i39 = fmul float %temp85.5, %tmp867
+ %result.i40 = fadd float %mul.i39, %one.sub.ac.i38
+ %one.sub.a.i33 = fsub float 1.000000e+00, %tmp890
+ %one.sub.ac.i34 = fmul float %one.sub.a.i33, %tmp868
+ %mul.i35 = fmul float %temp86.5, %tmp868
+ %result.i36 = fadd float %mul.i35, %one.sub.ac.i34
+ %one.sub.a.i29 = fsub float 1.000000e+00, %tmp890
+ %one.sub.ac.i30 = fmul float %one.sub.a.i29, 0.000000e+00
+ %mul.i31 = fmul float %temp87.5, 0.000000e+00
+ %result.i32 = fadd float %mul.i31, %one.sub.ac.i30
+ %tmp891 = fmul float %result.i, 5.000000e-01
+ %tmp892 = fcmp uge float 0x3FE4CCCCC0000000, %tmp891
+ %tmp893 = select i1 %tmp892, float 0x3FE4CCCCC0000000, float %tmp891
+ %tmp894 = fcmp uge float %tmp893, 0x3FE3333340000000
+ %tmp895 = select i1 %tmp894, float 0x3FE3333340000000, float %tmp893
+ %one.sub.a.i25 = fsub float 1.000000e+00, %tmp895
+ %one.sub.ac.i26 = fmul float %one.sub.a.i25, %temp84.5
+ %mul.i27 = fmul float %result.i44, %temp84.5
+ %result.i28 = fadd float %mul.i27, %one.sub.ac.i26
+ %one.sub.a.i21 = fsub float 1.000000e+00, %tmp895
+ %one.sub.ac.i22 = fmul float %one.sub.a.i21, %temp85.5
+ %mul.i23 = fmul float %result.i40, %temp85.5
+ %result.i24 = fadd float %mul.i23, %one.sub.ac.i22
+ %one.sub.a.i17 = fsub float 1.000000e+00, %tmp895
+ %one.sub.ac.i18 = fmul float %one.sub.a.i17, %temp86.5
+ %mul.i19 = fmul float %result.i36, %temp86.5
+ %result.i20 = fadd float %mul.i19, %one.sub.ac.i18
+ %one.sub.a.i13 = fsub float 1.000000e+00, %tmp895
+ %one.sub.ac.i14 = fmul float %one.sub.a.i13, %temp87.5
+ %mul.i15 = fmul float %result.i32, %temp87.5
+ %result.i16 = fadd float %mul.i15, %one.sub.ac.i14
+ %tmp896 = insertelement <4 x float> undef, float %tmp333, i32 0
+ %tmp897 = insertelement <4 x float> %tmp896, float %tmp334, i32 1
+ %tmp898 = insertelement <4 x float> %tmp897, float %tmp335, i32 2
+ %tmp899 = insertelement <4 x float> %tmp898, float 0.000000e+00, i32 3
+ %tmp900 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp901 = insertelement <4 x float> %tmp900, float %tmp65, i32 1
+ %tmp902 = insertelement <4 x float> %tmp901, float %tmp67, i32 2
+ %tmp903 = insertelement <4 x float> %tmp902, float 0.000000e+00, i32 3
+ %tmp904 = call float @llvm.r600.dot4(<4 x float> %tmp899, <4 x float> %tmp903)
+ %tmp905 = fcmp uge float 0x3FE99999A0000000, %tmp904
+ %tmp906 = select i1 %tmp905, float 0x3FE99999A0000000, float %tmp904
+ %tmp907 = fmul float %tmp8, %tmp906
+ %tmp908 = fmul float %tmp13, %tmp906
+ %tmp909 = fmul float %tmp18, %tmp906
+ %tmp910 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp911 = insertelement <4 x float> %tmp910, float %tmp35, i32 1
+ %tmp912 = insertelement <4 x float> %tmp911, float %tmp36, i32 2
+ %tmp913 = insertelement <4 x float> %tmp912, float 0.000000e+00, i32 3
+ %tmp914 = insertelement <4 x float> undef, float %tmp63, i32 0
+ %tmp915 = insertelement <4 x float> %tmp914, float %tmp65, i32 1
+ %tmp916 = insertelement <4 x float> %tmp915, float %tmp67, i32 2
+ %tmp917 = insertelement <4 x float> %tmp916, float 0.000000e+00, i32 3
+ %tmp918 = call float @llvm.r600.dot4(<4 x float> %tmp913, <4 x float> %tmp917)
+ %tmp919 = fcmp uge float 0x3FECCCCCC0000000, %tmp918
+ %tmp920 = select i1 %tmp919, float 0x3FECCCCCC0000000, float %tmp918
+ %tmp921 = fmul float %tmp907, %tmp920
+ %tmp922 = fmul float %tmp908, %tmp920
+ %tmp923 = fmul float %tmp909, %tmp920
br label %ENDIF178
-ENDIF178: ; preds = %ENDIF175, %IF179
- %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ]
- %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ]
- %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ]
- %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ]
- %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ]
- %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ]
- %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ]
- %953 = fmul float %55, %temp92.12
- %954 = fmul float %57, %temp93.6
- %955 = fmul float %59, %temp94.6
- %956 = fmul float %61, 0.000000e+00
- %957 = fmul float %temp84.6, %953
- %958 = fmul float %temp85.6, %954
- %959 = fmul float %temp86.6, %955
- %960 = fmul float %temp87.6, %956
- %961 = fmul float %2, -2.000000e+00
- %962 = fadd float %961, 1.000000e+00
- %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
- %964 = extractelement <4 x float> %963, i32 2
- %965 = fsub float -0.000000e+00, %964
- %966 = fadd float %962, %965
- %967 = fdiv float 1.000000e+00, %966
- %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
- %969 = extractelement <4 x float> %968, i32 2
- %970 = fmul float %969, %967
- %971 = fsub float -0.000000e+00, %53
- %972 = fmul float %971, %53
- %973 = fmul float %972, %970
- %974 = fmul float %973, %970
- %975 = fmul float %974, 0x3FF7154760000000
- %976 = call float @llvm.AMDIL.exp.(float %975)
- %977 = fcmp oeq float %53, 1.000000e+00
- %978 = sext i1 %977 to i32
- %979 = bitcast i32 %978 to float
- %980 = bitcast float %979 to i32
- %981 = icmp ne i32 %980, 0
- %.184 = select i1 %981, float 1.000000e+00, float %976
- %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47)
- %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49)
- %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51)
- %985 = insertelement <4 x float> undef, float %982, i32 0
- %986 = insertelement <4 x float> %985, float %983, i32 1
- %987 = insertelement <4 x float> %986, float %984, i32 2
- %988 = insertelement <4 x float> %987, float %960, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0)
+ENDIF178: ; preds = %IF179, %ENDIF175
+ %temp84.6 = phi float [ %result.i28, %IF179 ], [ %temp84.5, %ENDIF175 ]
+ %temp85.6 = phi float [ %result.i24, %IF179 ], [ %temp85.5, %ENDIF175 ]
+ %temp86.6 = phi float [ %result.i20, %IF179 ], [ %temp86.5, %ENDIF175 ]
+ %temp87.6 = phi float [ %result.i16, %IF179 ], [ %temp87.5, %ENDIF175 ]
+ %temp92.12 = phi float [ %tmp921, %IF179 ], [ %temp92.11, %ENDIF175 ]
+ %temp93.6 = phi float [ %tmp922, %IF179 ], [ %temp93.5, %ENDIF175 ]
+ %temp94.6 = phi float [ %tmp923, %IF179 ], [ %temp94.5, %ENDIF175 ]
+ %tmp924 = fmul float %tmp55, %temp92.12
+ %tmp925 = fmul float %tmp57, %temp93.6
+ %tmp926 = fmul float %tmp59, %temp94.6
+ %tmp927 = fmul float %tmp61, 0.000000e+00
+ %tmp928 = fmul float %temp84.6, %tmp924
+ %tmp929 = fmul float %temp85.6, %tmp925
+ %tmp930 = fmul float %temp86.6, %tmp926
+ %tmp931 = fmul float %temp87.6, %tmp927
+ %tmp932 = fmul float %tmp2, -2.000000e+00
+ %tmp933 = fadd float %tmp932, 1.000000e+00
+ %tmp934 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
+ %tmp935 = extractelement <4 x float> %tmp934, i32 2
+ %tmp936 = fsub float -0.000000e+00, %tmp935
+ %tmp937 = fadd float %tmp933, %tmp936
+ %tmp938 = fdiv float 1.000000e+00, %tmp937
+ %tmp939 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
+ %tmp940 = extractelement <4 x float> %tmp939, i32 2
+ %tmp941 = fmul float %tmp940, %tmp938
+ %tmp942 = fsub float -0.000000e+00, %tmp53
+ %tmp943 = fmul float %tmp942, %tmp53
+ %tmp944 = fmul float %tmp943, %tmp941
+ %tmp945 = fmul float %tmp944, %tmp941
+ %tmp946 = fmul float %tmp945, 0x3FF7154760000000
+ %tmp947 = call float @llvm.exp2.f32(float %tmp946)
+ %tmp948 = fcmp oeq float %tmp53, 1.000000e+00
+ %tmp949 = sext i1 %tmp948 to i32
+ %tmp950 = bitcast i32 %tmp949 to float
+ %tmp951 = bitcast float %tmp950 to i32
+ %tmp952 = icmp ne i32 %tmp951, 0
+ %.184 = select i1 %tmp952, float 1.000000e+00, float %tmp947
+ %one.sub.a.i9 = fsub float 1.000000e+00, %.184
+ %one.sub.ac.i10 = fmul float %one.sub.a.i9, %tmp47
+ %mul.i11 = fmul float %tmp928, %tmp47
+ %result.i12 = fadd float %mul.i11, %one.sub.ac.i10
+ %one.sub.a.i5 = fsub float 1.000000e+00, %.184
+ %one.sub.ac.i6 = fmul float %one.sub.a.i5, %tmp49
+ %mul.i7 = fmul float %tmp929, %tmp49
+ %result.i8 = fadd float %mul.i7, %one.sub.ac.i6
+ %one.sub.a.i1 = fsub float 1.000000e+00, %.184
+ %one.sub.ac.i2 = fmul float %one.sub.a.i1, %tmp51
+ %mul.i3 = fmul float %tmp930, %tmp51
+ %result.i4 = fadd float %mul.i3, %one.sub.ac.i2
+ %tmp953 = insertelement <4 x float> undef, float %result.i12, i32 0
+ %tmp954 = insertelement <4 x float> %tmp953, float %result.i8, i32 1
+ %tmp955 = insertelement <4 x float> %tmp954, float %result.i4, i32 2
+ %tmp956 = insertelement <4 x float> %tmp955, float %tmp931, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp956, i32 0, i32 0)
ret void
}
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+; Function Attrs: nounwind readnone
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq.f32(float) #1
+; Function Attrs: nounwind readnone
+declare float @llvm.r600.recipsqrt.clamped.f32(float) #0
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+; Function Attrs: nounwind readonly
+declare float @llvm.fabs.f32(float) #1
-; Function Attrs: readonly
-declare float @fabs(float) #2
+; Function Attrs: nounwind readnone
+declare float @llvm.exp2.f32(float) #0
-; Function Attrs: readnone
-declare float @llvm.AMDIL.exp.(float) #1
+; Function Attrs: nounwind readnone
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.lrp(float, float, float) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #1
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
-attributes #2 = { readonly }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/bitcast.ll b/test/CodeGen/AMDGPU/bitcast.ll
index fd56d956bf31..87ef5978ebfc 100644
--- a/test/CodeGen/AMDGPU/bitcast.ll
+++ b/test/CodeGen/AMDGPU/bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; This test just checks that the compiler doesn't crash.
@@ -7,7 +7,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
; FUNC-LABEL: {{^}}v32i8_to_v8i32:
; SI: s_endpgm
-define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+define amdgpu_ps void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
entry:
%1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
%2 = bitcast <32 x i8> %1 to <8 x i32>
@@ -76,4 +76,34 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace
ret void
}
-attributes #0 = { "ShaderType"="0" }
+; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
+define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %if, label %end
+
+if:
+ %cast = bitcast <2 x i64> %value to <2 x double>
+ br label %end
+
+end:
+ %phi = phi <2 x double> [zeroinitializer, %entry], [%cast, %if]
+ store <2 x double> %phi, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
+define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %if, label %end
+
+if:
+ %cast = bitcast <2 x double> %value to <2 x i64>
+ br label %end
+
+end:
+ %phi = phi <2 x i64> [zeroinitializer, %entry], [%cast, %if]
+ store <2 x i64> %phi, <2 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
new file mode 100644
index 000000000000..150e3430a5e9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
@@ -0,0 +1,158 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Test that materialized constants that are the bit reverse of an inline
+; immediate are replaced with a bfrev of the inline immediate to save
+; code size.
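+; For example (a sketch of the arithmetic behind the checks below):
+; reversing the 32 bits of the inline immediate 1 gives 0x80000000
+; (-2147483648), reversing -16 (0xfffffff0) gives 0x0fffffff (268435455),
+; and reversing 64 gives 0x02000000 (33554432), so each of those stores
+; can be emitted as a v_bfrev_b32 of 1, -16 or 64 respectively.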
+
+; GCN-LABEL: {{^}}materialize_0_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_0_i32(i32 addrspace(1)* %out) {
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_0_i64:
+; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_0_i64(i64 addrspace(1)* %out) {
+ store i64 0, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_neg1_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
+ store i32 -1, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_neg1_i64:
+; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
+; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
+ store i64 -1, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_signbit_i32:
+; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
+ store i32 -2147483648, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_signbit_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
+ store i64 -9223372036854775808, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_neg16_i32:
+; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
+ store i32 268435455, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_neg16_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
+; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
+ store i64 1152921504606846975, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_neg17_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
+ store i32 -134217729, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_neg17_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
+ store i64 -576460752303423489, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_64_i32:
+; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
+ store i32 33554432, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_64_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
+ store i64 144115188075855872, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_65_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
+ store i32 -2113929216, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_65_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
+ store i64 -9079256848778919936, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_3_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
+ store i32 -1073741824, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_3_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
+ store i64 -4611686018427387904, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_1.0_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}}
+; GCN: buffer_store_dword [[K]]
+define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
+ store i32 508, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}materialize_rev_1.0_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
+define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
+ store i64 508, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll
index 0ef7d5184c1f..62e7904f4382 100644
--- a/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/test/CodeGen/AMDGPU/bitreverse.ll
@@ -11,8 +11,6 @@ declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
-declare i32 @llvm.AMDGPU.brev(i32) #1
-
; FUNC-LABEL: {{^}}s_brev_i16:
; SI: s_brev_b32
define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
@@ -103,13 +101,5 @@ define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrsp
ret void
}
-; FUNC-LABEL: {{^}}legacy_s_brev_i32:
-; SI: s_brev_b32
-define void @legacy_s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
- %brev = call i32 @llvm.AMDGPU.brev(i32 %val) #1
- store i32 %brev, i32 addrspace(1)* %out
- ret void
-}
-
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/branch-uniformity.ll b/test/CodeGen/AMDGPU/branch-uniformity.ll
new file mode 100644
index 000000000000..d1a1f93f0210
--- /dev/null
+++ b/test/CodeGen/AMDGPU/branch-uniformity.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck %s
+
+; The branch instruction in LOOP49 has a uniform condition, but PHI instructions
+; introduced by the structurizecfg pass previously caused a false divergence
+; which ended up in an assertion (or incorrect code) because
+; SIAnnotateControlFlow and structurizecfg had different ideas about which
+; branches are uniform.
+;
+; CHECK-LABEL: {{^}}main:
+; CHECK: ; %LOOP49
+; CHECK: v_cmp_ne_i32_e32 vcc,
+; CHECK: s_cbranch_vccnz
+; CHECK: ; %ENDIF53
+define amdgpu_vs float @main(i32 %in) {
+main_body:
+ %cmp = mul i32 %in, 2
+ br label %LOOP
+
+LOOP: ; preds = %ENDLOOP48, %main_body
+ %counter = phi i32 [ 0, %main_body ], [ %counter.next, %ENDLOOP48 ]
+ %v.LOOP = phi i32 [ 0, %main_body ], [ %v.ENDLOOP48, %ENDLOOP48 ]
+ %tmp7 = icmp slt i32 %cmp, %counter
+ br i1 %tmp7, label %IF, label %LOOP49
+
+IF: ; preds = %LOOP
+ %r = bitcast i32 %v.LOOP to float
+ ret float %r
+
+LOOP49: ; preds = %LOOP
+ %tmp8 = icmp ne i32 %counter, 0
+ br i1 %tmp8, label %ENDLOOP48, label %ENDIF53
+
+ENDLOOP48: ; preds = %ENDIF53, %LOOP49
+ %v.ENDLOOP48 = phi i32 [ %v.LOOP, %LOOP49 ], [ %v.ENDIF53, %ENDIF53 ]
+ %counter.next = add i32 %counter, 1
+ br label %LOOP
+
+ENDIF53: ; preds = %LOOP49
+ %v.ENDIF53 = add i32 %v.LOOP, %counter
+ br label %ENDLOOP48
+}
diff --git a/test/CodeGen/AMDGPU/bug-vopc-commute.ll b/test/CodeGen/AMDGPU/bug-vopc-commute.ll
new file mode 100644
index 000000000000..990671102757
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bug-vopc-commute.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+target triple = "amdgcn--"
+
+; CHECK-LABEL: {{^}}main:
+;
+; Test for compilation only. This generated an invalid machine instruction
+; by trying to commute the operands of a V_CMP_EQ_i32_e32 instruction, both
+; of which were in SGPRs.
+define amdgpu_vs float @main(i32 %v) {
+main_body:
+ %d1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 960)
+ %d2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 976)
+ br i1 undef, label %ENDIF56, label %IF57
+
+IF57: ; preds = %ENDIF
+ %v.1 = mul i32 %v, 2
+ br label %ENDIF56
+
+ENDIF56: ; preds = %IF57, %ENDIF
+ %v.2 = phi i32 [ %v, %main_body ], [ %v.1, %IF57 ]
+ %d1.i = bitcast float %d1 to i32
+ %cc1 = icmp eq i32 %d1.i, 0
+ br i1 %cc1, label %ENDIF59, label %IF60
+
+IF60: ; preds = %ENDIF56
+ %v.3 = mul i32 %v.2, 2
+ br label %ENDIF59
+
+ENDIF59: ; preds = %IF60, %ENDIF56
+ %v.4 = phi i32 [ %v.2, %ENDIF56 ], [ %v.3, %IF60 ]
+ %d2.i = bitcast float %d2 to i32
+ %cc2 = icmp eq i32 %d2.i, 0
+ br i1 %cc2, label %ENDIF62, label %IF63
+
+IF63: ; preds = %ENDIF59
+ unreachable
+
+ENDIF62: ; preds = %ENDIF59
+ %r = bitcast i32 %v.4 to float
+ ret float %r
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/call.ll b/test/CodeGen/AMDGPU/call.ll
index e769fd11c282..a3e986d367e1 100644
--- a/test/CodeGen/AMDGPU/call.ll
+++ b/test/CodeGen/AMDGPU/call.ll
@@ -1,8 +1,10 @@
-; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s
-; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s 2>&1 | FileCheck %s
; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported call to function external_function in test_call_external
+; CHECK: in function test_call_external{{.*}}: unsupported call to function external_function
+; CHECK: in function test_call{{.*}}: unsupported call to function defined_function
+; CHECK: in function test_tail_call{{.*}}: unsupported call to function defined_function
declare i32 @external_function(i32) nounwind
@@ -31,3 +33,13 @@ define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %c = tail call i32 @defined_function(i32 %b) nounwind
+ %result = add i32 %a, %c
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/call_fs.ll b/test/CodeGen/AMDGPU/call_fs.ll
index 87bebbc49d52..a5a2d28ff716 100644
--- a/test/CodeGen/AMDGPU/call_fs.ll
+++ b/test/CodeGen/AMDGPU/call_fs.ll
@@ -10,8 +10,6 @@
; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
-define void @call_fs() #0 {
+define amdgpu_vs void @call_fs() {
ret void
}
-
-attributes #0 = { "ShaderType"="1" } ; Vertex Shader
diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll
new file mode 100644
index 000000000000..161c46b486eb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -0,0 +1,166 @@
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}stored_fi_to_lds:
+; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
+; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
+define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
+ %tmp = alloca float
+ store float 4.0, float *%tmp
+ store float* %tmp, float* addrspace(3)* %ptr
+ ret void
+}
+
+; Offset is applied
+; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
+
+; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
+
+; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
+; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO]]
+
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
+define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ store float 4.0, float* %tmp0
+ store float 4.0, float* %tmp1
+ store volatile float* %tmp0, float* addrspace(3)* %ptr
+ store volatile float* %tmp1, float* addrspace(3)* %ptr
+ ret void
+}
+
+; Same frame index is used multiple times in the store
+; GCN-LABEL: {{^}}stored_fi_to_self:
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[K]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword [[ZERO]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+define void @stored_fi_to_self() #0 {
+ %tmp = alloca i32*
+
+ ; Avoid optimizing everything out
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
+ %bitcast = bitcast i32** %tmp to i32*
+ store volatile i32* %bitcast, i32** %tmp
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_self_offset:
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
+; GCN: buffer_store_dword [[K0]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
+; GCN: buffer_store_dword [[K1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2048{{$}}
+
+; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2048{{$}}
+define void @stored_fi_to_self_offset() #0 {
+ %tmp0 = alloca [512 x i32]
+ %tmp1 = alloca i32*
+
+ ; Avoid optimizing everything out
+ %tmp0.cast = bitcast [512 x i32]* %tmp0 to i32*
+ store volatile i32 32, i32* %tmp0.cast
+
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp1
+
+ %bitcast = bitcast i32** %tmp1 to i32*
+ store volatile i32* %bitcast, i32** %tmp1
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_fi:
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8{{$}}
+
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8{{$}}
+
+; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], [[ZERO]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4{{$}}
+define void @stored_fi_to_fi() #0 {
+ %tmp0 = alloca i32*
+ %tmp1 = alloca i32*
+ %tmp2 = alloca i32*
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp0
+ store volatile i32* inttoptr (i32 5678 to i32*), i32** %tmp1
+ store volatile i32* inttoptr (i32 9999 to i32*), i32** %tmp2
+
+ %bitcast1 = bitcast i32** %tmp1 to i32*
+ %bitcast2 = bitcast i32** %tmp2 to i32* ; at offset 8
+
+ store volatile i32* %bitcast1, i32** %tmp2 ; store offset 4 at offset 8
+ store volatile i32* %bitcast2, i32** %tmp1 ; store offset 8 at offset 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_global:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
+ %tmp = alloca float
+ store float 0.0, float *%tmp
+ store float* %tmp, float* addrspace(1)* %ptr
+ ret void
+}
+
+; Offset is applied
+; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ %tmp2 = alloca float
+ store volatile float 0.0, float *%tmp0
+ store volatile float 0.0, float *%tmp1
+ store volatile float 0.0, float *%tmp2
+ store volatile float* %tmp1, float* addrspace(1)* %ptr
+ store volatile float* %tmp2, float* addrspace(1)* %ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
+; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc
+; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]]
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56
+; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]]
+; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
+ %tmp0 = alloca [4096 x i32]
+ %tmp1 = alloca [4096 x i32]
+ %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
+ store volatile i32 0, i32* %gep0.tmp0
+ %gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 4095
+ store volatile i32 999, i32* %gep1.tmp0
+ %gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 14
+ store i32* %gep0.tmp1, i32* addrspace(1)* %ptr
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/cayman-loop-bug.ll b/test/CodeGen/AMDGPU/cayman-loop-bug.ll
index c7b8c4037316..34e6669434f1 100644
--- a/test/CodeGen/AMDGPU/cayman-loop-bug.ll
+++ b/test/CodeGen/AMDGPU/cayman-loop-bug.ll
@@ -8,25 +8,29 @@
; CHECK-NOT: ALU_PUSH_BEFORE
; CHECK: END_LOOP
; CHECK: END_LOOP
-define void @main (<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @main (<4 x float> inreg %reg0) {
entry:
br label %outer_loop
+
outer_loop:
%cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop]
%cond = icmp eq i32 %cnt, 16
br i1 %cond, label %outer_loop_body, label %exit
+
outer_loop_body:
%cnt_incr = add i32 %cnt, 1
br label %inner_loop
+
inner_loop:
%cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body]
- %cond2 = icmp eq i32 %cnt2, 16
- br i1 %cond, label %inner_loop_body, label %outer_loop
+ %n = load volatile i32, i32 addrspace(1)* undef
+ %cond2 = icmp slt i32 %cnt2, %n
+ br i1 %cond2, label %inner_loop_body, label %outer_loop
+
inner_loop_body:
%cnt2_incr = add i32 %cnt2, 1
br label %inner_loop
+
exit:
ret void
}
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
new file mode 100644
index 000000000000..759c48b3a9cf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -0,0 +1,121 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
+
+; GCN-LABEL: {{^}}test_loop:
+; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+; GCN: s_branch [[LABEL]]
+; GCN: s_endpgm
+define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+ %cmp = icmp eq i32 %n, -1
+ br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+ ret void
+
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br label %for.body
+}
+
+; GCN-LABEL: @loop_const_true
+; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+; GCN: s_branch [[LABEL]]
+define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+ br label %for.body
+
+for.exit:
+ ret void
+
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br i1 true, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_const_false:
+; GCN-NOT: s_branch
+; GCN: s_endpgm
+define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+ br label %for.body
+
+for.exit:
+ ret void
+
+; XXX - Should there be an S_ENDPGM?
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br i1 false, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_const_undef:
+; GCN-NOT: s_branch
+; GCN: s_endpgm
+define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+entry:
+ br label %for.body
+
+for.exit:
+ ret void
+
+; XXX - Should there be an s_endpgm?
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br i1 undef, label %for.body, label %for.exit
+}
+
+; GCN-LABEL: {{^}}loop_arg_0:
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN: v_cmp_eq_i32_e32 vcc, 1,
+
+; GCN: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, exec, vcc
+; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_vccnz [[LOOPBB]]
+; GCN-NEXT: ; BB#2
+; GCN-NEXT: s_endpgm
+define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+entry:
+ br label %for.body
+
+for.exit:
+ ret void
+
+for.body:
+ %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %tmp = add i32 %indvar, 32
+ %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
+ %vecload = load float, float addrspace(3)* %arrayidx, align 4
+ %add = fadd float %vecload, 1.0
+ store float %add, float addrspace(3)* %arrayidx, align 8
+ %inc = add i32 %indvar, 1
+ br i1 %cond, label %for.body, label %for.exit
+}
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 1c5bed3b905f..82f88a079307 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -42,7 +42,7 @@ done:
; OPT: br label
; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
-; CI: buffer_load_dword {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
+; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
entry:
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 698494265a7d..916d667ec492 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -1,11 +1,9 @@
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-declare i32 @llvm.r600.read.tidig.x() #0
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; OPT-LABEL: @test_sink_global_small_offset_i32(
; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
@@ -15,11 +13,12 @@ declare i32 @llvm.r600.read.tidig.x() #0
; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
; GCN: {{^}}BB0_2:
-define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -41,14 +40,15 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: {{^}}BB1_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -67,14 +67,15 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
+; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GCN: {{^}}BB2_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -93,14 +94,15 @@ done:
; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: {{^}}BB3_2:
; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -127,14 +129,15 @@ done:
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: {{^}}BB4_2:
-define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
+define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%add.arg = add i32 %arg, 8
%alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -163,14 +166,15 @@ done:
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: {{^}}BB5_2:
-define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
+define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%add.arg = add i32 %arg, 8
%alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -190,18 +194,17 @@ done:
}
; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
-; VI-DAG: s_movk_i32 flat_scratch_lo, 0x0
-; VI-DAG: s_movk_i32 flat_scratch_hi, 0x0
; GCN: s_and_saveexec_b64
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN: {{^}}BB6_2:
-define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) {
+define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
%offset.ext = zext i32 %offset to i64
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -230,11 +233,12 @@ attributes #1 = { nounwind }
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -258,11 +262,12 @@ done:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -290,11 +295,12 @@ done:
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -321,11 +327,12 @@ done:
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -351,11 +358,12 @@ done:
; GCN: s_addc_u32
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -380,11 +388,12 @@ done:
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -417,11 +426,12 @@ done:
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
- %tmp0 = icmp eq i32 %cond, 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp0 = icmp eq i32 %tid, 0
br i1 %tmp0, label %endif, label %if
if:
@@ -436,3 +446,35 @@ endif:
done:
ret void
}
+
+%struct.foo = type { [3 x float], [3 x float] }
+
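+; Address-mode sinking is expected to rewrite the GEPs through ptrtoint so
+; that both field loads share %ptr as the base register and fold into a
+; single ds_read2 with dword offsets 3 and 5.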
+; OPT-LABEL: @sink_ds_address(
+; OPT: ptrtoint %struct.foo addrspace(3)* %ptr to i64
+
+; GCN-LABEL: {{^}}sink_ds_address:
+; GCN: s_load_dword [[SREG1:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VREG1]] offset0:3 offset1:5
+define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+entry:
+ %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
+ %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
+ br label %bb32
+
+bb32:
+ %a = load float, float addrspace(3)* %x, align 4
+ %b = load float, float addrspace(3)* %y, align 4
+ %cmp = fcmp one float %a, %b
+ br i1 %cmp, label %bb34, label %bb33
+
+bb33:
+ unreachable
+
+bb34:
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
new file mode 100644
index 000000000000..33daf0292ae1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -0,0 +1,301 @@
+; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -codegenprepare < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; This particular case will actually be worse in terms of code size
+; from sinking the shift into both blocks.
+
+; OPT-LABEL: @sink_ubfe_i32(
+; OPT: entry:
+; OPT-NEXT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i32 %arg1, 8
+; OPT-NEXT: %val0 = and i32 %0, 255
+; OPT: br label
+
+; OPT: bb1:
+; OPT: %1 = lshr i32 %arg1, 8
+; OPT-NEXT: %val1 = and i32 %1, 127
+; OPT: br label
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+
+; GCN-LABEL: {{^}}sink_ubfe_i32:
+; GCN-NOT: lshr
+; GCN: s_cbranch_vccnz
+
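+; The s_bfe_u32 source-2 operand packs (width << 16) | offset, so 0x80008 is
+; an 8-bit field at bit 8 and 0x70008 a 7-bit field at bit 8, matching the
+; two and-masks in the IR below.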
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008
+; GCN: BB0_2:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70008
+
+; GCN: BB0_3:
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+entry:
+ %shr = lshr i32 %arg1, 8
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i32 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i32 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i32 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i32 %phi, i32 addrspace(1)* %out
+ ret void
+}
+
+; OPT-LABEL: @sink_sbfe_i32(
+; OPT: entry:
+; OPT-NEXT: br i1
+
+; OPT: bb0:
+; OPT: %0 = ashr i32 %arg1, 8
+; OPT-NEXT: %val0 = and i32 %0, 255
+; OPT: br label
+
+; OPT: bb1:
+; OPT: %1 = ashr i32 %arg1, 8
+; OPT-NEXT: %val1 = and i32 %1, 127
+; OPT: br label
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+; GCN-LABEL: {{^}}sink_sbfe_i32:
+define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+entry:
+ %shr = ashr i32 %arg1, 8
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i32 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i32 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i32 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i32 %phi, i32 addrspace(1)* %out
+ ret void
+}
+
+
+; OPT-LABEL: @sink_ubfe_i16(
+; OPT: entry:
+; OPT-NEXT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i16 %arg1, 4
+; OPT-NEXT: %val0 = and i16 %0, 255
+; OPT: br label
+
+; OPT: bb1:
+; OPT: %1 = lshr i16 %arg1, 4
+; OPT-NEXT: %val1 = and i16 %1, 127
+; OPT: br label
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+
+; GCN-LABEL: {{^}}sink_ubfe_i16:
+; GCN-NOT: lshr
+; GCN: s_cbranch_vccnz
+
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
+; GCN: BB2_2:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
+
+; GCN: BB2_3:
+; GCN: buffer_store_short
+; GCN: s_endpgm
+define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
+entry:
+ %shr = lshr i16 %arg1, 4
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i16 %shr, 255
+ store volatile i16 0, i16 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i16 %shr, 127
+ store volatile i16 0, i16 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i16 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i16 %phi, i16 addrspace(1)* %out
+ ret void
+}
+
+; We don't really want to sink this one since it isn't reducible to a
+; 32-bit BFE on one half of the integer.
+
+; OPT-LABEL: @sink_ubfe_i64_span_midpoint(
+; OPT: entry:
+; OPT-NOT: lshr
+; OPT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i64 %arg1, 30
+; OPT-NEXT: %val0 = and i64 %0, 255
+
+; OPT: bb1:
+; OPT: %1 = lshr i64 %arg1, 30
+; OPT-NEXT: %val1 = and i64 %1, 127
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+; GCN-LABEL: {{^}}sink_ubfe_i64_span_midpoint:
+; GCN: s_cbranch_vccnz BB3_2
+
+; GCN: s_lshr_b64 s{{\[}}[[LO:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 30
+; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0xff
+
+; GCN: BB3_2:
+; GCN: s_lshr_b64 s{{\[}}[[LO:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 30
+; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7f
+
+; GCN: BB3_3:
+; GCN: buffer_store_dwordx2
+define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
+entry:
+ %shr = lshr i64 %arg1, 30
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i64 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i64 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i64 %phi, i64 addrspace(1)* %out
+ ret void
+}
+
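+; The field lies entirely within the low 32 bits of the i64, so it is still
+; reducible to a 32-bit bfe and worth sinking.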
+; OPT-LABEL: @sink_ubfe_i64_low32(
+; OPT: entry:
+; OPT-NOT: lshr
+; OPT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i64 %arg1, 15
+; OPT-NEXT: %val0 = and i64 %0, 255
+
+; OPT: bb1:
+; OPT: %1 = lshr i64 %arg1, 15
+; OPT-NEXT: %val1 = and i64 %1, 127
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+; GCN-LABEL: {{^}}sink_ubfe_i64_low32:
+
+; GCN: s_cbranch_vccnz BB4_2
+
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000f
+
+; GCN: BB4_2:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7000f
+
+; GCN: BB4_3:
+; GCN: buffer_store_dwordx2
+define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+entry:
+ %shr = lshr i64 %arg1, 15
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i64 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i64 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i64 %phi, i64 addrspace(1)* %out
+ ret void
+}
+
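+; The field lies entirely within the high 32 bits, so it reduces to a 32-bit
+; bfe on the high half (offset 35 becomes bit 3 of the high dword).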
+; OPT-LABEL: @sink_ubfe_i64_high32(
+; OPT: entry:
+; OPT-NOT: lshr
+; OPT: br i1
+
+; OPT: bb0:
+; OPT: %0 = lshr i64 %arg1, 35
+; OPT-NEXT: %val0 = and i64 %0, 255
+
+; OPT: bb1:
+; OPT: %1 = lshr i64 %arg1, 35
+; OPT-NEXT: %val1 = and i64 %1, 127
+
+; OPT: ret:
+; OPT: store
+; OPT: ret
+
+; GCN-LABEL: {{^}}sink_ubfe_i64_high32:
+; GCN: s_cbranch_vccnz BB5_2
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80003
+
+; GCN: BB5_2:
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70003
+
+; GCN: BB5_3:
+; GCN: buffer_store_dwordx2
+define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+entry:
+ %shr = lshr i64 %arg1, 35
+ br i1 undef, label %bb0, label %bb1
+
+bb0:
+ %val0 = and i64 %shr, 255
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+bb1:
+ %val1 = and i64 %shr, 127
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %ret
+
+ret:
+ %phi = phi i64 [ %val0, %bb0 ], [ %val1, %bb1 ]
+ store i64 %phi, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
index 1a37e3c75fa3..8227d4c873ee 100644
--- a/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
+++ b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
@@ -1,11 +1,22 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
+; There are no stack objects even though flat is used by default, so
+; flat_scratch_init should be disabled.
+
+; ALL-LABEL: {{^}}test:
+; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: .end_amd_kernel_code_t
+
+; ALL-NOT: flat_scr
+
; HSA-DEFAULT: flat_store_dword
; HSA-NODEFAULT: buffer_store_dword
+
; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
define void @test(i32 addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
new file mode 100644
index 000000000000..2c4c07c193af
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i1 @llvm.amdgcn.class.f32(float, i32)
+
+; This produced an error after an implicit def was added to v_cndmask_b32.
+
+; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
+; GCN: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
+bb0:
+ %tmp = icmp sgt i32 %arg1, 4
+ %c = icmp eq i32 %arg3, 0
+ %tmp4 = select i1 %c, float %arg, float 1.000000e+00
+ %tmp5 = fcmp ogt float %arg2, 0.000000e+00
+ %tmp6 = fcmp olt float %arg2, 1.000000e+00
+ %tmp7 = fcmp olt float %arg, %tmp4
+ %tmp8 = and i1 %tmp5, %tmp6
+ %tmp9 = and i1 %tmp8, %tmp7
+ br i1 %tmp9, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+; The undef flag on the condition src must be preserved on the
+; implicit vcc use to avoid verifier errors.
+
+; GCN-LABEL: {{^}}preserve_condition_undef_flag:
+; GCN-NOT: vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
+define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
+bb0:
+ %tmp = icmp sgt i32 %arg1, 4
+ %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
+ %tmp4 = select i1 %undef, float %arg, float 1.000000e+00
+ %tmp5 = fcmp ogt float %arg2, 0.000000e+00
+ %tmp6 = fcmp olt float %arg2, 1.000000e+00
+ %tmp7 = fcmp olt float %arg, %tmp4
+ %tmp8 = and i1 %tmp5, %tmp6
+ %tmp9 = and i1 %tmp8, %tmp7
+ br i1 %tmp9, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 0, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/coalescer_distribute.ll b/test/CodeGen/AMDGPU/coalescer_distribute.ll
new file mode 100644
index 000000000000..7ca2612598c8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/coalescer_distribute.ll
@@ -0,0 +1,53 @@
+; RUN: llc -o /dev/null %s
+; This testcase produces a situation with unused value numbers in subregister
+; live ranges that get distributed by ConnectedVNInfoEqClasses.
+target triple = "amdgcn--"
+
+define spir_kernel void @hoge() {
+bb:
+ %tmp = tail call i32 @llvm.r600.read.tidig.x()
+ br i1 undef, label %bb2, label %bb23
+
+bb2:
+ br i1 undef, label %bb6, label %bb8
+
+bb6:
+ %tmp7 = or i64 undef, undef
+ br label %bb8
+
+bb8:
+ %tmp9 = phi i64 [ %tmp7, %bb6 ], [ undef, %bb2 ]
+ %tmp10 = icmp eq i32 %tmp, 0
+ br i1 %tmp10, label %bb11, label %bb23
+
+bb11:
+ br i1 undef, label %bb20, label %bb17
+
+bb17:
+ br label %bb20
+
+bb20:
+ %tmp21 = phi i64 [ undef, %bb17 ], [ %tmp9, %bb11 ]
+ %tmp22 = trunc i64 %tmp21 to i32
+ br label %bb23
+
+bb23:
+ %tmp24 = phi i32 [ %tmp22, %bb20 ], [ undef, %bb8 ], [ undef, %bb ]
+ br label %bb25
+
+bb25:
+ %tmp26 = phi i32 [ %tmp24, %bb23 ], [ undef, %bb25 ]
+ br i1 undef, label %bb25, label %bb30
+
+bb30:
+ br i1 undef, label %bb32, label %bb34
+
+bb32:
+ %tmp33 = zext i32 %tmp26 to i64
+ br label %bb34
+
+bb34:
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x()
diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll
index 31766047a358..731b47cd9ee2 100644
--- a/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/test/CodeGen/AMDGPU/commute-compares.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
; --------------------------------------------------------------------------------
; i32 compares
@@ -9,7 +9,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
; GCN-LABEL: {{^}}commute_eq_64_i32:
; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -22,7 +22,7 @@ define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_64_i32:
; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -37,7 +37,7 @@ define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}}
define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -50,7 +50,7 @@ define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_ugt_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -63,7 +63,7 @@ define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_uge_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -76,7 +76,7 @@ define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ult_64_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -89,7 +89,7 @@ define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_63_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -105,7 +105,7 @@ define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -118,7 +118,7 @@ define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -131,7 +131,7 @@ define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_sge_neg2_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -144,7 +144,7 @@ define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_slt_neg16_i32:
; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -157,7 +157,7 @@ define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; GCN-LABEL: {{^}}commute_sle_5_i32:
; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i32, i32 addrspace(1)* %gep.in
@@ -174,7 +174,7 @@ define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_eq_64_i64:
; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -187,7 +187,7 @@ define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_64_i64:
; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -200,7 +200,7 @@ define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ugt_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -213,7 +213,7 @@ define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_uge_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -226,7 +226,7 @@ define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ult_64_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -239,7 +239,7 @@ define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_63_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -255,7 +255,7 @@ define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -268,7 +268,7 @@ define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -281,7 +281,7 @@ define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_sge_neg2_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -294,7 +294,7 @@ define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_slt_neg16_i64:
; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -307,7 +307,7 @@ define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-LABEL: {{^}}commute_sle_5_i64:
; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %gep.in
@@ -325,7 +325,7 @@ define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -339,7 +339,7 @@ define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -352,7 +352,7 @@ define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_oge_2.0_f32:
; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -365,7 +365,7 @@ define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_olt_2.0_f32:
; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -378,7 +378,7 @@ define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ole_2.0_f32:
; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -391,7 +391,7 @@ define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_one_2.0_f32:
; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -404,7 +404,7 @@ define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ord_2.0_f32:
; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -417,7 +417,7 @@ define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -430,7 +430,7 @@ define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -443,7 +443,7 @@ define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_uge_2.0_f32:
; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -456,7 +456,7 @@ define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ult_2.0_f32:
; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -469,7 +469,7 @@ define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ule_2.0_f32:
; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -482,7 +482,7 @@ define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_une_2.0_f32:
; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -495,7 +495,7 @@ define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_uno_2.0_f32:
; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load float, float addrspace(1)* %gep.in
@@ -513,7 +513,7 @@ define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -527,7 +527,7 @@ define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -540,7 +540,7 @@ define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_oge_2.0_f64:
; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -553,7 +553,7 @@ define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_olt_2.0_f64:
; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -566,7 +566,7 @@ define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ole_2.0_f64:
; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -579,7 +579,7 @@ define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_one_2.0_f64:
; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -592,7 +592,7 @@ define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ord_2.0_f64:
; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -605,7 +605,7 @@ define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -618,7 +618,7 @@ define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -631,7 +631,7 @@ define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_uge_2.0_f64:
; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -644,7 +644,7 @@ define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ult_2.0_f64:
; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -657,7 +657,7 @@ define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ule_2.0_f64:
; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -670,7 +670,7 @@ define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_une_2.0_f64:
; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
@@ -683,7 +683,7 @@ define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_uno_2.0_f64:
; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load double, double addrspace(1)* %gep.in
diff --git a/test/CodeGen/AMDGPU/commute-shifts.ll b/test/CodeGen/AMDGPU/commute-shifts.ll
index f88cf6470c4f..862f236514ca 100644
--- a/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -4,30 +4,26 @@
; GCN-LABEL: {{^}}main:
; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1
-
-define void @main() #0 {
-main_body:
- %0 = fptosi float undef to i32
- %1 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> undef, <32 x i8> undef, i32 2)
- %2 = extractelement <4 x i32> %1, i32 0
- %3 = and i32 %0, 7
- %4 = shl i32 1, %3
- %5 = and i32 %2, %4
- %6 = icmp eq i32 %5, 0
- %.10 = select i1 %6, float 0.000000e+00, float undef
- %7 = call i32 @llvm.SI.packf16(float undef, float %.10)
- %8 = bitcast i32 %7 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %8, float undef, float %8)
+define amdgpu_ps void @main(float %arg0, float %arg1) #0 {
+bb:
+ %tmp = fptosi float %arg0 to i32
+ %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp2.f = extractelement <4 x float> %tmp1, i32 0
+ %tmp2 = bitcast float %tmp2.f to i32
+ %tmp3 = and i32 %tmp, 7
+ %tmp4 = shl i32 1, %tmp3
+ %tmp5 = and i32 %tmp2, %tmp4
+ %tmp6 = icmp eq i32 %tmp5, 0
+ %tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
+ %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7)
+ %tmp9 = bitcast i32 %tmp8 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9)
ret void
}
-; Function Attrs: nounwind readnone
-declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare i32 @llvm.SI.packf16(float, float) #1
-
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll
index 7fc36eabb780..bce3fe998c8a 100644
--- a/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ b/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.fma.f32(float, float, float) nounwind readnone
; FUNC-LABEL: @commute_add_imm_fabs_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
%x.fabs = call float @llvm.fabs.f32(float %x) #1
@@ -21,9 +21,9 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(
; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
%x.fabs = call float @llvm.fabs.f32(float %x) #1
@@ -36,9 +36,9 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs
; FUNC-LABEL: @commute_mul_imm_fneg_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
%x.fneg = fsub float -0.000000e+00, %x
@@ -52,9 +52,9 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
%x.fabs = call float @llvm.fabs.f32(float %x) #1
@@ -67,13 +67,13 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%z = fadd float %x, %y.fabs
store float %z, float addrspace(1)* %out
@@ -84,13 +84,13 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%y.fneg = fsub float -0.000000e+00, %y
%z = fmul float %x, %y.fneg
store float %z, float addrspace(1)* %out
@@ -101,13 +101,13 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
%z = fmul float %x, %y.fabs.fneg
@@ -120,13 +120,13 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%z = fmul float %x.fabs, %y.fabs
@@ -138,13 +138,13 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%y.fabs = call float @llvm.fabs.f32(float %y) #1
%y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
@@ -159,16 +159,16 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float
; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]|
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
; SI: buffer_store_dword [[RESULT]]
define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r2.fabs = call float @llvm.fabs.f32(float %r2)
diff --git a/test/CodeGen/AMDGPU/complex-folding.ll b/test/CodeGen/AMDGPU/complex-folding.ll
index a5399a71324c..acf81ba7b5dd 100644
--- a/test/CodeGen/AMDGPU/complex-folding.ll
+++ b/test/CodeGen/AMDGPU/complex-folding.ll
@@ -2,18 +2,16 @@
; CHECK: {{^}}main:
; CHECK-NOT: MOV
-define void @main(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @main(<4 x float> inreg %reg0) {
entry:
%0 = extractelement <4 x float> %reg0, i32 0
%1 = call float @fabs(float %0)
%2 = fptoui float %1 to i32
%3 = bitcast i32 %2 to float
%4 = insertelement <4 x float> undef, float %3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %4, i32 0, i32 0)
ret void
}
declare float @fabs(float ) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
new file mode 100644
index 000000000000..55a38e576ad1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+; GCN-LABEL: {{^}}convergent_inlineasm:
+; GCN: BB#0:
+; GCN: v_cmp_ne_i32_e64
+; GCN: BB#1:
+define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+bb:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 1) #1
+ %tmp2 = icmp eq i32 %tmp, 8
+ br i1 %tmp2, label %bb3, label %bb5
+
+bb3: ; preds = %bb
+ %tmp4 = getelementptr i64, i64 addrspace(1)* %arg, i32 %tmp
+ store i64 %tmp1, i64 addrspace(1)* %arg, align 8
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret void
+}
+
+; GCN-LABEL: {{^}}nonconvergent_inlineasm:
+; GCN: BB#1:
+; GCN: v_cmp_ne_i32_e64
+; GCN: BB1_2:
+define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+bb:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 1)
+ %tmp2 = icmp eq i32 %tmp, 8
+ br i1 %tmp2, label %bb3, label %bb5
+
+bb3: ; preds = %bb
+ %tmp4 = getelementptr i64, i64 addrspace(1)* %arg, i32 %tmp
+ store i64 %tmp1, i64 addrspace(1)* %arg, align 8
+ br label %bb5
+
+bb5: ; preds = %bb3, %bb
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { convergent nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 8b397566066a..00d2257f4adc 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -54,31 +54,12 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
}
; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI_DAG: buffer_store_byte
-
-; After scalarizing v4i8 loads is fixed.
-; XSI: buffer_load_dword
-; XSI: V_BFE
-; XSI: V_ADD
-; XSI: V_ADD
-; XSI: V_ADD
-; XSI: buffer_store_dword
-; XSI: buffer_store_dword
+; SI: buffer_load_dword
+; SI-DAG: v_lshrrev_b32
+; SI: v_and_b32
+; SI: v_or_b32
+; SI-DAG: buffer_store_dword
+; SI-DAG: buffer_store_dword
; SI: s_endpgm
define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
@@ -90,34 +71,14 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add
}
; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI_DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI_DAG: buffer_store_byte
-
-; XSI: buffer_load_dword
-; XSI: BFE
-; XSI: buffer_store_dword
-; XSI: V_ADD
-; XSI: buffer_store_dword
-; XSI-NEXT: buffer_store_dword
-
+; SI: buffer_load_dword
+; SI-DAG: v_lshrrev_b32
+; SI-DAG: v_add_i32
+; SI-DAG: v_and_b32
+; SI-DAG: v_or_b32
+; SI-DAG: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
; SI: s_endpgm
define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
@@ -128,21 +89,50 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8>
ret void
}
-; FUNC-LABEL: {{^}}test_copy_v3i8:
-; SI-NOT: bfe
-; SI-NOT: bfi
+; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
+; SI: buffer_load_dword
+; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; SI: s_endpgm
-define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
}
+; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
+; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; SI: s_endpgm
+define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+ %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
+ store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+ %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
+ store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
+ ret void
+}
+
; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
+; SI: buffer_store_dword
; SI: s_endpgm
define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index baedf47eef0d..6d2d260177e5 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -116,11 +116,11 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %
; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
-; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
store i64 %ctlz, i64 addrspace(1)* %out
@@ -136,7 +136,8 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
}
; FUNC-LABEL: {{^}}v_ctlz_i64:
-; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
+; SI-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
@@ -145,8 +146,7 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc
-; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
+; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index c1f84cd460cf..65e8205317b6 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -116,14 +116,14 @@ define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %va
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
-; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI-NEXT: buffer_store_dword [[RESULT]],
+; SI: buffer_store_dword [[RESULT]],
define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI-NEXT: buffer_store_dword [[RESULT]],
+; SI: buffer_store_dword [[RESULT]],
define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index 0a031c5e24d1..e53ad13464e8 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -60,9 +60,9 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
-; GCN-NEXT: s_waitcnt
+; GCN: s_waitcnt
; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
-; GCN-NEXT: buffer_store_dword [[RESULT]],
+; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
%val0 = load i32, i32 addrspace(1)* %in0, align 4
@@ -203,8 +203,8 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
@@ -250,8 +250,8 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
}
; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
-; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
+; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index ec2971e98032..d0976b7d45b8 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
@@ -7,6 +7,9 @@ declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
+declare i65 @llvm.ctpop.i65(i65) nounwind readnone
+declare i128 @llvm.ctpop.i128(i128) nounwind readnone
+
; FUNC-LABEL: {{^}}s_ctpop_i64:
; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
@@ -110,15 +113,13 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
ret void
}
-; FIXME: We currently disallow SALU instructions in all branches,
-; but there are some cases when the should be allowed.
-
; FUNC-LABEL: {{^}}ctpop_i64_in_br:
-; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
-; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
-; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
-; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
; GCN: s_endpgm
define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
@@ -140,3 +141,51 @@ endif:
store i64 %tmp5, i64 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}s_ctpop_i128:
+; GCN: s_bcnt1_i32_b64 [[SRESULT0:s[0-9]+]],
+; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
+; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]]
+; GCN: s_endpgm
+define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
+ %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
+ %truncctpop = trunc i128 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctpop_i65:
+; GCN: s_and_b32
+; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]],
+; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
+; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
+; GCN: s_endpgm
+define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
+ %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
+ %truncctpop = trunc i65 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: Should not have extra add
+
+; FUNC-LABEL: {{^}}v_ctpop_i128:
+; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
+; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
+
+; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0
+; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]]
+
+; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]]
+
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
+ %val = load i128, i128 addrspace(1)* %in, align 8
+ %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
+ %truncctpop = trunc i128 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/cube.ll b/test/CodeGen/AMDGPU/cube.ll
new file mode 100644
index 000000000000..ab99af5864e9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cube.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubeid(float, float, float) #0
+declare float @llvm.amdgcn.cubesc(float, float, float) #0
+declare float @llvm.amdgcn.cubetc(float, float, float) #0
+declare float @llvm.amdgcn.cubema(float, float, float) #0
+
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
+
+
+; GCN-LABEL: {{^}}cube:
+; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
+define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
+ %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
+ %cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
+ %cubema = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
+
+ %vec0 = insertelement <4 x float> undef, float %cubeid, i32 0
+ %vec1 = insertelement <4 x float> %vec0, float %cubesc, i32 1
+ %vec2 = insertelement <4 x float> %vec1, float %cubetc, i32 2
+ %vec3 = insertelement <4 x float> %vec2, float %cubema, i32 3
+ store <4 x float> %vec3, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}legacy_cube:
+; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
+define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 {
+ %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx)
+ store <4 x float> %cube, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 834922c62cbd..dcd48c97434d 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}load_i8_to_f32:
@@ -15,12 +15,9 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
}
; SI-LABEL: {{^}}load_v2i8_to_v2f32:
-; SI: buffer_load_ushort [[LOADREG:v[0-9]+]],
-; SI-NOT: bfe
-; SI-NOT: lshr
-; SI-NOT: and
-; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: buffer_load_ushort [[LD:v[0-9]+]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
@@ -30,11 +27,11 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
}
; SI-LABEL: {{^}}load_v3i8_to_v3f32:
-; SI-NOT: bfe
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI-NOT: v_cvt_f32_ubyte3_e32
-; SI-DAG: v_cvt_f32_ubyte2_e32
-; SI-DAG: v_cvt_f32_ubyte1_e32
-; SI-DAG: v_cvt_f32_ubyte0_e32
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
@@ -62,20 +59,20 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
; This should not be adding instructions to shift into the correct
; position in the word for the component.
+; FIXME: Packing bytes
; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
-; SI-NOT: v_lshlrev_b32
-; SI-NOT: v_or_b32
-
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
+; SI-DAG: v_lshlrev_b32
+; SI-DAG: v_or_b32
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
-; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI: buffer_store_dwordx4
define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>
@@ -83,26 +80,25 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
ret void
}
-; XXX - This should really still be able to use the v_cvt_f32_ubyte0
-; for each component, but computeKnownBits doesn't handle vectors very
-; well.
-
+; Instructions still emitted to repack bytes for add use.
; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-
-; XXX - replace with this when v4i8 loads aren't scalarized anymore.
-; XSI: buffer_load_dword
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
+; SI: buffer_load_dword
+; SI-DAG: v_cvt_f32_ubyte0_e32
+; SI-DAG: v_cvt_f32_ubyte1_e32
+; SI-DAG: v_cvt_f32_ubyte2_e32
+; SI-DAG: v_cvt_f32_ubyte3_e32
+
+; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
+; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
+; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
+; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
+; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
+; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00,
+; SI-DAG: v_add_i32
+
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dword
+
; SI: s_endpgm
define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
@@ -170,9 +166,9 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr
ret void
}
-
; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
+; SI-LABEL: {{^}}i8_zext_i32_to_f32:
define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
%load = load i8, i8 addrspace(1)* %in, align 1
%ext = zext i8 %load to i32
@@ -181,6 +177,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1
ret void
}
+; SI-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%ext = zext <4 x i8> %load to <4 x i32>
@@ -188,3 +185,58 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
ret void
}
+
+; SI-LABEL: {{^}}extract_byte0_to_f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: [[VAL]]
+; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[CONV]]
+define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in
+ %and = and i32 %val, 255
+ %cvt = uitofp i32 %and to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}extract_byte1_to_f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: [[VAL]]
+; SI: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[CONV]]
+define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in
+ %srl = lshr i32 %val, 8
+ %and = and i32 %srl, 255
+ %cvt = uitofp i32 %and to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}extract_byte2_to_f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: [[VAL]]
+; SI: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[CONV]]
+define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in
+ %srl = lshr i32 %val, 16
+ %and = and i32 %srl, 255
+ %cvt = uitofp i32 %and to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}extract_byte3_to_f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOT: [[VAL]]
+; SI: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[CONV]]
+define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in
+ %srl = lshr i32 %val, 24
+ %and = and i32 %srl, 255
+ %cvt = uitofp i32 %and to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
new file mode 100644
index 000000000000..a32c16dfac38
--- /dev/null
+++ b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Test for a bug where DAGCombiner::ReassociateOps() was creating adds
+; with offset in the first operand and base pointers in the second.
+
+; CHECK-LABEL: {{^}}store_same_base_ptr:
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR:v\[[0-9]+:[0-9]+\]]], [[SADDR:s\[[0-9]+:[0-9]+\]]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
+
+define void @store_same_base_ptr(i32 addrspace(1)* %out) {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %offset = sext i32 %id to i64
+ %offset0 = add i64 %offset, 1027
+ %ptr0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset0
+ store volatile i32 3, i32 addrspace(1)* %ptr0
+ %offset1 = add i64 %offset, 1026
+ %ptr1 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset1
+ store volatile i32 2, i32 addrspace(1)* %ptr1
+ %offset2 = add i64 %offset, 1025
+ %ptr2 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset2
+ store volatile i32 1, i32 addrspace(1)* %ptr2
+ %offset3 = add i64 %offset, 1024
+ %ptr3 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset3
+ store volatile i32 0, i32 addrspace(1)* %ptr3
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
new file mode 100644
index 000000000000..49a7e722f29c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -0,0 +1,80 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR
+
+; CHECK: debug_wavefront_private_segment_offset_sgpr = [[SOFF:[0-9]+]]
+; CHECK: debug_private_segment_buffer_sgpr = [[SREG:[0-9]+]]
+
+; CHECK: v_mov_b32_e32 [[WGIDX:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDX]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]]
+; CHECK: buffer_store_dword v0, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:16
+
+; CHECK: v_mov_b32_e32 [[WGIDY:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDY]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:4
+; CHECK: buffer_store_dword v1, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:20
+
+; CHECK: v_mov_b32_e32 [[WGIDZ:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDZ]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:8
+; CHECK: buffer_store_dword v2, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:24
+
+; CHECK: DebuggerWavefrontPrivateSegmentOffsetSGPR: s[[SOFF]]
+; CHECK: DebuggerPrivateSegmentBufferSGPR: s[[SREG]]
+
+; NOATTR-NOT: DebuggerWavefrontPrivateSegmentOffsetSGPR
+; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR
+
+; Function Attrs: nounwind
+define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+ %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
+ store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
+ %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !24
+ store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25
+ ret void, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!3}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 269772)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test01.cl", directory: "/home/kzhuravl/Lightning/testing")
+!2 = !{}
+!3 = !{void (i32 addrspace(1)*)* @test, !4, !5, !6, !7, !8}
+!4 = !{!"kernel_arg_addr_space", i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none"}
+!6 = !{!"kernel_arg_type", !"int*"}
+!7 = !{!"kernel_arg_base_type", !"int*"}
+!8 = !{!"kernel_arg_type_qual", !""}
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0 (trunk 269772)"}
+!12 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null, !15}
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32)
+!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15)
+!18 = !DIExpression()
+!19 = !DILocation(line: 1, column: 30, scope: !12)
+!20 = !DILocation(line: 2, column: 3, scope: !12)
+!21 = !DILocation(line: 2, column: 8, scope: !12)
+!22 = !DILocation(line: 3, column: 3, scope: !12)
+!23 = !DILocation(line: 3, column: 8, scope: !12)
+!24 = !DILocation(line: 4, column: 3, scope: !12)
+!25 = !DILocation(line: 4, column: 8, scope: !12)
+!26 = !DILocation(line: 5, column: 1, scope: !12)
diff --git a/test/CodeGen/AMDGPU/debugger-insert-nops.ll b/test/CodeGen/AMDGPU/debugger-insert-nops.ll
new file mode 100644
index 000000000000..6638f4e25821
--- /dev/null
+++ b/test/CodeGen/AMDGPU/debugger-insert-nops.ll
@@ -0,0 +1,71 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: test01.cl:2:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
+
+; CHECK: test01.cl:3:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
+
+; CHECK: test01.cl:4:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
+
+; CHECK: test01.cl:5:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_endpgm
+
+; Function Attrs: nounwind
+define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+ %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
+ store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
+ %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !24
+ store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25
+ ret void, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!3}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test01.cl", directory: "/home/kzhuravl/Lightning/testing")
+!2 = !{}
+!3 = !{void (i32 addrspace(1)*)* @test, !4, !5, !6, !7, !8}
+!4 = !{!"kernel_arg_addr_space", i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none"}
+!6 = !{!"kernel_arg_type", !"int*"}
+!7 = !{!"kernel_arg_base_type", !"int*"}
+!8 = !{!"kernel_arg_type_qual", !""}
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0 (trunk 268929)"}
+!12 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null, !15}
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32)
+!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15)
+!18 = !DIExpression()
+!19 = !DILocation(line: 1, column: 30, scope: !12)
+!20 = !DILocation(line: 2, column: 3, scope: !12)
+!21 = !DILocation(line: 2, column: 8, scope: !12)
+!22 = !DILocation(line: 3, column: 3, scope: !12)
+!23 = !DILocation(line: 3, column: 8, scope: !12)
+!24 = !DILocation(line: 4, column: 3, scope: !12)
+!25 = !DILocation(line: 4, column: 8, scope: !12)
+!26 = !DILocation(line: 5, column: 1, scope: !12)
diff --git a/test/CodeGen/AMDGPU/debugger-reserve-regs.ll b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
new file mode 100644
index 000000000000..d30bb20bb03a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
@@ -0,0 +1,62 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s
+; CHECK: reserved_vgpr_first = {{[0-9]+}}
+; CHECK-NEXT: reserved_vgpr_count = 4
+; CHECK: ReservedVGPRFirst: {{[0-9]+}}
+; CHECK-NEXT: ReservedVGPRCount: 4
+
+; Function Attrs: nounwind
+define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+ call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+ %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
+ store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
+ %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !24
+ store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25
+ ret void, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!3}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test01.cl", directory: "/home/kzhuravl/Lightning/testing")
+!2 = !{}
+!3 = !{void (i32 addrspace(1)*)* @test, !4, !5, !6, !7, !8}
+!4 = !{!"kernel_arg_addr_space", i32 1}
+!5 = !{!"kernel_arg_access_qual", !"none"}
+!6 = !{!"kernel_arg_type", !"int*"}
+!7 = !{!"kernel_arg_base_type", !"int*"}
+!8 = !{!"kernel_arg_type_qual", !""}
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0 (trunk 268929)"}
+!12 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null, !15}
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32)
+!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15)
+!18 = !DIExpression()
+!19 = !DILocation(line: 1, column: 30, scope: !12)
+!20 = !DILocation(line: 2, column: 3, scope: !12)
+!21 = !DILocation(line: 2, column: 8, scope: !12)
+!22 = !DILocation(line: 3, column: 3, scope: !12)
+!23 = !DILocation(line: 3, column: 8, scope: !12)
+!24 = !DILocation(line: 4, column: 3, scope: !12)
+!25 = !DILocation(line: 4, column: 8, scope: !12)
+!26 = !DILocation(line: 5, column: 1, scope: !12)
diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll
index da8e91454b98..723e3c27ad6b 100644
--- a/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -1,36 +1,62 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; FUNC-LABEL: {{^}}test_kernel:
+; GCN-LABEL: {{^}}test_default_si:
+; GCN: FloatMode: 192
+; GCN: IeeeMode: 0
+define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; DEFAULT: FloatMode: 192
-; DEFAULT: IeeeMode: 0
+; GCN-LABEL: {{^}}test_default_vi:
+; GCN: FloatMode: 192
+; GCN: IeeeMode: 0
+define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; FP64-DENORMAL: FloatMode: 192
-; FP64-DENORMAL: IeeeMode: 0
+; GCN-LABEL: {{^}}test_f64_denormals:
+; GCN: FloatMode: 192
+; GCN: IeeeMode: 0
+define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; FP32-DENORMAL: FloatMode: 48
-; FP32-DENORMAL: IeeeMode: 0
+; GCN-LABEL: {{^}}test_f32_denormals:
+; GCN: FloatMode: 48
+; GCN: IeeeMode: 0
+define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; BOTH-DENORMAL: FloatMode: 240
-; BOTH-DENORMAL: IeeeMode: 0
+; GCN-LABEL: {{^}}test_f32_f64_denormals:
+; GCN: FloatMode: 240
+; GCN: IeeeMode: 0
+define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
-; NO-DENORMAL: FloatMode: 0
-; NO-DENORMAL: IeeeMode: 0
-define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+; GCN-LABEL: {{^}}test_no_denormals:
+; GCN: FloatMode: 0
+; GCN: IeeeMode: 0
+define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
store float 0.0, float addrspace(1)* %out0
store double 0.0, double addrspace(1)* %out1
ret void
}
+
+attributes #0 = { nounwind "target-cpu"="tahiti" }
+attributes #1 = { nounwind "target-cpu"="fiji" }
+attributes #2 = { nounwind "target-features"="+fp64-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals" }
+attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
+attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
new file mode 100644
index 000000000000..f7f953c144da
--- /dev/null
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -0,0 +1,428 @@
+# RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
+--- |
+ define void @test0() { ret void }
+ define void @test1() { ret void }
+ define void @test2() { ret void }
+ define void @test3() { ret void }
+ define void @test4() { ret void }
+ define void @test5() { ret void }
+ define void @loop0() { ret void }
+ define void @loop1() { ret void }
+ define void @loop2() { ret void }
+...
+---
+# Combined use/def transfer check, the basics.
+# CHECK-LABEL: name: test0
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: S_NOP 0, implicit-def %1
+# CHECK: S_NOP 0, implicit-def dead %2
+# CHECK: %3 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %3:sub0
+# CHECK: S_NOP 0, implicit %3:sub1
+# CHECK: S_NOP 0, implicit undef %3:sub2
+# CHECK: %4 = COPY %3:sub0_sub1
+# CHECK: %5 = COPY undef %3:sub2_sub3
+# CHECK: S_NOP 0, implicit %4:sub0
+# CHECK: S_NOP 0, implicit %4:sub1
+# CHECK: S_NOP 0, implicit undef %5:sub0
+name: test0
+isSSA: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sreg_32 }
+ - { id: 3, class: sreg_128 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ S_NOP 0, implicit-def %1
+ S_NOP 0, implicit-def %2
+ %3 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub3
+ S_NOP 0, implicit %3:sub0
+ S_NOP 0, implicit %3:sub1
+ S_NOP 0, implicit %3:sub2
+ %4 = COPY %3:sub0_sub1
+ %5 = COPY %3:sub2_sub3
+ S_NOP 0, implicit %4:sub0
+ S_NOP 0, implicit %4:sub1
+ S_NOP 0, implicit %5:sub0
+...
+---
+# Check defined lanes transfer; includes checking for some special cases like
+# undef operands or IMPLICIT_DEF definitions.
+# CHECK-LABEL: name: test1
+# CHECK: %0 = REG_SEQUENCE %sgpr0, {{[0-9]+}}, %sgpr0, {{[0-9]+}}
+# CHECK: %1 = INSERT_SUBREG %0, %sgpr1, {{[0-9]+}}
+# CHECK: %2 = INSERT_SUBREG %0:sub2_sub3, %sgpr42, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %1:sub0
+# CHECK: S_NOP 0, implicit undef %1:sub1
+# CHECK: S_NOP 0, implicit %1:sub2
+# CHECK: S_NOP 0, implicit %1:sub3
+# CHECK: S_NOP 0, implicit %2:sub0
+# CHECK: S_NOP 0, implicit undef %2:sub1
+
+# CHECK: %3 = IMPLICIT_DEF
+# CHECK: %4 = INSERT_SUBREG %0, undef %3, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit undef %4:sub0
+# CHECK: S_NOP 0, implicit undef %4:sub1
+# CHECK: S_NOP 0, implicit %4:sub2
+# CHECK: S_NOP 0, implicit undef %4:sub3
+
+# CHECK: %5 = EXTRACT_SUBREG %0, {{[0-9]+}}
+# CHECK: %6 = EXTRACT_SUBREG %5, {{[0-9]+}}
+# CHECK: %7 = EXTRACT_SUBREG %5, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %5
+# CHECK: S_NOP 0, implicit %6
+# CHECK: S_NOP 0, implicit undef %7
+
+# CHECK: %8 = IMPLICIT_DEF
+# CHECK: %9 = EXTRACT_SUBREG undef %8, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit undef %9
+
+# CHECK: %10 = EXTRACT_SUBREG undef %0, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit undef %10
+name: test1
+isSSA: true
+registers:
+ - { id: 0, class: sreg_128 }
+ - { id: 1, class: sreg_128 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: sreg_32 }
+ - { id: 4, class: sreg_128 }
+ - { id: 5, class: sreg_64 }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_64 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+body: |
+ bb.0:
+ %0 = REG_SEQUENCE %sgpr0, %subreg.sub0, %sgpr0, %subreg.sub2
+ %1 = INSERT_SUBREG %0, %sgpr1, %subreg.sub3
+ %2 = INSERT_SUBREG %0:sub2_sub3, %sgpr42, %subreg.sub0
+ S_NOP 0, implicit %1:sub0
+ S_NOP 0, implicit %1:sub1
+ S_NOP 0, implicit %1:sub2
+ S_NOP 0, implicit %1:sub3
+ S_NOP 0, implicit %2:sub0
+ S_NOP 0, implicit %2:sub1
+
+ %3 = IMPLICIT_DEF
+ %4 = INSERT_SUBREG %0, %3, %subreg.sub0
+ S_NOP 0, implicit %4:sub0
+ S_NOP 0, implicit %4:sub1
+ S_NOP 0, implicit %4:sub2
+ S_NOP 0, implicit %4:sub3
+
+ %5 = EXTRACT_SUBREG %0, %subreg.sub0_sub1
+ %6 = EXTRACT_SUBREG %5, %subreg.sub0
+ %7 = EXTRACT_SUBREG %5, %subreg.sub1
+ S_NOP 0, implicit %5
+ S_NOP 0, implicit %6
+ S_NOP 0, implicit %7
+
+ %8 = IMPLICIT_DEF
+ %9 = EXTRACT_SUBREG %8, %subreg.sub1
+ S_NOP 0, implicit %9
+
+ %10 = EXTRACT_SUBREG undef %0, %subreg.sub2_sub3
+ S_NOP 0, implicit %10
+...
+---
+# Check used lanes transfer; includes checking for some special cases like
+# undef operands.
+# CHECK-LABEL: name: test2
+# CHECK: S_NOP 0, implicit-def dead %0
+# CHECK: S_NOP 0, implicit-def %1
+# CHECK: S_NOP 0, implicit-def %2
+# CHECK: %3 = REG_SEQUENCE undef %0, {{[0-9]+}}, %1, {{[0-9]+}}, %2, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %3:sub1
+# CHECK: S_NOP 0, implicit %3:sub3
+
+# CHECK: S_NOP 0, implicit-def %4
+# CHECK: S_NOP 0, implicit-def dead %5
+# CHECK: %6 = REG_SEQUENCE %4, {{[0-9]+}}, undef %5, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %6
+
+# CHECK: S_NOP 0, implicit-def dead %7
+# CHECK: S_NOP 0, implicit-def %8
+# CHECK: %9 = INSERT_SUBREG undef %7, %8, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %9:sub2
+
+# CHECK: S_NOP 0, implicit-def %10
+# CHECK: S_NOP 0, implicit-def dead %11
+# CHECK: %12 = INSERT_SUBREG %10, undef %11, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %12:sub3
+
+# CHECK: S_NOP 0, implicit-def %13
+# CHECK: S_NOP 0, implicit-def dead %14
+# CHECK: %15 = REG_SEQUENCE %13, {{[0-9]+}}, undef %14, {{[0-9]+}}
+# CHECK: %16 = EXTRACT_SUBREG %15, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %16:sub1
+
+name: test2
+isSSA: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: sreg_128 }
+ - { id: 4, class: sreg_32 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_128 }
+ - { id: 8, class: sreg_64 }
+ - { id: 9, class: sreg_128 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: sreg_64 }
+ - { id: 12, class: sreg_128 }
+ - { id: 13, class: sreg_64 }
+ - { id: 14, class: sreg_64 }
+ - { id: 15, class: sreg_128 }
+ - { id: 16, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ S_NOP 0, implicit-def %1
+ S_NOP 0, implicit-def %2
+ %3 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2_sub3
+ S_NOP 0, implicit %3:sub1
+ S_NOP 0, implicit %3:sub3
+
+ S_NOP 0, implicit-def %4
+ S_NOP 0, implicit-def %5
+ %6 = REG_SEQUENCE %4, %subreg.sub0, undef %5, %subreg.sub1
+ S_NOP 0, implicit %6
+
+ S_NOP 0, implicit-def %7
+ S_NOP 0, implicit-def %8
+ %9 = INSERT_SUBREG %7, %8, %subreg.sub2_sub3
+ S_NOP 0, implicit %9:sub2
+
+ S_NOP 0, implicit-def %10
+ S_NOP 0, implicit-def %11
+ %12 = INSERT_SUBREG %10, %11, %subreg.sub0_sub1
+ S_NOP 0, implicit %12:sub3
+
+ S_NOP 0, implicit-def %13
+ S_NOP 0, implicit-def %14
+ %15 = REG_SEQUENCE %13, %subreg.sub0_sub1, %14, %subreg.sub2_sub3
+ %16 = EXTRACT_SUBREG %15, %subreg.sub0_sub1
+ S_NOP 0, implicit %16:sub1
+...
+---
+# Check that copies to physregs use all lanes and copies from physregs define
+# all lanes, so we should not get a dead/undef flag here.
+# CHECK-LABEL: name: test3
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: %vcc = COPY %0
+# CHECK: %1 = COPY %vcc
+# CHECK: S_NOP 0, implicit %1
+name: test3
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ %vcc = COPY %0
+
+ %1 = COPY %vcc
+ S_NOP 0, implicit %1
+...
+---
+# Check that implicit-def/kill do not count as def/uses.
+# CHECK-LABEL: name: test4
+# CHECK: S_NOP 0, implicit-def dead %0
+# CHECK: KILL undef %0
+# CHECK: %1 = IMPLICIT_DEF
+# CHECK: S_NOP 0, implicit undef %1
+name: test4
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ KILL %0
+
+ %1 = IMPLICIT_DEF
+ S_NOP 0, implicit %1
+...
+---
+# Check that unused inputs are marked as undef, even if the vreg itself is
+# used.
+# CHECK-LABEL: name: test5
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: %1 = REG_SEQUENCE undef %0, {{[0-9]+}}, %0, {{[0-9]+}}
+# CHECK: S_NOP 0, implicit %1:sub1
+name: test5
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_64 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def %0
+ %1 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
+ S_NOP 0, implicit %1:sub1
+...
+---
+# Check "optimistic" dataflow fixpoint in phi-loops.
+# CHECK-LABEL: name: loop0
+# CHECK: bb.0:
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: S_NOP 0, implicit-def dead %1
+# CHECK: S_NOP 0, implicit-def dead %2
+# CHECK: %3 = REG_SEQUENCE %0, {{[0-9]+}}, undef %1, {{[0-9]+}}, undef %2, {{[0-9]+}}
+
+# CHECK: bb.1:
+# CHECK: %4 = PHI %3, %bb.0, %5, %bb.1
+
+# CHECK: bb.2:
+# CHECK: S_NOP 0, implicit %4:sub0
+# CHECK: S_NOP 0, implicit undef %4:sub3
+name: loop0
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sreg_32 }
+ - { id: 3, class: sreg_128 }
+ - { id: 4, class: sreg_128 }
+ - { id: 5, class: sreg_128 }
+body: |
+ bb.0:
+ successors: %bb.1
+ S_NOP 0, implicit-def %0
+ S_NOP 0, implicit-def %1
+ S_NOP 0, implicit-def %2
+ %3 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ %4 = PHI %3, %bb.0, %5, %bb.1
+
+ ; let's swizzle some lanes around for fun...
+ %5 = REG_SEQUENCE %4:sub0, %subreg.sub0, %4:sub2, %subreg.sub1, %4:sub1, %subreg.sub2, %4:sub3, %subreg.sub3
+
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit %4:sub0
+ S_NOP 0, implicit %4:sub3
+...
+---
+# Check a loop that needs to be traversed multiple times to reach the fixpoint
+# for the used lanes. The example reads the sub3 lane at the end; however, with
+# each loop iteration one more lane should get marked as we cycle the sublanes
+# along. Sublanes sub0, sub1 and sub3 are rotated in the loop, so only sub2
+# should be dead.
+# CHECK-LABEL: name: loop1
+# CHECK: bb.0:
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: S_NOP 0, implicit-def %1
+# CHECK: S_NOP 0, implicit-def dead %2
+# CHECK: S_NOP 0, implicit-def %3
+# CHECK: %4 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}, %3, {{[0-9]+}}
+
+# CHECK: bb.1:
+# CHECK: %5 = PHI %4, %bb.0, %6, %bb.1
+
+# CHECK: %6 = REG_SEQUENCE %5:sub1, {{[0-9]+}}, %5:sub3, {{[0-9]+}}, undef %5:sub2, {{[0-9]+}}, %5:sub0, {{[0-9]+}}
+
+# CHECK: bb.2:
+# CHECK: S_NOP 0, implicit %6:sub3
+name: loop1
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sreg_32 }
+ - { id: 3, class: sreg_32 }
+ - { id: 4, class: sreg_128 }
+ - { id: 5, class: sreg_128 }
+ - { id: 6, class: sreg_128 }
+body: |
+ bb.0:
+ successors: %bb.1
+ S_NOP 0, implicit-def %0
+ S_NOP 0, implicit-def %1
+ S_NOP 0, implicit-def dead %2
+ S_NOP 0, implicit-def %3
+ %4 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ %5 = PHI %4, %bb.0, %6, %bb.1
+
+ ; rotate lanes, but skip sub2 lane...
+ %6 = REG_SEQUENCE %5:sub1, %subreg.sub0, %5:sub3, %subreg.sub1, %5:sub2, %subreg.sub2, %5:sub0, %subreg.sub3
+
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit %6:sub3
+...
+---
+# Similar to the loop1 test, but check for the fixpoint of defined lanes.
+# Lanes are rotated between sub0, sub2 and sub3, so only sub1 should be dead/undef.
+# CHECK-LABEL: name: loop2
+# CHECK: bb.0:
+# CHECK: S_NOP 0, implicit-def %0
+# CHECK: %1 = REG_SEQUENCE %0, {{[0-9]+}}
+
+# CHECK: bb.1:
+# CHECK: %2 = PHI %1, %bb.0, %3, %bb.1
+
+# CHECK: %3 = REG_SEQUENCE %2:sub3, {{[0-9]+}}, undef %2:sub1, {{[0-9]+}}, %2:sub0, {{[0-9]+}}, %2:sub2, {{[0-9]+}}
+
+# CHECK: bb.2:
+# CHECK: S_NOP 0, implicit %2:sub0
+# CHECK: S_NOP 0, implicit undef %2:sub1
+# CHECK: S_NOP 0, implicit %2:sub2
+# CHECK: S_NOP 0, implicit %2:sub3
+name: loop2
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_32 }
+ - { id: 1, class: sreg_128 }
+ - { id: 2, class: sreg_128 }
+ - { id: 3, class: sreg_128 }
+body: |
+ bb.0:
+ successors: %bb.1
+ S_NOP 0, implicit-def %0
+ %1 = REG_SEQUENCE %0, %subreg.sub0
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ %2 = PHI %1, %bb.0, %3, %bb.1
+
+ ; rotate subreg lanes, skipping sub1
+ %3 = REG_SEQUENCE %2:sub3, %subreg.sub0, %2:sub1, %subreg.sub1, %2:sub0, %subreg.sub2, %2:sub2, %subreg.sub3
+
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit %2:sub0
+ S_NOP 0, implicit undef %2:sub1
+ S_NOP 0, implicit %2:sub2
+ S_NOP 0, implicit %2:sub3
+...
diff --git a/test/CodeGen/AMDGPU/dot4-folding.ll b/test/CodeGen/AMDGPU/dot4-folding.ll
deleted file mode 100644
index 4df7b63bf98e..000000000000
--- a/test/CodeGen/AMDGPU/dot4-folding.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; Exactly one constant vector can be folded into dot4, which means exactly
-; 4 MOV instructions
-; CHECK: {{^}}main:
-; CHECK: MOV
-; CHECK: MOV
-; CHECK: MOV
-; CHECK: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-; CHECK-NOT: MOV
-
-define void @main(float addrspace(1)* %out) {
-main_body:
- %0 = load <4 x float>, <4 x float> addrspace(8)* null
- %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1)
- %3 = insertelement <4 x float> undef, float %2, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0)
- ret void
-}
-
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index 171883e4c74b..5e1ebfde3e10 100644
--- a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
@@ -7,12 +7,11 @@
; GCN-LABEL: {{^}}reschedule_global_load_lds_store:
; GCN: buffer_load_dword
; GCN: buffer_load_dword
-; GCN: ds_write_b32
-; GCN: ds_write_b32
+; GCN: ds_write2_b32
; GCN: s_endpgm
define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
entry:
- %tid = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx = shl i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %gptr0, i32 %idx
%gep1 = getelementptr i32, i32 addrspace(1)* %gptr1, i32 %idx
@@ -25,7 +24,7 @@ for.body: ; preds = %for.body, %entry
%gptr0.phi = phi i32 addrspace(1)* [ %gep0, %entry ], [ %gep0.inc, %for.body ]
%gptr1.phi = phi i32 addrspace(1)* [ %gep1, %entry ], [ %gep1.inc, %for.body ]
%lptr0.phi = phi i32 addrspace(3)* [ %gep2, %entry ], [ %gep2.inc, %for.body ]
- %lptr1 = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 1
+ %lptr1 = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 2
%val0 = load i32, i32 addrspace(1)* %gep0
store i32 %val0, i32 addrspace(3)* %lptr0.phi
%val1 = load i32, i32 addrspace(1)* %gep1
@@ -42,10 +41,7 @@ exit: ; preds = %for.body, %entry
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index e657991557e3..f461d6978f13 100644
--- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -1,31 +1,31 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare void @llvm.amdgcn.s.barrier() #1
; Function Attrs: nounwind
; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
; CHECK: BB0_1:
; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
-; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], vcc, 4, [[VADDR]]
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]]
+; SI-DAG: v_add_i32_e32 [[VADDR8:v[0-9]+]], vcc, 8, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]]
; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], vcc, 0x84, [[VADDR]]
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]]
; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
-; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1
-; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
; CHECK: s_endpgm
define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
entry:
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%mul = shl nsw i32 %x.i, 1
br label %for.body
@@ -33,16 +33,16 @@ for.body: ; preds = %for.body, %entry
%sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
%offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
%k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
- tail call void @llvm.AMDGPU.barrier.local() #1
+ tail call void @llvm.amdgcn.s.barrier() #1
%arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02
%tmp = load float, float addrspace(3)* %arrayidx, align 4
- %add1 = add nsw i32 %offset.02, 1
+ %add1 = add nsw i32 %offset.02, 2
%arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1
%tmp1 = load float, float addrspace(3)* %arrayidx2, align 4
%add3 = add nsw i32 %offset.02, 32
%arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3
%tmp2 = load float, float addrspace(3)* %arrayidx4, align 4
- %add5 = add nsw i32 %offset.02, 33
+ %add5 = add nsw i32 %offset.02, 34
%arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5
%tmp3 = load float, float addrspace(3)* %arrayidx6, align 4
%add7 = add nsw i32 %offset.02, 64
@@ -67,4 +67,4 @@ for.end: ; preds = %for.body
attributes #0 = { nounwind readnone }
attributes #1 = { convergent nounwind }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/ds-sub-offset.ll b/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 7d6eddb01993..16fb019ae0f3 100644
--- a/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -1,7 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-declare void @llvm.AMDGPU.barrier.local() #2
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
@lds.obj = addrspace(3) global [256 x i32] undef, align 4
@@ -12,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
define void @write_ds_sub0_offset0_global() #0 {
entry:
- %x.i = call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
%sub1 = sub i32 0, %x.i
%tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
%arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
@@ -26,7 +25,7 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535
define void @add_x_shl_neg_to_sub_max_offset() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 65535, %shl
@@ -41,7 +40,7 @@ define void @add_x_shl_neg_to_sub_max_offset() #1 {
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
; GCN: ds_write_b8 [[NEG]], [[K]]{{$}}
define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 65536, %shl
@@ -60,7 +59,7 @@ define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}}
; GCN: s_endpgm
define void @add_x_shl_neg_to_sub_multi_use() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add0 = add i32 123, %shl
@@ -82,7 +81,7 @@ define void @add_x_shl_neg_to_sub_multi_use() #1 {
; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}}
; GCN: s_endpgm
define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 123, %shl
@@ -97,7 +96,7 @@ define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255
define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 1019, %shl
@@ -111,7 +110,7 @@ define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]]
; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
%neg = sub i32 0, %x.i
%shl = shl i32 %neg, 2
%add = add i32 1020, %shl
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
index 5170d9c82712..6e30cff9609d 100644
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -13,7 +13,7 @@
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2_f32(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -32,7 +32,7 @@ define void @simple_read2_f32(float addrspace(1)* %out) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 255
@@ -50,7 +50,7 @@ define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
; SI: s_endpgm
define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 257
@@ -67,7 +67,7 @@ define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 0
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -99,7 +99,7 @@ define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 0
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -109,7 +109,7 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
%val1 = load float, float addrspace(3)* %arrayidx1, align 4
%sum.0 = fadd float %val0, %val1
- call void @llvm.AMDGPU.barrier.local() #2
+ call void @llvm.amdgcn.s.barrier() #2
%idx.2 = add nsw i32 %tid.x, 11
%arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
@@ -134,7 +134,7 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -171,7 +171,7 @@ define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
; SI: ds_read_b32
; SI: s_endpgm
define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
%gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
@@ -197,7 +197,7 @@ define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float ad
; SI: ds_read_b32
; SI: s_endpgm
define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
%index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
%gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
@@ -220,7 +220,7 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f
; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
; SI: s_endpgm
define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
%ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
%x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
@@ -244,7 +244,7 @@ define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -262,7 +262,7 @@ define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -281,7 +281,7 @@ define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
; SI-NOT: ds_read2_b32
; SI: s_endpgm
define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 1
%add.x = add nsw i32 %x.i, 8
@@ -297,7 +297,7 @@ define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %
; SI-NOT: ds_read2_b32
; SI: s_endpgm
define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 2
%add.x = add nsw i32 %x.i, 8
@@ -316,7 +316,7 @@ define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrs
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2_f64(double addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
@@ -332,7 +332,7 @@ define void @simple_read2_f64(double addrspace(1)* %out) #0 {
; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
; SI: s_endpgm
define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 255
@@ -350,7 +350,7 @@ define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
; SI: s_endpgm
define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 257
@@ -368,7 +368,7 @@ define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
; SI: s_endpgm
define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 7
@@ -438,8 +438,8 @@ define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
- %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
- %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+ %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
+ %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
%arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
%tmp16 = load float, float addrspace(3)* %arrayidx44, align 4
%add47 = add nsw i32 %x.i, 1
@@ -494,20 +494,20 @@ define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in)
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
+declare void @llvm.amdgcn.s.barrier() #2
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index d362c46bbf96..57e190e0cca0 100644
--- a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -8,9 +8,8 @@
; SI-LABEL: {{^}}offset_order:
-; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:2
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
define void @offset_order(float addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 0061aaf2cdbd..9d8375d64037 100644
--- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -13,7 +13,7 @@
; CI: buffer_store_dwordx2 [[RESULT]]
; CI: s_endpgm
define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
%val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
%out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
@@ -27,7 +27,7 @@ define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out)
; CI: buffer_store_dwordx2 [[RESULT]]
; CI: s_endpgm
define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
%val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
%out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i
@@ -44,7 +44,7 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
; CI: buffer_store_dword v[[ADD2]]
; CI: s_endpgm
define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
%elt0 = extractelement <4 x float> %val0, i32 0
@@ -69,7 +69,7 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
; CI: buffer_store_dword v[[ADD1]]
; CI: s_endpgm
define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
%val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
%elt0 = extractelement <3 x float> %val0, i32 0
@@ -85,17 +85,11 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
}
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8:
-; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-
-; FIXME: These moves shouldn't be necessary, it should be able to
-; store the same register if offset1 was the non-zero offset.
-
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
+; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: buffer_store_dwordx4 [[REG_ZW]]
; CI: s_endpgm
define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
%out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
@@ -104,13 +98,11 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out)
}
; CI-LABEL: {{^}}simple_read2_v4f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
+; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI: buffer_store_dwordx4 [[REG_ZW]]
; CI: s_endpgm
define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
%val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
%out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i
@@ -120,17 +112,13 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
; FIXME: Extra moves shuffling superregister
; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
+; CI-DAG: ds_read2_b64 [[VEC_HI:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: ds_read2_b64 [[VEC_LO:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
+; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
; CI: s_endpgm
define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
%val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
%out.gep = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %out, i32 %x.i
@@ -140,25 +128,18 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
; FIXME: Extra moves shuffling superregister
; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}}
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
-; CI: v_mov_b32
-; CI: v_mov_b32
-
+; CI-DAG: ds_read2_b64 [[VEC0_3:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b64 [[VEC4_7:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: ds_read2_b64 [[VEC8_11:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:4 offset1:5{{$}}
+; CI-DAG: ds_read2_b64 [[VEC12_15:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:6 offset1:7{{$}}
; CI: s_waitcnt lgkmcnt(0)
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
-; CI: buffer_store_dwordx4
+; CI-DAG: buffer_store_dwordx4 [[VEC0_3]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
+; CI-DAG: buffer_store_dwordx4 [[VEC4_7]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
+; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
+; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; CI: s_endpgm
define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
%val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
%out.gep = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %out, i32 %x.i
@@ -173,7 +154,7 @@ define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
; CI: s_endpgm
define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
@@ -196,7 +177,7 @@ define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspa
; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
; CI: s_endpgm
define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
%arrayidx2 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 2
@@ -218,20 +199,11 @@ define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspa
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
-
-; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
+declare i32 @llvm.amdgcn.workitem.id.y() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
index 4a0571ea16f2..7a8a206033ba 100644
--- a/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -11,7 +11,7 @@
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 64
@@ -30,7 +30,7 @@ define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -44,13 +44,13 @@ define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(
}
; SI-LABEL: @simple_read2st64_f32_max_offset
-; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:255 offset1:1
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -65,12 +65,12 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
; SI-LABEL: @simple_read2st64_f32_over_max_offset
; SI-NOT: ds_read2st64_b32
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
-; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
+; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
+; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
; SI: s_endpgm
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -87,7 +87,7 @@ define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, floa
; SI-NOT: ds_read2st64_b32
; SI: s_endpgm
define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 63
@@ -103,7 +103,7 @@ define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
; SI-NOT: ds_read2st64_b32
; SI: s_endpgm
define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -123,7 +123,7 @@ define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 64
@@ -142,7 +142,7 @@ define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -162,7 +162,7 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
; SI: s_endpgm
define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 64
@@ -176,13 +176,13 @@ define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspac
; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
; SI-LABEL: @simple_read2st64_f64_max_offset
-; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:127 offset1:4
; SI: s_waitcnt lgkmcnt(0)
-; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}, v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
; SI: s_endpgm
define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 256
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -197,12 +197,12 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
; SI-LABEL: @simple_read2st64_f64_over_max_offset
; SI-NOT: ds_read2st64_b64
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
; SI: s_endpgm
define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -219,7 +219,7 @@ define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, dou
; SI-NOT: ds_read2st64_b64
; SI: s_endpgm
define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%add.x.0 = add nsw i32 %x.i, 64
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -240,7 +240,7 @@ define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double
; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
; SI: s_endpgm
define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
%val0 = load double, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
@@ -253,16 +253,10 @@ define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, do
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
index 9d3a293f3b89..45fcc01b2add 100644
--- a/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -10,7 +10,7 @@
; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
%val = load float, float addrspace(1)* %in.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
@@ -28,11 +28,11 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %val0 = load float, float addrspace(1)* %in.gep.0, align 4
- %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
store float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -47,11 +47,11 @@ define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
- %val0 = load float, float addrspace(1)* %in0.gep, align 4
- %val1 = load float, float addrspace(1)* %in1.gep, align 4
+ %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -66,11 +66,11 @@ define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
- %val0 = load float, float addrspace(1)* %in0.gep, align 4
- %val1 = load float, float addrspace(1)* %in1.gep, align 4
+ %val0 = load volatile float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load volatile float, float addrspace(1)* %in1.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
store float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 8
@@ -87,11 +87,11 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
- %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
- %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
+ %val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
+ %val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
%val0.0 = extractelement <2 x float> %val0, i32 0
%val1.1 = extractelement <2 x float> %val1, i32 1
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
@@ -108,7 +108,7 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
%val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
%val0 = extractelement <2 x float> %val, i32 0
@@ -127,7 +127,7 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
%val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
%val0 = extractelement <4 x float> %val, i32 0
@@ -147,11 +147,11 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; SI: s_endpgm
define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %val0 = load float, float addrspace(1)* %in.gep.0, align 4
- %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
store float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 255
@@ -165,7 +165,7 @@ define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
; SI: s_endpgm
define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
%val0 = load float, float addrspace(1)* %in0.gep, align 4
@@ -179,11 +179,11 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
}
; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
-; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
%val0 = load float, float addrspace(1)* %in0.gep, align 4
@@ -209,11 +209,11 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
}
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
-; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
%val0 = load float, float addrspace(1)* %in0.gep, align 4
@@ -244,7 +244,7 @@ define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, f
; SI: ds_write_b32
; SI: s_endpgm
define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
%in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
%val0 = load float, float addrspace(1)* %in0.gep, align 4
@@ -271,7 +271,7 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
@@ -289,7 +289,7 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
; SI: s_endpgm
define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
@@ -307,11 +307,11 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl
; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
- %val0 = load double, double addrspace(1)* %in.gep.0, align 8
- %val1 = load double, double addrspace(1)* %in.gep.1, align 8
+ %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
+ %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
store double %val0, double addrspace(3)* %arrayidx0, align 8
%add.x = add nsw i32 %x.i, 8
@@ -372,8 +372,8 @@ define void @store_misaligned64_constant_large_offsets() {
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
- %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+ %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
+ %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
%val = load float, float addrspace(1)* %in
%arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
store float %val, float addrspace(3)* %arrayidx44, align 4
@@ -411,7 +411,7 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
; CI: s_endpgm
define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
%val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
%out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
@@ -420,20 +420,17 @@ define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out,
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
-
-; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
+declare i32 @llvm.amdgcn.workitem.id.y() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }
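Note on the volatile loads added above: the paired loads feeding each ds_write2 are now marked volatile so later optimizations cannot fold the two adjacent loads into one wider load, which would defeat the write-pairing the test is checking. A minimal sketch of the pattern these tests now use (function name hypothetical, not from the commit):

; Two adjacent global loads kept separate via volatile, then stored to LDS
; a small offset apart so the backend can still form a single ds_write2_b32.
define void @write2_pair_sketch(float addrspace(1)* %in, float addrspace(3)* %lds) {
  %gep1 = getelementptr float, float addrspace(1)* %in, i32 1
  %a = load volatile float, float addrspace(1)* %in, align 4
  %b = load volatile float, float addrspace(1)* %gep1, align 4
  %lds8 = getelementptr float, float addrspace(3)* %lds, i32 8
  store float %a, float addrspace(3)* %lds, align 4
  store float %b, float addrspace(3)* %lds8, align 4
  ret void
}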
diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll
index 5a1024ccf6d7..872e77361406 100644
--- a/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -8,7 +8,7 @@
; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
; SI: s_endpgm
define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
%val = load float, float addrspace(1)* %in.gep, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
@@ -26,11 +26,11 @@ define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float add
; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
; SI: s_endpgm
define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %val0 = load float, float addrspace(1)* %in.gep.0, align 4
- %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
%add.x.0 = add nsw i32 %x.i, 128
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
store float %val0, float addrspace(3)* %arrayidx0, align 4
@@ -47,11 +47,11 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add
; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; SI: s_endpgm
define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %val0 = load float, float addrspace(1)* %in.gep.0, align 4
- %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %val0 = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load volatile float, float addrspace(1)* %in.gep.1, align 4
%arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
store float %val0, float addrspace(3)* %arrayidx0, align 4
%add.x = add nsw i32 %x.i, 16320
@@ -67,11 +67,11 @@ define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, fl
; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
; SI: s_endpgm
define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
%in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
- %val0 = load double, double addrspace(1)* %in.gep.0, align 8
- %val1 = load double, double addrspace(1)* %in.gep.1, align 8
+ %val0 = load volatile double, double addrspace(1)* %in.gep.0, align 8
+ %val1 = load volatile double, double addrspace(1)* %in.gep.1, align 8
%add.x.0 = add nsw i32 %x.i, 256
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
store double %val0, double addrspace(3)* %arrayidx0, align 8
@@ -86,7 +86,7 @@ define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, d
; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
; SI: s_endpgm
define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
- %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
%val = load double, double addrspace(1)* %in.gep, align 8
%arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
@@ -98,20 +98,11 @@ define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C,
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
-
-; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }
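Throughout these LDS write2/write2st64 tests the legacy r600 thread-id intrinsics are replaced by their amdgcn equivalents; both families take no arguments, return i32, and are declared nounwind readnone. A brief sketch of the correspondence, as used in the updated tests:

; llvm.r600.read.tidig.{x,y}  ->  llvm.amdgcn.workitem.id.{x,y}
; llvm.r600.read.tgid.{x,y}   ->  llvm.amdgcn.workgroup.id.{x,y}
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
attributes #1 = { nounwind readnone }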
diff --git a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index f4409a0984a9..580dc00f935e 100644
--- a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -2,7 +2,7 @@
; RUN: not llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s
; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported dynamic alloca in test_dynamic_stackalloc
+; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
%alloca = alloca i32, i32 %n
diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll
index 90af6782c4b4..c62e57c6eaac 100644
--- a/test/CodeGen/AMDGPU/elf.ll
+++ b/test/CodeGen/AMDGPU/elf.ll
@@ -22,9 +22,9 @@
; CONFIG-NEXT: .long 45096
; TYPICAL-NEXT: .long 0
; TONGA-NEXT: .long 576
-; CONFIG: .align 256
+; CONFIG: .p2align 8
; CONFIG: test:
-define void @test(i32 %p) #0 {
+define amdgpu_ps void @test(i32 %p) {
%i = add i32 %p, 2
%r = bitcast i32 %i to float
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
@@ -32,5 +32,3 @@ define void @test(i32 %p) #0 {
}
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" } ; Pixel Shader
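The elf.ll change above reflects the move from the string attribute "ShaderType"="0" to dedicated calling conventions: a pixel shader entry point is now expressed directly on the definition as amdgpu_ps, so the attribute group is no longer needed. A minimal sketch (function name hypothetical):

define amdgpu_ps void @ps_sketch(i32 %p) {
  ret void
}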
diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll
index 267a323c5063..c67095438ee5 100644
--- a/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -12,8 +12,9 @@
; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
; CHECK-NOT: s_or_b64 exec, exec
; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
-define void @test(i32 addrspace(1)* %out, i32 %cond) {
+define void @test(i32 addrspace(1)* %out) {
entry:
+ %cond = call i32 @llvm.r600.read.tidig.x() #0
%tmp0 = icmp eq i32 %cond, 0
br i1 %tmp0, label %if, label %loop
@@ -32,3 +33,7 @@ done:
store i32 %inc, i32 addrspace(1)* %tmp3
ret void
}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }
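In endcf-loop-header.ll the branch condition is now derived from the work-item id rather than a uniform kernel argument, so the branch is divergent and the exec-mask handling under test (no stray s_or_b64 exec in the loop header) is actually exercised. A minimal sketch of the idiom, reduced from the test (function name hypothetical):

declare i32 @llvm.r600.read.tidig.x() #0

define void @divergent_cond_sketch(i32 addrspace(1)* %out) {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %cond = icmp eq i32 %tid, 0
  br i1 %cond, label %then, label %done
then:
  store i32 1, i32 addrspace(1)* %out
  br label %done
done:
  ret void
}

attributes #0 = { readnone }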
diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll
index 294c3a9c6782..3f27370d7037 100644
--- a/test/CodeGen/AMDGPU/extload-private.ll
+++ b/test/CodeGen/AMDGPU/extload-private.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i8_sext_private:
; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
@@ -39,7 +39,7 @@ entry:
define void @load_i16_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16
- %tmp1 = load i16, i16* %tmp0
+ %tmp1 = load volatile i16, i16* %tmp0
%tmp2 = zext i16 %tmp1 to i32
store i32 %tmp2, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll
index 662eb7a9716b..2cb5cf0422dc 100644
--- a/test/CodeGen/AMDGPU/extload.ll
+++ b/test/CodeGen/AMDGPU/extload.ll
@@ -1,53 +1,65 @@
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FIXME: This seems to not ever actually become an extload
+; FUNC-LABEL: {{^}}global_anyext_load_i8:
+; GCN: buffer_load_dword v{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}
-; FUNC-LABEL: {{^}}anyext_load_i8:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
; EG: VTX_READ_32 [[VAL]]
-
-define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
- %load = load i32, i32 addrspace(1)* %cast, align 1
+ %load = load i32, i32 addrspace(1)* %cast
%x = bitcast i32 %load to <4 x i8>
%castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)*
- store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1
+ store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut
ret void
}
-; FUNC-LABEL: {{^}}anyext_load_i16:
+; FUNC-LABEL: {{^}}global_anyext_load_i16:
+; GCN: buffer_load_dword v{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}
+
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
; EG: VTX_READ_32 [[VAL]]
-
-define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
- %load = load i32, i32 addrspace(1)* %cast, align 1
+ %load = load i32, i32 addrspace(1)* %cast
%x = bitcast i32 %load to <2 x i16>
%castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)*
- store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1
+ store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut
ret void
}
-; FUNC-LABEL: {{^}}anyext_load_lds_i8:
+; FUNC-LABEL: {{^}}local_anyext_load_i8:
+; GCN: ds_read_b32 v{{[0-9]+}}
+; GCN: ds_write_b32 v{{[0-9]+}}
+
; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
; EG: LDS_WRITE * [[VAL]]
-define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
- %load = load i32, i32 addrspace(3)* %cast, align 1
+ %load = load i32, i32 addrspace(3)* %cast
%x = bitcast i32 %load to <4 x i8>
%castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)*
- store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1
+ store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut
ret void
}
-; FUNC-LABEL: {{^}}anyext_load_lds_i16:
+; FUNC-LABEL: {{^}}local_anyext_load_i16:
+; GCN: ds_read_b32 v{{[0-9]+}}
+; GCN: ds_write_b32 v{{[0-9]+}}
+
; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
; EG: LDS_WRITE * [[VAL]]
-define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
- %load = load i32, i32 addrspace(3)* %cast, align 1
+ %load = load i32, i32 addrspace(3)* %cast
%x = bitcast i32 %load to <2 x i16>
%castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)*
- store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1
+ store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut
ret void
}
diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
new file mode 100644
index 000000000000..4edff152e66e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -0,0 +1,126 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+ <4 x i32> addrspace(1)* noalias %out1,
+ i32 addrspace(1)* noalias %out2,
+ i32 addrspace(1)* %in) {
+ %elt0 = load volatile i32, i32 addrspace(1)* %in
+ %elt1 = load volatile i32, i32 addrspace(1)* %in
+ %elt2 = load volatile i32, i32 addrspace(1)* %in
+ %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+ %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+ %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+ %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+ %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+ store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+ store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out1
+
+ %extract0 = extractelement <4 x i32> %vec3, i32 0
+ %extract1 = extractelement <4 x i32> %vec3, i32 1
+ %extract2 = extractelement <4 x i32> %vec3, i32 2
+ %extract3 = extractelement <4 x i32> %vec3, i32 3
+
+ store volatile i32 %extract0, i32 addrspace(1)* %out2
+ store volatile i32 %extract1, i32 addrspace(1)* %out2
+ store volatile i32 %extract2, i32 addrspace(1)* %out2
+ store volatile i32 %extract3, i32 addrspace(1)* %out2
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_extract_uses_v4i32:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+ <4 x i32> addrspace(1)* noalias %out1,
+ i32 addrspace(1)* noalias %out2,
+ i32 addrspace(1)* %in) {
+ %elt0 = load volatile i32, i32 addrspace(1)* %in
+ %elt1 = load volatile i32, i32 addrspace(1)* %in
+ %elt2 = load volatile i32, i32 addrspace(1)* %in
+ %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+ %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+ %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+ %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+ %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+ %extract0 = extractelement <4 x i32> %vec3, i32 0
+ %extract1 = extractelement <4 x i32> %vec3, i32 1
+ %extract2 = extractelement <4 x i32> %vec3, i32 2
+ %extract3 = extractelement <4 x i32> %vec3, i32 3
+
+ %op0 = add i32 %extract0, 3
+ %op1 = sub i32 %extract1, 9
+ %op2 = xor i32 %extract2, 1231412
+ %op3 = and i32 %extract3, 258233412312
+
+ store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+
+ store volatile i32 %op0, i32 addrspace(1)* %out2
+ store volatile i32 %op1, i32 addrspace(1)* %out2
+ store volatile i32 %op2, i32 addrspace(1)* %out2
+ store volatile i32 %op3, i32 addrspace(1)* %out2
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+
+; GCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
+ <4 x i32> addrspace(1)* noalias %out1,
+ i64 addrspace(1)* noalias %out2,
+ i32 addrspace(1)* %in) {
+ %elt0 = load volatile i32, i32 addrspace(1)* %in
+ %elt1 = load volatile i32, i32 addrspace(1)* %in
+ %elt2 = load volatile i32, i32 addrspace(1)* %in
+ %elt3 = load volatile i32, i32 addrspace(1)* %in
+
+ %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
+ %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
+ %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
+ %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
+
+ %bc.vec3 = bitcast <4 x i32> %vec3 to <2 x i64>
+ store <2 x i64> %bc.vec3, <2 x i64> addrspace(1)* %out0
+
+ %extract0 = extractelement <2 x i64> %bc.vec3, i32 0
+ %extract1 = extractelement <2 x i64> %bc.vec3, i32 1
+
+ store volatile i64 %extract0, i64 addrspace(1)* %out2
+ store volatile i64 %extract1, i64 addrspace(1)* %out2
+
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
new file mode 100644
index 000000000000..d0b19c825ee9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx2
+; GCN: buffer_store_dwordx2
+define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+ %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in
+ %elt = extractelement <3 x double> %ld, i32 2
+ store volatile double %elt, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
+define void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
+ %dynelt = extractelement <3 x double> %foo, i32 %elt
+ store volatile double %dynelt, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
+define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
+ %dynelt = extractelement <4 x double> %foo, i32 %elt
+ store volatile double %dynelt, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
new file mode 100644
index 000000000000..e012cb07163b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v2i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) #0 {
+ %p0 = extractelement <2 x i16> %foo, i32 0
+ %p1 = extractelement <2 x i16> %foo, i32 1
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v3i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
+ %p0 = extractelement <3 x i16> %foo, i32 0
+ %p1 = extractelement <3 x i16> %foo, i32 2
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v4i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
+ %p0 = extractelement <4 x i16> %foo, i32 0
+ %p1 = extractelement <4 x i16> %foo, i32 2
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+
+; GCN: buffer_store_short
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
+ %p0 = extractelement <3 x i16> %foo, i32 %idx
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+ store i16 %p0, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+
+; GCN: buffer_store_short
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
+ %p0 = extractelement <4 x i16> %foo, i32 %idx
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+ store i16 %p0, i16 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index e32559139623..0a51c39f026f 100644
--- a/test/CodeGen/AMDGPU/extract-vector-elt-i64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; How the replacement of i64 stores with v2i32 stores resulted in
; breaking other users of the bitcast if they already existed
@@ -7,7 +8,7 @@
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) nounwind {
+define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
%vec = bitcast i64 %val to <2 x i32>
%elt0 = extractelement <2 x i32> %vec, i32 0
%elt1 = extractelement <2 x i32> %vec, i32 1
@@ -18,8 +19,8 @@ define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspa
ret void
}
-
-define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) nounwind {
+; GCN-LABEL: {{^}}extract_vector_elt_v2i64:
+define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
%p0 = extractelement <2 x i64> %foo, i32 0
%p1 = extractelement <2 x i64> %foo, i32 1
%out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
@@ -28,16 +29,34 @@ define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) no
ret void
}
-define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) nounwind {
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
+define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
%dynelt = extractelement <2 x i64> %foo, i32 %elt
store volatile i64 %dynelt, i64 addrspace(1)* %out
ret void
}
-define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) nounwind {
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
+define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
%load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
%or = or <2 x i64> %load, %arst
%dynelt = extractelement <2 x i64> %or, i32 %elt
store volatile i64 %dynelt, i64 addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
+define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
+ %dynelt = extractelement <3 x i64> %foo, i32 %elt
+ store volatile i64 %dynelt, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
+define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
+ %dynelt = extractelement <4 x i64> %foo, i32 %elt
+ store volatile i64 %dynelt, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
new file mode 100644
index 000000000000..9005bfa07c2b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -0,0 +1,151 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
+ %p0 = extractelement <1 x i8> %foo, i32 0
+ store i8 %p0, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v2i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
+ %p0 = extractelement <2 x i8> %foo, i32 0
+ %p1 = extractelement <2 x i8> %foo, i32 1
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v3i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
+ %p0 = extractelement <3 x i8> %foo, i32 0
+ %p1 = extractelement <3 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v4i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
+ %p0 = extractelement <4 x i8> %foo, i32 0
+ %p1 = extractelement <4 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v8i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
+ %p0 = extractelement <8 x i8> %foo, i32 0
+ %p1 = extractelement <8 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v16i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
+ %p0 = extractelement <16 x i8> %foo, i32 0
+ %p1 = extractelement <16 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v32i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
+ %p0 = extractelement <32 x i8> %foo, i32 0
+ %p1 = extractelement <32 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v64i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
+ %p0 = extractelement <64 x i8> %foo, i32 0
+ %p1 = extractelement <64 x i8> %foo, i32 2
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p1, i8 addrspace(1)* %out
+ store i8 %p0, i8 addrspace(1)* %out1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
+ %p0 = extractelement <3 x i8> %foo, i32 %idx
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p0, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
+ %p0 = extractelement <4 x i8> %foo, i32 %idx
+ %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+ store i8 %p0, i8 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll
deleted file mode 100644
index c7572efc6f5b..000000000000
--- a/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}extract_vector_elt_v2i16:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_short
-; SI: buffer_store_short
-define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
- %p0 = extractelement <2 x i16> %foo, i32 0
- %p1 = extractelement <2 x i16> %foo, i32 1
- %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
- store i16 %p1, i16 addrspace(1)* %out, align 2
- store i16 %p0, i16 addrspace(1)* %out1, align 2
- ret void
-}
-
-; FUNC-LABEL: {{^}}extract_vector_elt_v4i16:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_short
-; SI: buffer_store_short
-define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
- %p0 = extractelement <4 x i16> %foo, i32 0
- %p1 = extractelement <4 x i16> %foo, i32 2
- %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
- store i16 %p1, i16 addrspace(1)* %out, align 2
- store i16 %p0, i16 addrspace(1)* %out1, align 2
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
new file mode 100644
index 000000000000..e160c20a03a0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure the add and load are reduced to 32-bits even with the
+; bitcast to vector.
+; GCN-LABEL: {{^}}bitcast_int_to_vector_extract_0:
+; GCN-DAG: s_load_dword [[B:s[0-9]+]]
+; GCN-DAG: buffer_load_dword [[A:v[0-9]+]]
+; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
+; GCN: buffer_store_dword [[ADD]]
+define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+ %a = load i64, i64 addrspace(1)* %in
+ %add = add i64 %a, %b
+ %val.bc = bitcast i64 %add to <2 x i32>
+ %extract = extractelement <2 x i32> %val.bc, i32 0
+ store i32 %extract, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}bitcast_fp_to_vector_extract_0:
+; GCN: buffer_load_dwordx2
+; GCN: v_add_f64
+; GCN: buffer_store_dword v
+define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
+ %a = load double, double addrspace(1)* %in
+ %add = fadd double %a, %b
+ %val.bc = bitcast double %add to <2 x i32>
+ %extract = extractelement <2 x i32> %val.bc, i32 0
+ store i32 %extract, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}bitcast_int_to_fpvector_extract_0:
+; GCN: buffer_load_dwordx2
+; GCN: v_add_i32
+; GCN: buffer_store_dword
+define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+ %a = load i64, i64 addrspace(1)* %in
+ %add = add i64 %a, %b
+ %val.bc = bitcast i64 %add to <2 x float>
+ %extract = extractelement <2 x float> %val.bc, i32 0
+ store float %extract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_extract_volatile_load_extract0:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_store_dword v
+define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %elt0 = extractelement <4 x i32> %vec, i32 0
+ store i32 %elt0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_extract_volatile_load_extract2:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_store_dword v
+
+define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %elt2 = extractelement <4 x i32> %vec, i32 2
+ store i32 %elt2, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_store_dword v
+define void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+entry:
+ %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %eltN = extractelement <4 x i32> %vec, i32 %idx
+ store i32 %eltN, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll
index 3c6136c1a7bd..db8093047a36 100644
--- a/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare double @fabs(double) readnone
declare double @llvm.fabs.f64(double) readnone
@@ -11,7 +11,7 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
; SI: v_and_b32
; SI: s_endpgm
define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tidext = sext i32 %tid to i64
%gep = getelementptr double, double addrspace(1)* %in, i64 %tidext
%val = load double, double addrspace(1)* %gep, align 8
diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll
index 5fac328c5981..11436794ac98 100644
--- a/test/CodeGen/AMDGPU/fadd.ll
+++ b/test/CodeGen/AMDGPU/fadd.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; FUNC-LABEL: {{^}}fadd_f32:
; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll
new file mode 100644
index 000000000000..981d88dfe94e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -0,0 +1,351 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare float @llvm.canonicalize.f32(float) #0
+declare double @llvm.canonicalize.f64(double) #0
+
+; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
+; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
+; GCN: buffer_store_dword [[REG]]
+define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
+ %val = load float, float addrspace(1)* %out
+ %canonicalized = call float @llvm.canonicalize.f32(float %val)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
+; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
+; GCN: buffer_store_dword [[REG]]
+define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float %val)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
+; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
+ store float %canonicalized, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2 [[REG]]
+define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
+ %val = load double, double addrspace(1)* %out
+ %canonicalized = call double @llvm.canonicalize.f64(double %val)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2 [[REG]]
+define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double %val)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double -0.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double 1.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double -1.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double 16.0)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
+ %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
+ store double %canonicalized, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll
index c8ef5b101c4d..fb5853b808e4 100644
--- a/test/CodeGen/AMDGPU/fceil64.ll
+++ b/test/CodeGen/AMDGPU/fceil64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
@@ -12,11 +12,11 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; FUNC-LABEL: {{^}}fceil_f64:
; CI: v_ceil_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: s_lshr_b64
-; SI: s_not_b64
-; SI: s_and_b64
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
+; SI-DAG: s_not_b64
+; SI-DAG: s_and_b64
; SI-DAG: cmp_gt_i32
; SI-DAG: cndmask_b32
; SI-DAG: cndmask_b32
@@ -25,8 +25,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; SI-DAG: cndmask_b32
; SI-DAG: v_cmp_lt_f64
; SI-DAG: v_cmp_lg_f64
-; SI: s_and_b64
-; SI: v_cndmask_b32
+; SI-DAG: v_cndmask_b32
; SI: v_cndmask_b32
; SI: v_add_f64
; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 3d8c55993089..738a35fb3b89 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -13,8 +13,8 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
-; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
-; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
; GCN: s_endpgm
define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll
index 7c022e38c808..3343b681b9fe 100644
--- a/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -4,8 +4,8 @@
; COMMON-LABEL: {{^}}fdiv_f64:
-; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0
-; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0
+; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]]
@@ -31,8 +31,8 @@
; COMMON: s_endpgm
define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind {
%gep.1 = getelementptr double, double addrspace(1)* %in, i32 1
- %num = load double, double addrspace(1)* %in
- %den = load double, double addrspace(1)* %gep.1
+ %num = load volatile double, double addrspace(1)* %in
+ %den = load volatile double, double addrspace(1)* %gep.1
%result = fdiv double %num, %den
store double %result, double addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 7cbf87336399..4021233e7785 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -1,19 +1,32 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; These tests check that fdiv is expanded correctly and also test that the
; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
; instruction groups.
+; These tests check fdiv using unsafe_fp_math, coarse fp div, and IEEE754 fp div.
+
; FUNC-LABEL: {{^}}fdiv_f32:
-; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
-; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
+
+; I754-DAG: v_div_scale_f32
+; I754-DAG: v_rcp_f32
+; I754-DAG: v_fma_f32
+; I754-DAG: v_mul_f32
+; I754-DAG: v_fma_f32
+; I754-DAG: v_div_fixup_f32
define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fdiv float %a, %b
@@ -21,7 +34,37 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fdiv fast float %a, %b
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fdiv arcp float %a, %b
+ store float %0, float addrspace(1)* %out
+ ret void
+}
; FUNC-LABEL: {{^}}fdiv_v2f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
@@ -29,10 +72,22 @@ entry:
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
+
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_fixup_f32
+; I754: v_div_fixup_f32
define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fdiv <2 x float> %a, %b
@@ -40,6 +95,50 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fdiv fast <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_rcp_f32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fdiv arcp <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}fdiv_v4f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -50,6 +149,15 @@ entry:
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
@@ -58,6 +166,19 @@ entry:
; SI-DAG: v_mul_f32
; SI-DAG: v_rcp_f32
; SI-DAG: v_mul_f32
+
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_scale_f32
+; I754: v_div_fixup_f32
+; I754: v_div_fixup_f32
+; I754: v_div_fixup_f32
+; I754: v_div_fixup_f32
define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
@@ -66,3 +187,75 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+ %a = load <4 x float>, <4 x float> addrspace(1) * %in
+ %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
+ %result = fdiv fast <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_rcp_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+; UNSAFE-FP: v_mul_f32_e32
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+ %a = load <4 x float>, <4 x float> addrspace(1) * %in
+ %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
+ %result = fdiv arcp <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fetch-limits.r600.ll b/test/CodeGen/AMDGPU/fetch-limits.r600.ll
index e7160ef5d726..5cb0c616d15f 100644
--- a/test/CodeGen/AMDGPU/fetch-limits.r600.ll
+++ b/test/CodeGen/AMDGPU/fetch-limits.r600.ll
@@ -7,42 +7,50 @@
; CHECK: Fetch clause
; CHECK: Fetch clause
-define void @fetch_limits_r600() #0 {
+define amdgpu_ps void @fetch_limits_r600() {
entry:
- %0 = load <4 x float>, <4 x float> addrspace(8)* null
- %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
- %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
- %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
- %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
- %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
- %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
- %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
- %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
- %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
- %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
- %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
- %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
- %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
- %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
- %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
- %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
- %a = fadd <4 x float> %res0, %res1
- %b = fadd <4 x float> %res2, %res3
- %c = fadd <4 x float> %res4, %res5
- %d = fadd <4 x float> %res6, %res7
- %e = fadd <4 x float> %res8, %a
-
+ %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+ %tmp1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %tmp3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %tmp4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %tmp6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %tmp7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %tmp9 = shufflevector <4 x float> %tmp, <4 x float> %tmp, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp10 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp11 = shufflevector <4 x float> %tmp1, <4 x float> %tmp1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp12 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp13 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp14 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp15 = shufflevector <4 x float> %tmp3, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp16 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp17 = shufflevector <4 x float> %tmp4, <4 x float> %tmp4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp18 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp19 = shufflevector <4 x float> %tmp5, <4 x float> %tmp5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp20 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp21 = shufflevector <4 x float> %tmp6, <4 x float> %tmp6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp22 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp23 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp24 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp25 = shufflevector <4 x float> %tmp8, <4 x float> %tmp8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp26 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %a = fadd <4 x float> %tmp10, %tmp12
+ %b = fadd <4 x float> %tmp14, %tmp16
+ %c = fadd <4 x float> %tmp18, %tmp20
+ %d = fadd <4 x float> %tmp22, %tmp24
+ %e = fadd <4 x float> %tmp26, %a
%bc = fadd <4 x float> %b, %c
%de = fadd <4 x float> %d, %e
-
%bcde = fadd <4 x float> %bc, %de
-
- call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %bcde, i32 0, i32 1)
ret void
}
-attributes #0 = { "ShaderType"="0" } ; Pixel Shader
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fetch-limits.r700+.ll b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
index acaea2aa7943..d8f7c0daa8de 100644
--- a/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
+++ b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
@@ -16,7 +16,7 @@
; CHECK: Fetch clause
; CHECK: Fetch clause
-define void @fetch_limits_r700() #0 {
+define amdgpu_ps void @fetch_limits_r700() {
entry:
%0 = load <4 x float>, <4 x float> addrspace(8)* null
%1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
@@ -35,47 +35,63 @@ entry:
%14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
%15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
%16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
- %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
- %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
- %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
- %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
- %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
- %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
- %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
- %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
- %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
- %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1)
- %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1)
- %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1)
- %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1)
- %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1)
- %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1)
- %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1)
- %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1)
- %a = fadd <4 x float> %res0, %res1
- %b = fadd <4 x float> %res2, %res3
- %c = fadd <4 x float> %res4, %res5
- %d = fadd <4 x float> %res6, %res7
- %e = fadd <4 x float> %res8, %res9
- %f = fadd <4 x float> %res10, %res11
- %g = fadd <4 x float> %res12, %res13
- %h = fadd <4 x float> %res14, %res15
- %i = fadd <4 x float> %res16, %a
-
+ %17 = shufflevector <4 x float> %0, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %18 = call <4 x float> @llvm.r600.tex(<4 x float> %17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %19 = shufflevector <4 x float> %1, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %20 = call <4 x float> @llvm.r600.tex(<4 x float> %19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %21 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %22 = call <4 x float> @llvm.r600.tex(<4 x float> %21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %23 = shufflevector <4 x float> %3, <4 x float> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %24 = call <4 x float> @llvm.r600.tex(<4 x float> %23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %25 = shufflevector <4 x float> %4, <4 x float> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %26 = call <4 x float> @llvm.r600.tex(<4 x float> %25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %27 = shufflevector <4 x float> %5, <4 x float> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %28 = call <4 x float> @llvm.r600.tex(<4 x float> %27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %29 = shufflevector <4 x float> %6, <4 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %30 = call <4 x float> @llvm.r600.tex(<4 x float> %29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %31 = shufflevector <4 x float> %7, <4 x float> %7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %32 = call <4 x float> @llvm.r600.tex(<4 x float> %31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %33 = shufflevector <4 x float> %8, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %34 = call <4 x float> @llvm.r600.tex(<4 x float> %33, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %35 = shufflevector <4 x float> %9, <4 x float> %9, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %36 = call <4 x float> @llvm.r600.tex(<4 x float> %35, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %37 = shufflevector <4 x float> %10, <4 x float> %10, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %38 = call <4 x float> @llvm.r600.tex(<4 x float> %37, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %39 = shufflevector <4 x float> %11, <4 x float> %11, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %40 = call <4 x float> @llvm.r600.tex(<4 x float> %39, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %41 = shufflevector <4 x float> %12, <4 x float> %12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %42 = call <4 x float> @llvm.r600.tex(<4 x float> %41, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %43 = shufflevector <4 x float> %13, <4 x float> %13, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %44 = call <4 x float> @llvm.r600.tex(<4 x float> %43, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %45 = shufflevector <4 x float> %14, <4 x float> %14, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %46 = call <4 x float> @llvm.r600.tex(<4 x float> %45, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %47 = shufflevector <4 x float> %15, <4 x float> %15, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %48 = call <4 x float> @llvm.r600.tex(<4 x float> %47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %49 = shufflevector <4 x float> %16, <4 x float> %16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %50 = call <4 x float> @llvm.r600.tex(<4 x float> %49, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %a = fadd <4 x float> %18, %20
+ %b = fadd <4 x float> %22, %24
+ %c = fadd <4 x float> %26, %28
+ %d = fadd <4 x float> %30, %32
+ %e = fadd <4 x float> %34, %36
+ %f = fadd <4 x float> %38, %40
+ %g = fadd <4 x float> %42, %44
+ %h = fadd <4 x float> %46, %48
+ %i = fadd <4 x float> %50, %a
%bc = fadd <4 x float> %b, %c
%de = fadd <4 x float> %d, %e
%fg = fadd <4 x float> %f, %g
%hi = fadd <4 x float> %h, %i
-
%bcde = fadd <4 x float> %bc, %de
%fghi = fadd <4 x float> %fg, %hi
-
%bcdefghi = fadd <4 x float> %bcde, %fghi
- call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1)
ret void
}
-attributes #0 = { "ShaderType"="0" } ; Pixel Shader
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll
index 45f8382c3929..ea708a2b7bbd 100644
--- a/test/CodeGen/AMDGPU/ffloor.f64.ll
+++ b/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
@@ -13,8 +13,8 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
; FUNC-LABEL: {{^}}ffloor_f64:
; CI: v_floor_f64_e32
; SI: v_fract_f64_e32
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64
@@ -28,8 +28,8 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}ffloor_f64_neg:
; CI: v_floor_f64_e64
; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]]
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
@@ -44,8 +44,8 @@ define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}ffloor_f64_neg_abs:
; CI: v_floor_f64_e64
; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]|
-; SI: v_min_f64
-; SI: v_cmp_class_f64_e64
+; SI-DAG: v_min_f64
+; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
@@ -67,15 +67,16 @@ define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
ret void
}
-; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
-; FIXME-CI: v_floor_f64_e32
-; FIXME-CI: v_floor_f64_e32
-; FIXME-CI: v_floor_f64_e32
-; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
-; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
-; store <3 x double> %y, <3 x double> addrspace(1)* %out
-; ret void
-; }
+; FUNC-LABEL: {{^}}ffloor_v3f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI-NOT: v_floor_f64_e32
+define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+ %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
+ store <3 x double> %y, <3 x double> addrspace(1)* %out
+ ret void
+}
; FUNC-LABEL: {{^}}ffloor_v4f64:
; CI: v_floor_f64_e32
diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll
index 86e0c07323bb..5ca57fd3d350 100644
--- a/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -17,7 +17,7 @@
; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
-; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
%fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
store i32 %x, i32 addrspace(4)* %fptr, align 4
@@ -127,9 +127,6 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
ret void
}
-declare void @llvm.AMDGPU.barrier.local() #1
-declare i32 @llvm.r600.read.tidig.x() #3
-
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #3 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/flat_atomics.ll b/test/CodeGen/AMDGPU/flat_atomics.ll
new file mode 100644
index 000000000000..7400dbcf8909
--- /dev/null
+++ b/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -0,0 +1,968 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}atomic_add_i32_offset:
+; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_ret_offset:
+; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
+; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
+; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32:
+; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_ret:
+; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_addr64:
+; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64:
+; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_offset:
+; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_ret_offset:
+; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset:
+; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
+; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32:
+; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_ret:
+; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_addr64:
+; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64:
+; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_offset:
+; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset:
+; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset:
+; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
+; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32:
+; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_ret:
+; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_addr64:
+; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64:
+; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_offset:
+; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_ret_offset:
+; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset:
+; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
+; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32:
+; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_ret:
+; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_addr64:
+; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64:
+; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_offset:
+; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset:
+; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset:
+; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
+; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32:
+; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_ret:
+; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_addr64:
+; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64:
+; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_offset:
+; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_ret_offset:
+; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset:
+; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
+; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32:
+; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_ret:
+; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_addr64:
+; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64:
+; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_offset:
+; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset:
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset:
+; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32:
+; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_ret:
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_addr64:
+; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64:
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}}
+define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_offset:
+; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_ret_offset:
+; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset:
+; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
+; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32:
+; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_ret:
+; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_addr64:
+; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64:
+; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
+; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
+; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
+; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
+; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32:
+; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_ret:
+; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_addr64:
+; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
+; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; CMP_SWAP
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
+; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
+; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
+define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ %flag = extractvalue { i32, i1 } %val, 0
+ store i32 %flag, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
+; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
+; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ %flag = extractvalue { i32, i1 } %val, 0
+ store i32 %flag, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32:
+; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret:
+; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
+define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
+ %flag = extractvalue { i32, i1 } %val, 0
+ store i32 %flag, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
+; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
+; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ %flag = extractvalue { i32, i1 } %val, 0
+ store i32 %flag, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_offset:
+; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset:
+; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset:
+; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
+; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32:
+; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_ret:
+; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+entry:
+ %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_addr64:
+; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64:
+; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i32_offset:
+; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4
+ %val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4
+ store i32 %val, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i32:
+; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+entry:
+ %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4
+ store i32 %val, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset:
+; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %val = load atomic i32, i32 addrspace(4)* %gep seq_cst, align 4
+ store i32 %val, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i32_addr64:
+; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
+ %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4
+ store i32 %val, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i32_offset:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
+ store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i32:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
+entry:
+ store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i32_addr64:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
+ store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
new file mode 100644
index 000000000000..0bd6c2dd5b86
--- /dev/null
+++ b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -0,0 +1,975 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}atomic_add_i64_offset:
+; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
+; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
+; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset:
+; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64:
+; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret:
+; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_addr64:
+; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64:
+; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_offset:
+; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
+; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
+; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset:
+; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64:
+; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret:
+; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_addr64:
+; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64:
+; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_offset:
+; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
+; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
+; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset:
+; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64:
+; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret:
+; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
+; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64:
+; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_offset:
+; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
+; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
+; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset:
+; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64:
+; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret:
+; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_addr64:
+; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64:
+; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_offset:
+; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
+; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
+; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset:
+; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64:
+; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret:
+; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
+; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64:
+; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_offset:
+; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
+; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
+; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset:
+; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64:
+; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret:
+; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_addr64:
+; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64:
+; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_offset:
+; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
+; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
+; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
+; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64:
+; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret:
+; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
+; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
+; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_offset:
+; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
+; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
+; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset:
+; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64:
+; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret:
+; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_addr64:
+; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64:
+; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
+; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset:
+; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
+; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64:
+; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_offset:
+; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
+; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
+; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset:
+; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64:
+; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret:
+; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
+; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64:
+; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_offset:
+; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4
+ %val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8
+ store i64 %val, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64:
+; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+entry:
+ %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8
+ store i64 %val, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset:
+; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8
+ store i64 %val, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_addr64:
+; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
+ %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8
+ store i64 %val, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_offset:
+; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64:
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
+define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
+entry:
+ store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset:
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_addr64:
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
+; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset:
+; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
+ %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+entry:
+ %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret:
+; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+entry:
+ %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(4)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
+; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64:
+; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
+ %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(4)* %out2
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/floor.ll b/test/CodeGen/AMDGPU/floor.ll
index c6bfb8567a0f..43e58b942220 100644
--- a/test/CodeGen/AMDGPU/floor.ll
+++ b/test/CodeGen/AMDGPU/floor.ll
@@ -1,15 +1,14 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = call float @floor(float %r0)
%vec = insertelement <4 x float> undef, float %r1, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @floor(float) readonly
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
index 6f3437048ed8..19deefe4d4a5 100644
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0
@@ -14,15 +14,15 @@ declare float @llvm.fma.f32(float, float, float) #0
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%fma = fadd double %mul, %c
@@ -42,7 +42,7 @@ define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addr
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -50,16 +50,16 @@ define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%fma0 = fadd double %mul, %c
%fma1 = fadd double %mul, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -71,15 +71,15 @@ define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%fma = fadd double %c, %mul
@@ -95,15 +95,15 @@ define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addr
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%fma = fsub double %mul, %c
@@ -123,7 +123,7 @@ define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -131,16 +131,16 @@ define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, d
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%fma0 = fsub double %mul, %c
%fma1 = fsub double %mul, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -152,15 +152,15 @@ define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, d
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%fma = fsub double %c, %mul
@@ -180,7 +180,7 @@ define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -188,16 +188,16 @@ define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, d
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%fma0 = fsub double %c, %mul
%fma1 = fsub double %d, %mul
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -209,15 +209,15 @@ define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, d
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
%mul = fmul double %a, %b
%mul.neg = fsub double -0.0, %mul
@@ -238,7 +238,7 @@ define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -246,18 +246,18 @@ define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %o
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%mul.neg = fsub double -0.0, %mul
%fma0 = fsub double %mul.neg, %c
%fma1 = fsub double %mul.neg, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -272,7 +272,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %o
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -280,18 +280,18 @@ define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %o
%gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
+ %a = load volatile double, double addrspace(1)* %gep.0
+ %b = load volatile double, double addrspace(1)* %gep.1
+ %c = load volatile double, double addrspace(1)* %gep.2
+ %d = load volatile double, double addrspace(1)* %gep.3
%mul = fmul double %a, %b
%mul.neg = fsub double -0.0, %mul
%fma0 = fsub double %mul.neg, %c
%fma1 = fsub double %mul, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
+ store volatile double %fma0, double addrspace(1)* %gep.out.0
+ store volatile double %fma1, double addrspace(1)* %gep.out.1
ret void
}
@@ -307,7 +307,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %o
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -315,11 +315,11 @@ define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %
%gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %x = load double, double addrspace(1)* %gep.0
- %y = load double, double addrspace(1)* %gep.1
- %z = load double, double addrspace(1)* %gep.2
- %u = load double, double addrspace(1)* %gep.3
- %v = load double, double addrspace(1)* %gep.4
+ %x = load volatile double, double addrspace(1)* %gep.0
+ %y = load volatile double, double addrspace(1)* %gep.1
+ %z = load volatile double, double addrspace(1)* %gep.2
+ %u = load volatile double, double addrspace(1)* %gep.3
+ %v = load volatile double, double addrspace(1)* %gep.4
%tmp0 = fmul double %u, %v
%tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
@@ -342,7 +342,7 @@ define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
@@ -350,11 +350,11 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %
%gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
- %x = load double, double addrspace(1)* %gep.0
- %y = load double, double addrspace(1)* %gep.1
- %z = load double, double addrspace(1)* %gep.2
- %u = load double, double addrspace(1)* %gep.3
- %v = load double, double addrspace(1)* %gep.4
+ %x = load volatile double, double addrspace(1)* %gep.0
+ %y = load volatile double, double addrspace(1)* %gep.1
+ %z = load volatile double, double addrspace(1)* %gep.2
+ %u = load volatile double, double addrspace(1)* %gep.3
+ %v = load volatile double, double addrspace(1)* %gep.4
%tmp0 = fmul double %u, %v
%tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
@@ -373,8 +373,8 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %
define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
- %x = load float, float addrspace(1)* %in1
- %y = load float, float addrspace(1)* %in2
+ %x = load volatile float, float addrspace(1)* %in1
+ %y = load volatile float, float addrspace(1)* %in2
%a = fadd float %x, 1.0
%m = fmul float %a, %y
store float %m, float addrspace(1)* %out
@@ -386,8 +386,8 @@ define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
- %x = load float, float addrspace(1)* %in1
- %y = load float, float addrspace(1)* %in2
+ %x = load volatile float, float addrspace(1)* %in1
+ %y = load volatile float, float addrspace(1)* %in2
%a = fadd float %x, 1.0
%m = fmul float %y, %a
store float %m, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll
index d6024aa0b4c5..d04a5946b98c 100644
--- a/test/CodeGen/AMDGPU/fma.ll
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -61,7 +61,7 @@ define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)*
}
; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
-; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}}
define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmad.ll b/test/CodeGen/AMDGPU/fmad.ll
index 935e35123f45..9c39bee753be 100644
--- a/test/CodeGen/AMDGPU/fmad.ll
+++ b/test/CodeGen/AMDGPU/fmad.ll
@@ -2,18 +2,16 @@
;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = extractelement <4 x float> %reg0, i32 2
%r3 = fmul float %r0, %r1
%r4 = fadd float %r3, %r2
%vec = insertelement <4 x float> undef, float %r4, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @fabs(float ) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/fmax.ll b/test/CodeGen/AMDGPU/fmax.ll
index d7127f485c74..763040522718 100644
--- a/test/CodeGen/AMDGPU/fmax.ll
+++ b/test/CodeGen/AMDGPU/fmax.ll
@@ -2,16 +2,14 @@
;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fcmp oge float %r0, %r1
%r3 = select i1 %r2, float %r0, float %r1
%vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll
index f78c71b28264..9bbfe1e95c5b 100644
--- a/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -4,9 +4,9 @@
declare double @llvm.maxnum.f64(double, double) nounwind readnone
; SI-LABEL: {{^}}test_fmax3_f64:
-; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
+; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
; SI: buffer_store_dwordx2 [[RESULT]],
@@ -14,9 +14,9 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone
define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
%bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
%cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
- %a = load double, double addrspace(1)* %aptr, align 8
- %b = load double, double addrspace(1)* %bptr, align 8
- %c = load double, double addrspace(1)* %cptr, align 8
+ %a = load volatile double, double addrspace(1)* %aptr, align 8
+ %b = load volatile double, double addrspace(1)* %bptr, align 8
+ %c = load volatile double, double addrspace(1)* %cptr, align 8
%f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone
%f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone
store double %f1, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index c3028a6217d5..c0fde6e97f6f 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -11,9 +11,9 @@ declare float @llvm.maxnum.f32(float, float) nounwind readnone
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
+ %a = load volatile float, float addrspace(1)* %aptr, align 4
+ %b = load volatile float, float addrspace(1)* %bptr, align 4
+ %c = load volatile float, float addrspace(1)* %cptr, align 4
%f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
%f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
store float %f1, float addrspace(1)* %out, align 4
@@ -29,9 +29,9 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
+ %a = load volatile float, float addrspace(1)* %aptr, align 4
+ %b = load volatile float, float addrspace(1)* %bptr, align 4
+ %c = load volatile float, float addrspace(1)* %cptr, align 4
%f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
%f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
store float %f1, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index 828243888ac7..da498caa6b54 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; Make sure we don't try to form FMAX_LEGACY nodes with f64
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; FUNC-LABEL: @test_fmax_legacy_uge_f64
define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -20,7 +20,7 @@ define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmax_legacy_oge_f64
define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -35,7 +35,7 @@ define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmax_legacy_ugt_f64
define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -50,7 +50,7 @@ define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmax_legacy_ogt_f64
define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index d374fb67350c..4a4c92a38a35 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: Should replace unsafe-fp-math with no signed zeros.
@@ -18,8 +18,8 @@ define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp uge float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -38,8 +38,8 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp oge float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -58,8 +58,8 @@ define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ugt float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -78,8 +78,8 @@ define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ogt float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -142,8 +142,8 @@ define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 ad
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ogt float %a, %b
%val = select i1 %cmp, float %a, float %b
diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll
index 3029bd02e4db..a2b33a794d99 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare float @llvm.maxnum.f32(float, float) #0
declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
@@ -207,7 +207,7 @@ define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0
; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; SI: buffer_store_dword [[REG]]
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
@@ -221,7 +221,7 @@ define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0
; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; SI: buffer_store_dword [[REG]]
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll
new file mode 100644
index 000000000000..e66678069130
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmed3.ll
@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.minnum.f64(double, double) #0
+declare double @llvm.maxnum.f64(double, double) #0
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute0_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float 2.0, float %a)
+ %med = call float @llvm.minnum.f32(float 4.0, float %max)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_commute1_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call float @llvm.minnum.f32(float 4.0, float %max)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 4.0)
+ %med = call float @llvm.minnum.f32(float %max, float 2.0)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+ store volatile float %med, float addrspace(1)* %outgep
+ store volatile float %max, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
+; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
+; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
+define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %a = load double, double addrspace(1)* %gep0
+
+ %max = call double @llvm.maxnum.f64(double %a, double 2.0)
+ %med = call double @llvm.minnum.f64(double %max, double 4.0)
+
+ store double %med, double addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
+; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ %max = call float @llvm.maxnum.f32(float %a, float 2.0)
+ %med = call float @llvm.minnum.f32(float %max, float 4.0)
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32:
+; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+
+; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+
+ ; fmax_legacy
+ %cmp0 = fcmp ule float %a, 2.0
+ %max = select i1 %cmp0, float 2.0, float %a
+
+ ; fmin_legacy
+ %cmp1 = fcmp uge float %max, 4.0
+ %med = select i1 %cmp1, float 4.0, float %max
+
+ store float %med, float addrspace(1)* %outgep
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmin.ll b/test/CodeGen/AMDGPU/fmin.ll
index defa8c09638a..d044a7a0542c 100644
--- a/test/CodeGen/AMDGPU/fmin.ll
+++ b/test/CodeGen/AMDGPU/fmin.ll
@@ -2,16 +2,14 @@
;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fcmp uge float %r0, %r1
%r3 = select i1 %r2, float %r1, float %r0
%vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index 0a76699b43e1..2d1facfc3a40 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -12,9 +12,9 @@ declare float @llvm.minnum.f32(float, float) nounwind readnone
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
+ %a = load volatile float, float addrspace(1)* %aptr, align 4
+ %b = load volatile float, float addrspace(1)* %bptr, align 4
+ %c = load volatile float, float addrspace(1)* %cptr, align 4
%f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
%f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
store float %f1, float addrspace(1)* %out, align 4
@@ -30,9 +30,9 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
+ %a = load volatile float, float addrspace(1)* %aptr, align 4
+ %b = load volatile float, float addrspace(1)* %bptr, align 4
+ %c = load volatile float, float addrspace(1)* %cptr, align 4
%f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
%f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
store float %f1, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index e19a48f3f7e2..6982ee0c0cb3 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; FUNC-LABEL: @test_fmin_legacy_f64
define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
@@ -15,7 +15,7 @@ define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double>
; FUNC-LABEL: @test_fmin_legacy_ule_f64
define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -30,7 +30,7 @@ define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmin_legacy_ole_f64
define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -45,7 +45,7 @@ define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmin_legacy_olt_f64
define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -60,7 +60,7 @@ define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspac
; FUNC-LABEL: @test_fmin_legacy_ult_f64
define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index 69a0a520a476..79acd02e6d1f 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -1,16 +1,20 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: Should replace unsafe-fp-math with no signed zeros.
declare i32 @llvm.r600.read.tidig.x() #1
-; FUNC-LABEL: @test_fmin_legacy_f32
+; The two inputs to the instruction are different SGPRs from the same
+; super register, so we can't fold both SGPR operands even though they
+; are both the same register.
+
+; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32:
; EG: MIN *
-; SI-SAFE: v_min_legacy_f32_e64
-; SI-NONAN: v_min_f32_e64
-define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fcmp uge float %r0, %r1
@@ -20,6 +24,23 @@ define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> in
ret void
}
+; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+
+; SI-SAFE-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; SI-NONAN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
+
+define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+ %cmp = fcmp ule float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
; FUNC-LABEL: @test_fmin_legacy_ule_f32
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
@@ -30,8 +51,8 @@ define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ule float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -49,8 +70,8 @@ define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ole float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -68,8 +89,8 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp olt float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -87,8 +108,8 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ult float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -172,8 +193,8 @@ define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 ad
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%cmp = fcmp ole float %a, %b
%val0 = select i1 %cmp, float %a, float %b
diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll
index 4d7b52540d85..04cb01260bc0 100644
--- a/test/CodeGen/AMDGPU/fminnum.ll
+++ b/test/CodeGen/AMDGPU/fminnum.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare float @llvm.minnum.f32(float, float) #0
@@ -206,7 +206,7 @@ define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0
; SI-NOT: v_min_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; SI: buffer_store_dword [[REG]]
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
@@ -220,7 +220,7 @@ define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0
; SI-NOT: v_min_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; SI: buffer_store_dword [[REG]]
; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 1ee92b2f7c08..867c5c252b6c 100644
--- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -36,8 +36,8 @@ define void @multiple_use_fadd_fmac(float addrspace(1)* %out, float %x, float %y
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%mul2 = fmul fast float %x, 2.0
%mad = fadd fast float %mul2, %y
- store float %mul2, float addrspace(1)* %out
- store float %mad, float addrspace(1)* %out.gep.1
+ store volatile float %mul2, float addrspace(1)* %out
+ store volatile float %mad, float addrspace(1)* %out.gep.1
ret void
}
@@ -52,8 +52,8 @@ define void @multiple_use_fadd_fmad(float addrspace(1)* %out, float %x, float %y
%x.abs = call float @llvm.fabs.f32(float %x)
%mul2 = fmul fast float %x.abs, 2.0
%mad = fadd fast float %mul2, %y
- store float %mul2, float addrspace(1)* %out
- store float %mad, float addrspace(1)* %out.gep.1
+ store volatile float %mul2, float addrspace(1)* %out
+ store volatile float %mad, float addrspace(1)* %out.gep.1
ret void
}
@@ -66,8 +66,8 @@ define void @multiple_use_fadd_multi_fmad(float addrspace(1)* %out, float %x, fl
%mul2 = fmul fast float %x.abs, 2.0
%mad0 = fadd fast float %mul2, %y
%mad1 = fadd fast float %mul2, %z
- store float %mad0, float addrspace(1)* %out
- store float %mad1, float addrspace(1)* %out.gep.1
+ store volatile float %mad0, float addrspace(1)* %out
+ store volatile float %mad1, float addrspace(1)* %out.gep.1
ret void
}
@@ -80,7 +80,7 @@ define void @fmul_x2_xn2(float addrspace(1)* %out, float %x, float %y) #0 {
%mul2 = fmul fast float %x, 2.0
%muln2 = fmul fast float %x, -2.0
%mul = fmul fast float %mul2, %muln2
- store float %mul, float addrspace(1)* %out
+ store volatile float %mul, float addrspace(1)* %out
ret void
}
@@ -94,7 +94,7 @@ define void @fmul_x2_xn3(float addrspace(1)* %out, float %x, float %y) #0 {
%mul2 = fmul fast float %x, 2.0
%muln2 = fmul fast float %x, -3.0
%mul = fmul fast float %mul2, %muln2
- store float %mul, float addrspace(1)* %out
+ store volatile float %mul, float addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll
index addc409c9eb1..9064ad3814d6 100644
--- a/test/CodeGen/AMDGPU/fmul.ll
+++ b/test/CodeGen/AMDGPU/fmul.ll
@@ -1,12 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
; FUNC-LABEL: {{^}}fmul_f32:
-; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; GCN: v_mul_f32
-; SI: v_mul_f32
+; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
entry:
%0 = fmul float %a, %b
@@ -14,16 +13,16 @@ entry:
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
+declare float @llvm.r600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
; FUNC-LABEL: {{^}}fmul_v2f32:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-
-; SI: v_mul_f32
-; SI: v_mul_f32
define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fmul <2 x float> %a, %b
@@ -32,15 +31,15 @@ entry:
}
; FUNC-LABEL: {{^}}fmul_v4f32:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_mul_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float>, <4 x float> addrspace(1) * %in
@@ -51,9 +50,9 @@ define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
}
; FUNC-LABEL: {{^}}test_mul_2_k:
-; SI: v_mul_f32
-; SI-NOT: v_mul_f32
-; SI: s_endpgm
+; GCN: v_mul_f32
+; GCN-NOT: v_mul_f32
+; GCN: s_endpgm
define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
%y = fmul float %x, 2.0
%z = fmul float %y, 3.0
@@ -62,10 +61,10 @@ define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
}
; FUNC-LABEL: {{^}}test_mul_2_k_inv:
-; SI: v_mul_f32
-; SI-NOT: v_mul_f32
-; SI-NOT: v_mad_f32
-; SI: s_endpgm
+; GCN: v_mul_f32
+; GCN-NOT: v_mul_f32
+; GCN-NOT: v_mad_f32
+; GCN: s_endpgm
define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
%y = fmul float %x, 3.0
%z = fmul float %y, 2.0
@@ -76,10 +75,10 @@ define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
; There should be three multiplies here; %a should be used twice (once
; negated), not duplicated into mul x, 5.0 and mul x, -5.0.
; FUNC-LABEL: {{^}}test_mul_twouse:
-; SI: v_mul_f32
-; SI: v_mul_f32
-; SI: v_mul_f32
-; SI-NOT: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN-NOT: v_mul_f32
define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
%a = fmul float %x, 5.0
%b = fsub float -0.0, %a
@@ -89,4 +88,4 @@ define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
ret void
}
-attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll
index 600f0cb83578..c80374df4950 100644
--- a/test/CodeGen/AMDGPU/fmuladd.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
declare float @llvm.fmuladd.f32(float, float, float)
declare double @llvm.fmuladd.f64(double, double, double)
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; CHECK-LABEL: {{^}}fmuladd_f32:
@@ -37,13 +37,13 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
store float %r3, float addrspace(1)* %gep.out
@@ -56,13 +56,13 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %
; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
store float %r3, float addrspace(1)* %gep.out
@@ -77,13 +77,13 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %
define void @fadd_a_a_b_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r0 = load float, float addrspace(1)* %gep.0
- %r1 = load float, float addrspace(1)* %gep.1
+ %r0 = load volatile float, float addrspace(1)* %gep.0
+ %r1 = load volatile float, float addrspace(1)* %gep.1
%add.0 = fadd float %r0, %r0
%add.1 = fadd float %add.0, %r1
@@ -99,13 +99,13 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out,
define void @fadd_b_a_a_f32(float addrspace(1)* %out,
float addrspace(1)* %in1,
float addrspace(1)* %in2) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r0 = load float, float addrspace(1)* %gep.0
- %r1 = load float, float addrspace(1)* %gep.1
+ %r0 = load volatile float, float addrspace(1)* %gep.0
+ %r1 = load volatile float, float addrspace(1)* %gep.1
%add.0 = fadd float %r0, %r0
%add.1 = fadd float %r1, %add.0
@@ -119,13 +119,13 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
store float %r3, float addrspace(1)* %gep.out
@@ -139,13 +139,13 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1
; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r1.fneg = fsub float -0.000000e+00, %r1
@@ -161,13 +161,13 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; CHECK: buffer_store_dword [[R2]]
define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r1.fneg = fsub float -0.000000e+00, %r1
@@ -183,13 +183,13 @@ define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1
; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
; CHECK: buffer_store_dword [[RESULT]]
define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%r2.fneg = fsub float -0.000000e+00, %r2
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 8830e8273661..b03f318f4571 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
; FIXME: Check something here. Currently it seems fabs + fneg aren't folded
; into 2 modifiers, although theoretically that should work.
-; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
+; GCN-LABEL: {{^}}fneg_fabs_fadd_f64:
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
%fabs = call double @llvm.fabs.f64(double %x)
%fsub = fsub double -0.000000e+00, %fabs
@@ -24,8 +24,8 @@ define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
+; GCN-LABEL: {{^}}fneg_fabs_fmul_f64:
+; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
%fabs = call double @llvm.fabs.f64(double %x)
%fsub = fsub double -0.000000e+00, %fabs
@@ -34,7 +34,7 @@ define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y)
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
+; GCN-LABEL: {{^}}fneg_fabs_free_f64:
define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fabs = call double @llvm.fabs.f64(double %bc)
@@ -43,9 +43,9 @@ define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64:
+; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fabs = call double @fabs(double %bc)
@@ -54,13 +54,14 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_f64:
-; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
-; SI: s_load_dwordx2
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
-; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
-; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
+; GCN-LABEL: {{^}}fneg_fabs_f64:
+; GCN-DAG: s_load_dwordx2
+; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
+; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
%fabs = call double @llvm.fabs.f64(double %in)
%fsub = fsub double -0.000000e+00, %fabs
@@ -68,11 +69,11 @@ define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-NOT: 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN-LABEL: {{^}}fneg_fabs_v2f64:
+; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
+; GCN-NOT: 0x80000000
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
%fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
@@ -80,13 +81,13 @@ define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in)
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-NOT: 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN-LABEL: {{^}}fneg_fabs_v4f64:
+; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
+; GCN-NOT: 0x80000000
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
%fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll
index aa6df209035b..7627a4d32250 100644
--- a/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -39,7 +39,7 @@ define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double>
; unless the target returns true for isNegFree()
; FUNC-LABEL: {{^}}fneg_free_f64:
-; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}}
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
%bc = bitcast i64 %in to double
%fsub = fsub double 0.0, %bc
diff --git a/test/CodeGen/AMDGPU/fp-classify.ll b/test/CodeGen/AMDGPU/fp-classify.ll
index 4fac5176fac9..b7ffaed70c5a 100644
--- a/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/test/CodeGen/AMDGPU/fp-classify.ll
@@ -1,9 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
-declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
-declare i32 @llvm.r600.read.tidig.x() #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
index 12df6606e8ff..be23e10d7087 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: @fp_to_sint_f64_i32
; SI: v_cvt_i32_f64_e32
@@ -47,7 +47,7 @@ define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %
; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%val = load double, double addrspace(1)* %gep, align 8
%cast = fptosi double %val to i64
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll
index 301a94b4904c..b39aeadc8cce 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
declare float @llvm.fabs.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
index 41bc2a780014..760019ebdc08 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: {{^}}fp_to_uint_i32_f64:
; SI: v_cvt_u32_f64_e32
@@ -47,7 +47,7 @@ define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %
; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%val = load double, double addrspace(1)* %gep, align 8
%cast = fptoui double %val to i64
diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll
index 734a43be2296..ad06bdd90a9f 100644
--- a/test/CodeGen/AMDGPU/fpext.ll
+++ b/test/CodeGen/AMDGPU/fpext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fpext_f32_to_f64:
@@ -18,6 +18,16 @@ define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %
ret void
}
+; FUNC-LABEL: {{^}}fpext_v3f32_to_v3f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) {
+ %result = fpext <3 x float> %in to <3 x double>
+ store <3 x double> %result, <3 x double> addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64:
; SI: v_cvt_f64_f32_e32
; SI: v_cvt_f64_f32_e32
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
new file mode 100644
index 000000000000..68b884363ec5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -0,0 +1,111 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+
+; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
+
+declare double @llvm.fabs.f64(double) #0
+declare double @llvm.floor.f64(double) #0
+
+; FUNC-LABEL: {{^}}fract_f64:
+; SI-DAG: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
+; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]{{\]}}, -v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]{{\]}}, -[[SUB0]]
+
+; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; CI: v_floor_f64_e32 [[FLOORX:v\[[0-9]+:[0-9]+\]]], [[X]]
+; CI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[FLOORX]]
+
+; GCN-UNSAFE: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; GCN-UNSAFE: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
+
+; GCN: buffer_store_dwordx2 [[FRACT]]
+define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+ %x = load double, double addrspace(1)* %src
+ %floor.x = call double @llvm.floor.f64(double %x)
+ %fract = fsub double %x, %floor.x
+ store double %fract, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f64_neg:
+; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
+; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO]]:[[HI]]{{\]}}, -v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO]]:[[HI]]{{\]}}, -[[SUB0]]
+
+; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; CI: v_floor_f64_e64 [[FLOORX:v\[[0-9]+:[0-9]+\]]], -[[X]]
+; CI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]], -[[FLOORX]]
+
+; GCN-UNSAFE: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]]
+
+; GCN: buffer_store_dwordx2 [[FRACT]]
+define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+ %x = load double, double addrspace(1)* %src
+ %neg.x = fsub double -0.0, %x
+ %floor.neg.x = call double @llvm.floor.f64(double %neg.x)
+ %fract = fsub double %neg.x, %floor.neg.x
+ store double %fract, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f64_neg_abs:
+; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
+; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
+; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO]]:[[HI]]{{\]}}|, -v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO]]:[[HI]]{{\]}}|, -[[SUB0]]
+
+; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; CI: v_floor_f64_e64 [[FLOORX:v\[[0-9]+:[0-9]+\]]], -|[[X]]|
+; CI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]|, -[[FLOORX]]
+
+; GCN-UNSAFE: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]|
+
+; GCN: buffer_store_dwordx2 [[FRACT]]
+define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+ %x = load double, double addrspace(1)* %src
+ %abs.x = call double @llvm.fabs.f64(double %x)
+ %neg.abs.x = fsub double -0.0, %abs.x
+ %floor.neg.abs.x = call double @llvm.floor.f64(double %neg.abs.x)
+ %fract = fsub double %neg.abs.x, %floor.neg.abs.x
+ store double %fract, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}multi_use_floor_fract_f64:
+; VI-UNSAFE: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]]
+; VI-UNSAFE-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[X]]
+; VI-UNSAFE-DAG: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
+; VI-UNSAFE: buffer_store_dwordx2 [[FLOOR]]
+; VI-UNSAFE: buffer_store_dwordx2 [[FRACT]]
+define void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+ %x = load double, double addrspace(1)* %src
+ %floor.x = call double @llvm.floor.f64(double %x)
+ %fract = fsub double %x, %floor.x
+ store volatile double %floor.x, double addrspace(1)* %out
+ store volatile double %fract, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll
new file mode 100644
index 000000000000..7d713f483047
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.floor.f32(float) #0
+
+; GCN-LABEL: {{^}}fract_f32:
+; GCN-SAFE: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
+; GCN-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
+
+; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
+
+; GCN: buffer_store_dword [[RESULT]]
+define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+ %x = load float, float addrspace(1)* %src
+ %floor.x = call float @llvm.floor.f32(float %x)
+ %fract = fsub float %x, %floor.x
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fract_f32_neg:
+; GCN-SAFE: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]]
+; GCN-SAFE: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]]
+
+; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
+
+; GCN: buffer_store_dword [[RESULT]]
+define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+ %x = load float, float addrspace(1)* %src
+ %x.neg = fsub float -0.0, %x
+ %floor.x.neg = call float @llvm.floor.f32(float %x.neg)
+ %fract = fsub float %x.neg, %floor.x.neg
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fract_f32_neg_abs:
+; GCN-SAFE: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
+; GCN-SAFE: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]]
+
+; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
+
+; GCN: buffer_store_dword [[RESULT]]
+define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+ %x = load float, float addrspace(1)* %src
+ %abs.x = call float @llvm.fabs.f32(float %x)
+ %neg.abs.x = fsub float -0.0, %abs.x
+ %floor.neg.abs.x = call float @llvm.floor.f32(float %neg.abs.x)
+ %fract = fsub float %neg.abs.x, %floor.neg.abs.x
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}multi_use_floor_fract_f32:
+; GCN-UNSAFE-DAG: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[INPUT:v[0-9]+]]
+; GCN-UNSAFE-DAG: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[INPUT:v[0-9]+]]
+
+; GCN-UNSAFE: buffer_store_dword [[FLOOR]]
+; GCN-UNSAFE: buffer_store_dword [[FRACT]]
+define void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+ %x = load float, float addrspace(1)* %src
+ %floor.x = call float @llvm.floor.f32(float %x)
+ %fract = fsub float %x, %floor.x
+ store volatile float %floor.x, float addrspace(1)* %out
+ store volatile float %fract, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
index f245ef08cb9d..e0fc263294ab 100644
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -5,11 +5,13 @@
; FUNC-LABEL: {{^}}frem_f32:
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
-; GCN-DAG: v_cmp
-; GCN-DAG: v_mul_f32
+; GCN: v_div_scale_f32
+
; GCN: v_rcp_f32_e32
+; GCN: v_fma_f32
; GCN: v_mul_f32_e32
-; GCN: v_mul_f32_e32
+; GCN: v_div_fmas_f32
+; GCN: v_div_fixup_f32
; GCN: v_trunc_f32_e32
; GCN: v_mad_f32
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll
new file mode 100644
index 000000000000..ce0881c329be
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
+; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %r0 = load double, double addrspace(1)* %in
+ %r1 = call double @llvm.sqrt.f64(double %r0)
+ store double %r1, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f64:
+; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 {
+ %r0 = load double, double addrspace(1)* %in
+ %r1 = call double @llvm.sqrt.f64(double %r0)
+ store double %r1, double addrspace(1)* %out
+ ret void
+}
+
+declare double @llvm.sqrt.f64(double %Val) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
index 04101346cdf9..f98cac6ade3a 100644
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -1,29 +1,143 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
-; CHECK: {{^}}fsqrt_f32:
-; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
+; FUNC-LABEL: {{^}}v_safe_fsqrt_f32:
+; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
+define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %r0 = load float, float addrspace(1)* %in
+ %r1 = call float @llvm.sqrt.f32(float %r0)
+ store float %r1, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32:
+; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
+define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 {
+ %r0 = load float, float addrspace(1)* %in
+ %r1 = call float @llvm.sqrt.f32(float %r0)
+ store float %r1, float addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}s_sqrt_f32:
+; GCN: v_sqrt_f32_e32
+
+; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
+define void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %fdiv = call float @llvm.sqrt.f32(float %in)
+ store float %fdiv, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sqrt_v2f32:
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
+define void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+ %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sqrt_v4f32:
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
+define void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+entry:
+ %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+ store <4 x float> %fdiv, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}elim_redun_check_neg0:
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, float addrspace(1)* %out
+ ret void
+}
-define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %r0 = load float, float addrspace(1)* %in
- %r1 = call float @llvm.sqrt.f32(float %r0)
- store float %r1, float addrspace(1)* %out
- ret void
+; FUNC-LABEL: {{^}}elim_redun_check_pos0:
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, float addrspace(1)* %out
+ ret void
}
-; CHECK: {{^}}fsqrt_f64:
-; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: {{^}}elim_redun_check_ult:
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp ult float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, float addrspace(1)* %out
+ ret void
+}
-define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
- %r0 = load double, double addrspace(1)* %in
- %r1 = call double @llvm.sqrt.f64(double %r0)
- store double %r1, double addrspace(1)* %out
- ret void
+; FUNC-LABEL: {{^}}elim_redun_check_v2:
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, <2 x float> addrspace(1)* %out
+ ret void
}
-declare float @llvm.sqrt.f32(float %Val)
-declare double @llvm.sqrt.f64(double %Val)
+; FUNC-LABEL: {{^}}elim_redun_check_v2_ult
+; GCN: v_sqrt_f32_e32
+; GCN: v_sqrt_f32_e32
+; GCN-NOT: v_cndmask
+define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.sqrt.f32(float %in) #0
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll
index 38d573258a5e..3429df33c015 100644
--- a/test/CodeGen/AMDGPU/fsub.ll
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -1,7 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_fsub_f32:
; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
@@ -24,7 +23,7 @@ define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
+declare float @llvm.r600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll
index f34a48e30a86..f1b970a4f5fe 100644
--- a/test/CodeGen/AMDGPU/fsub64.ll
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -47,7 +47,7 @@ define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
}
; SI-LABEL: {{^}}s_fsub_imm_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0
define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
%sub = fsub double 4.0, %a
store double %sub, double addrspace(1)* %out
@@ -55,7 +55,7 @@ define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
}
; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0
define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
%sub = fsub double %a, 4.0
store double %sub, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 83a8ad8901d2..c4138ad79c28 100644
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -24,11 +24,11 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
; CI: v_trunc_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: s_lshr_b64
-; SI: s_not_b64
-; SI: s_and_b64
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
+; SI-DAG: s_not_b64
+; SI-DAG: s_and_b64
; SI-DAG: cmp_gt_i32
; SI-DAG: cndmask_b32
; SI-DAG: cndmask_b32
diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll
index edc08609a8aa..1beeab65ade3 100644
--- a/test/CodeGen/AMDGPU/ftrunc.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s
declare float @llvm.trunc.f32(float) nounwind readnone
declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll
index bc5f031cd4a2..0f2fc836a245 100644
--- a/test/CodeGen/AMDGPU/global-constant.ll
+++ b/test/CodeGen/AMDGPU/global-constant.ll
@@ -12,7 +12,7 @@
; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], readonly
; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], 0
; NOHSA: .text
-; HSA: .hsatext
+; HSA: .text
; GCN: readonly:
; GCN: readonly2:
define void @main(i32 %index, float addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/global-extload-i1.ll b/test/CodeGen/AMDGPU/global-extload-i1.ll
deleted file mode 100644
index bd9557d730fb..000000000000
--- a/test/CodeGen/AMDGPU/global-extload-i1.ll
+++ /dev/null
@@ -1,302 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; FIXME: Evergreen broken
-
-; FUNC-LABEL: {{^}}zextload_global_i1_to_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %a = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i1_to_i32:
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %a = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32:
-; SI: s_endpgm
-define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
- %ext = zext <1 x i1> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32:
-; SI: s_endpgm
-define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
- %ext = sext <1 x i1> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32:
-; SI: s_endpgm
-define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
- %ext = zext <2 x i1> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32:
-; SI: s_endpgm
-define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
- %ext = sext <2 x i1> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32:
-; SI: s_endpgm
-define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
- %ext = zext <4 x i1> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32:
-; SI: s_endpgm
-define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
- %ext = sext <4 x i1> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32:
-; SI: s_endpgm
-define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
- %ext = zext <8 x i1> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32:
-; SI: s_endpgm
-define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
- %ext = sext <8 x i1> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32:
-; SI: s_endpgm
-define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
- %ext = zext <16 x i1> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32:
-; SI: s_endpgm
-define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
- %ext = sext <16 x i1> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32:
-; XSI: s_endpgm
-; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-; %ext = zext <32 x i1> %load to <32 x i32>
-; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32:
-; XSI: s_endpgm
-; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-; %ext = sext <32 x i1> %load to <32 x i32>
-; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32:
-; XSI: s_endpgm
-; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-; %ext = zext <64 x i1> %load to <64 x i32>
-; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32:
-; XSI: s_endpgm
-; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-; %ext = sext <64 x i1> %load to <64 x i32>
-; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; FUNC-LABEL: {{^}}zextload_global_i1_to_i64:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
-; SI: buffer_store_dwordx2
-define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %a = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i1_to_i64:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
-; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %a = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64:
-; SI: s_endpgm
-define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
- %ext = zext <1 x i1> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64:
-; SI: s_endpgm
-define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
- %ext = sext <1 x i1> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64:
-; SI: s_endpgm
-define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
- %ext = zext <2 x i1> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64:
-; SI: s_endpgm
-define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
- %ext = sext <2 x i1> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64:
-; SI: s_endpgm
-define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
- %ext = zext <4 x i1> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64:
-; SI: s_endpgm
-define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
- %ext = sext <4 x i1> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64:
-; SI: s_endpgm
-define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
- %ext = zext <8 x i1> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64:
-; SI: s_endpgm
-define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
- %ext = sext <8 x i1> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64:
-; SI: s_endpgm
-define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
- %ext = zext <16 x i1> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64:
-; SI: s_endpgm
-define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
- %ext = sext <16 x i1> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64:
-; XSI: s_endpgm
-; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-; %ext = zext <32 x i1> %load to <32 x i64>
-; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64:
-; XSI: s_endpgm
-; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
-; %ext = sext <32 x i1> %load to <32 x i64>
-; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64:
-; XSI: s_endpgm
-; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-; %ext = zext <64 x i1> %load to <64 x i64>
-; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64:
-; XSI: s_endpgm
-; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
-; %ext = sext <64 x i1> %load to <64 x i64>
-; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
-; ret void
-; }
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
deleted file mode 100644
index 103a40dee270..000000000000
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ /dev/null
@@ -1,302 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
-
-; FUNC-LABEL: {{^}}zextload_global_i16_to_i32:
-; SI: buffer_load_ushort
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %a = load i16, i16 addrspace(1)* %in
- %ext = zext i16 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i16_to_i32:
-; SI: buffer_load_sshort
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %a = load i16, i16 addrspace(1)* %in
- %ext = sext i16 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
-; SI: buffer_load_ushort
-; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
- %ext = zext <1 x i16> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
-; SI: buffer_load_sshort
-; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
- %ext = sext <1 x i16> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
-; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %ext = zext <2 x i16> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
-; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %ext = sext <2 x i16> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
-; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %ext = zext <4 x i16> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
-; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %ext = sext <4 x i16> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
-; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
- %ext = zext <8 x i16> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
-; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
- %ext = sext <8 x i16> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
-; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
- %ext = zext <16 x i16> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
-; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
- %ext = sext <16 x i16> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
-; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
- %ext = zext <32 x i16> %load to <32 x i32>
- store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
-; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
- %ext = sext <32 x i16> %load to <32 x i32>
- store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
-; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
- %ext = zext <64 x i16> %load to <64 x i32>
- store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
-; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
- %ext = sext <64 x i16> %load to <64 x i32>
- store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
-; SI: buffer_load_ushort v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %a = load i16, i16 addrspace(1)* %in
- %ext = zext i16 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
-; SI: buffer_load_sshort [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %a = load i16, i16 addrspace(1)* %in
- %ext = sext i16 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
-; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
- %ext = zext <1 x i16> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
-; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
- %ext = sext <1 x i16> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
-; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %ext = zext <2 x i16> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
-; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %ext = sext <2 x i16> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
-; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %ext = zext <4 x i16> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
-; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %ext = sext <4 x i16> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
-; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
- %ext = zext <8 x i16> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
-; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
- %ext = sext <8 x i16> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
-; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
- %ext = zext <16 x i16> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
-; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
- %ext = sext <16 x i16> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
-; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
- %ext = zext <32 x i16> %load to <32 x i64>
- store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
-; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
- %ext = sext <32 x i16> %load to <32 x i64>
- store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
-; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
- %ext = zext <64 x i16> %load to <64 x i64>
- store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
-; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
- %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
- %ext = sext <64 x i16> %load to <64 x i64>
- store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll
deleted file mode 100644
index e5e6be2199c3..000000000000
--- a/test/CodeGen/AMDGPU/global-extload-i32.ll
+++ /dev/null
@@ -1,308 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
-; SI: buffer_load_dword v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %a = load i32, i32 addrspace(1)* %in
- %ext = zext i32 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i32_to_i64:
-; SI: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %a = load i32, i32 addrspace(1)* %in
- %ext = sext i32 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64:
-; SI: buffer_load_dword
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i32>, <1 x i32> addrspace(1)* %in
- %ext = zext <1 x i32> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64:
-; SI: buffer_load_dword
-; SI: v_ashrrev_i32
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i32>, <1 x i32> addrspace(1)* %in
- %ext = sext <1 x i32> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64:
-; SI: buffer_load_dwordx2
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
- %ext = zext <2 x i32> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64:
-; SI: buffer_load_dwordx2
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
- %ext = sext <2 x i32> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
- %ext = zext <4 x i32> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64:
-; SI: buffer_load_dwordx4
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
- %ext = sext <4 x i32> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
- %ext = zext <8 x i32> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
- %ext = sext <8 x i32> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: buffer_store_dwordx4
-; SI: s_endpgm
-define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
- %ext = sext <16 x i32> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
- %ext = zext <16 x i32> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-; SI-DAG: v_ashrrev_i32
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI: s_endpgm
-define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i32>, <32 x i32> addrspace(1)* %in
- %ext = sext <32 x i32> %load to <32 x i64>
- store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-; SI-DAG: buffer_store_dwordx4
-
-; SI: s_endpgm
-define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
- %load = load <32 x i32>, <32 x i32> addrspace(1)* %in
- %ext = zext <32 x i32> %load to <32 x i64>
- store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/global-extload-i8.ll b/test/CodeGen/AMDGPU/global-extload-i8.ll
deleted file mode 100644
index b31d5361d5a2..000000000000
--- a/test/CodeGen/AMDGPU/global-extload-i8.ll
+++ /dev/null
@@ -1,299 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}zextload_global_i8_to_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %a = load i8, i8 addrspace(1)* %in
- %ext = zext i8 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i8_to_i32:
-; SI: buffer_load_sbyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %a = load i8, i8 addrspace(1)* %in
- %ext = sext i8 %a to i32
- store i32 %ext, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32:
-; SI: s_endpgm
-define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
- %ext = zext <1 x i8> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32:
-; SI: s_endpgm
-define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
- %ext = sext <1 x i8> %load to <1 x i32>
- store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32:
-; SI: s_endpgm
-define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %ext = zext <2 x i8> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32:
-; SI: s_endpgm
-define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %ext = sext <2 x i8> %load to <2 x i32>
- store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32:
-; SI: s_endpgm
-define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %ext = zext <4 x i8> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32:
-; SI: s_endpgm
-define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %ext = sext <4 x i8> %load to <4 x i32>
- store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32:
-; SI: s_endpgm
-define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
- %ext = zext <8 x i8> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32:
-; SI: s_endpgm
-define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
- %ext = sext <8 x i8> %load to <8 x i32>
- store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32:
-; SI: s_endpgm
-define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
- %ext = zext <16 x i8> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32:
-; SI: s_endpgm
-define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
- %ext = sext <16 x i8> %load to <16 x i32>
- store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32:
-; XSI: s_endpgm
-; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
-; %ext = zext <32 x i8> %load to <32 x i32>
-; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32:
-; XSI: s_endpgm
-; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
-; %ext = sext <32 x i8> %load to <32 x i32>
-; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32:
-; XSI: s_endpgm
-; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
-; %ext = zext <64 x i8> %load to <64 x i32>
-; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32:
-; XSI: s_endpgm
-; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
-; %ext = sext <64 x i8> %load to <64 x i32>
-; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
-; ret void
-; }
-
-; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
-; SI: buffer_load_ubyte v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %a = load i8, i8 addrspace(1)* %in
- %ext = zext i8 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_i8_to_i64:
-; SI: buffer_load_sbyte [[LOAD:v[0-9]+]],
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: buffer_store_dwordx2
-define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
- %a = load i8, i8 addrspace(1)* %in
- %ext = sext i8 %a to i64
- store i64 %ext, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64:
-; SI: s_endpgm
-define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
- %ext = zext <1 x i8> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64:
-; SI: s_endpgm
-define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
- %ext = sext <1 x i8> %load to <1 x i64>
- store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64:
-; SI: s_endpgm
-define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %ext = zext <2 x i8> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64:
-; SI: s_endpgm
-define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %ext = sext <2 x i8> %load to <2 x i64>
- store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64:
-; SI: s_endpgm
-define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %ext = zext <4 x i8> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64:
-; SI: s_endpgm
-define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %ext = sext <4 x i8> %load to <4 x i64>
- store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64:
-; SI: s_endpgm
-define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
- %ext = zext <8 x i8> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64:
-; SI: s_endpgm
-define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
- %ext = sext <8 x i8> %load to <8 x i64>
- store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64:
-; SI: s_endpgm
-define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
- %ext = zext <16 x i8> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64:
-; SI: s_endpgm
-define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
- %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
- %ext = sext <16 x i8> %load to <16 x i64>
- store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
- ret void
-}
-
-; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64:
-; XSI: s_endpgm
-; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
-; %ext = zext <32 x i8> %load to <32 x i64>
-; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64:
-; XSI: s_endpgm
-; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
-; %ext = sext <32 x i8> %load to <32 x i64>
-; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64:
-; XSI: s_endpgm
-; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
-; %ext = zext <64 x i8> %load to <64 x i64>
-; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
-; ret void
-; }
-
-; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64:
-; XSI: s_endpgm
-; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
-; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
-; %ext = sext <64 x i8> %load to <64 x i64>
-; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
-; ret void
-; }
diff --git a/test/CodeGen/AMDGPU/global-variable-relocs.ll b/test/CodeGen/AMDGPU/global-variable-relocs.ll
new file mode 100644
index 000000000000..c39394a3527d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-variable-relocs.ll
@@ -0,0 +1,203 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
+
+@private = private addrspace(1) global [256 x i32] zeroinitializer
+@internal = internal addrspace(1) global [256 x i32] zeroinitializer
+@available_externally = available_externally addrspace(1) global [256 x i32] zeroinitializer
+@linkonce = linkonce addrspace(1) global [256 x i32] zeroinitializer
+@weak = weak addrspace(1) global [256 x i32] zeroinitializer
+@common = common addrspace(1) global [256 x i32] zeroinitializer
+@extern_weak = extern_weak addrspace(1) global [256 x i32]
+@linkonce_odr = linkonce_odr addrspace(1) global [256 x i32] zeroinitializer
+@weak_odr = weak_odr addrspace(1) global [256 x i32] zeroinitializer
+@external = external addrspace(1) global [256 x i32]
+@external_w_init = addrspace(1) global [256 x i32] zeroinitializer
+
+; CHECK-LABEL: private_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private+8
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @private_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: internal_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], internal+8
+; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @internal_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: available_externally_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], available_externally@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @available_externally_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: linkonce_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @linkonce_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: weak_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @weak_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: common_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], common@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @common_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: extern_weak_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], extern_weak@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @extern_weak_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: linkonce_odr_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce_odr@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @linkonce_odr_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: weak_odr_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak_odr@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @weak_odr_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: external_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @external_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: external_w_init_test:
+; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external_w_init@GOTPCREL+4
+; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], 0
+; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
+; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
+; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
+; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @external_w_init_test(i32 addrspace(1)* %out) {
+ %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1
+ %val = load i32, i32 addrspace(1)* %ptr
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: .local private
+; CHECK: .local internal
+; CHECK: .weak linkonce
+; CHECK: .weak weak
+; CHECK: .weak linkonce_odr
+; CHECK: .weak weak_odr
+; CHECK-NOT: external{{$}}
+; CHECK: .globl external_w_init
diff --git a/test/CodeGen/AMDGPU/global-zero-initializer.ll b/test/CodeGen/AMDGPU/global-zero-initializer.ll
deleted file mode 100644
index 45aa8bf4e1d7..000000000000
--- a/test/CodeGen/AMDGPU/global-zero-initializer.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
-; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
-
-; CHECK: error: unsupported initializer for address space in load_init_global_global
-
-@lds = addrspace(1) global [256 x i32] zeroinitializer
-
-define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) {
- %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10
- %ld = load i32, i32 addrspace(1)* %gep
- store i32 %ld, i32 addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll
index 6786e4a2f375..743ad7c278be 100644
--- a/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/test/CodeGen/AMDGPU/global_atomics.ll
@@ -1,921 +1,1044 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=VI --check-prefix=FUNC %s
-
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}atomic_add_i32_offset:
-; GCN: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_soffset:
+; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
+; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
+define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_huge_offset:
+; SI-DAG: v_mov_b32_e32 v[[PTRLO:[0-9]+]], 0xdeac
+; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd
+; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_add
+define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
+
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
-; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-
define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32:
-; GCN: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_ret:
-; GCN: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64:
; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_offset:
-; GCN: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
-; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32:
-; GCN: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_ret:
-; GCN: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64:
; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
-; GCN: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
-; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32:
-; GCN: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_ret:
-; GCN: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64:
; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_offset:
-; GCN: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
-; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32:
-; GCN: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_ret:
-; GCN: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64:
; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
-; GCN: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
-; GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32:
-; GCN: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_ret:
-; GCN: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64:
; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_offset:
-; GCN: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
-; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32:
-; GCN: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_ret:
-; GCN: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64:
; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
-; GCN: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
-; GCN: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32:
-; GCN: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_ret:
-; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64:
; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_offset:
-; GCN: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
-; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32:
-; GCN: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_ret:
-; GCN: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64:
; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
-; GCN: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
-; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+
+; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
+
; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32:
-; GCN: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret:
-; GCN: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ %extract0 = extractvalue { i32, i1 } %val, 0
+ store i32 %extract0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+
+; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+ %extract0 = extractvalue { i32, i1 } %val, 0
+ store i32 %extract0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+ %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+ %extract0 = extractvalue { i32, i1 } %val, 0
+ store i32 %extract0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+ %extract0 = extractvalue { i32, i1 } %val, 0
+ store i32 %extract0, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
-; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
-; GCN: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
- %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
- %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32:
-; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+ %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_ret:
-; GCN: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; GCN: buffer_store_dword [[RET]]
define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
- %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+ %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
ret void
}
; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64:
; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
; GCN: buffer_store_dword [[RET]]
define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
- %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
- store i32 %0, i32 addrspace(1)* %out2
+ %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %val, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_load_i32_offset:
+; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword [[RET]]
+define void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4
+ %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_load_i32:
+; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: buffer_store_dword [[RET]]
+define void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_load_i32_addr64_offset:
+; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword [[RET]]
+define void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_load_i32_addr64:
+; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword [[RET]]
+define void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
+ %val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_offset:
+; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+ store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32:
+; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}}
+; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+entry:
+ store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset:
+; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+ store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
+; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
+define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/global_atomics_i64.ll b/test/CodeGen/AMDGPU/global_atomics_i64.ll
new file mode 100644
index 000000000000..2bae66d5aea8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -0,0 +1,1037 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}atomic_add_i64_offset:
+; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
+; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
+; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset:
+; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64:
+; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret:
+; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_addr64:
+; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64:
+; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_offset:
+; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
+; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
+; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset:
+; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64:
+; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret:
+; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_addr64:
+; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64:
+; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_offset:
+; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
+; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
+; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset:
+; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64:
+; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret:
+; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
+; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64:
+; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_offset:
+; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
+; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
+; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset:
+; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64:
+; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret:
+; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_addr64:
+; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64:
+; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_offset:
+; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
+; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
+; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset:
+; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64:
+; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret:
+; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
+; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64:
+; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_offset:
+; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
+; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
+; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset:
+; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64:
+; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret:
+; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_addr64:
+; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64:
+; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_offset:
+; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
+; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
+; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
+; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64:
+; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret:
+; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
+; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
+; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_offset:
+; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
+; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
+; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset:
+; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64:
+; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret:
+; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_addr64:
+; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64:
+; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
+; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
+; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
+; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset:
+; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64:
+; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
+; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
+; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64:
+; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_offset:
+; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
+; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
+; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset:
+; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64:
+; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret:
+; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
+; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64:
+; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
+ store i64 %tmp0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
+; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
+; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x11940
+; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
+define void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
+; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset:
+; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+
+; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset:
+; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
+; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+entry:
+ %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret:
+; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+entry:
+ %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
+; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64:
+; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
+define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
+ %extract0 = extractvalue { i64, i1 } %val, 0
+ store i64 %extract0, i64 addrspace(1)* %out2
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_offset:
+; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4
+ %val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64:
+; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:
+ %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset:
+; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ %val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i64_addr64:
+; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dwordx2 [[RET]]
+define void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
+ %val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8
+ store i64 %val, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_offset:
+; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
+entry:
+ %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+ store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64:
+; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
+define void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
+entry:
+ store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset:
+; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
+; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+ store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i64_addr64:
+; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
+define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+entry:
+ %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+ store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll b/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll
deleted file mode 100644
index 014b0a5482ab..000000000000
--- a/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-
-@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1
-
-; FUNC-LABEL: {{^}}test_i8:
-; EG: CF_END
-; SI: buffer_store_byte
-; SI: s_endpgm
-define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
- %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s
- %1 = load i8, i8 addrspace(2)* %arrayidx, align 1
- store i8 %1, i8 addrspace(1)* %out
- ret void
-}
-
-@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
-
-; FUNC-LABEL: {{^}}test_i16:
-; EG: CF_END
-; SI: buffer_store_short
-; SI: s_endpgm
-define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
- %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s
- %1 = load i16, i16 addrspace(2)* %arrayidx, align 2
- store i16 %1, i16 addrspace(1)* %out
- ret void
-}
-
-%struct.bar = type { float, [5 x i8] }
-
-; The illegal i8s aren't handled
-@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
-
-; FUNC-LABEL: {{^}}struct_bar_gv_load:
-define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
- %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
- %load = load i8, i8 addrspace(2)* %gep, align 1
- store i8 %load, i8 addrspace(1)* %out, align 1
- ret void
-}
-
-
-; The private load isn't scalarzied.
-@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> <i32 1, i32 2, i32 3, i32 4>,
- <4 x i32> <i32 5, i32 6, i32 7, i32 8>,
- <4 x i32> <i32 9, i32 10, i32 11, i32 12>,
- <4 x i32> <i32 13, i32 14, i32 15, i32 16> ]
-
-; FUNC-LABEL: {{^}}array_vector_gv_load:
-define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) {
- %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
- %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16
- store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
index d4d13125cfbf..1f9b536cd80b 100644
--- a/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
@@ -10,13 +11,10 @@
; FUNC-LABEL: {{^}}float:
; GCN: s_load_dword
-; EG-DAG: MOV {{\** *}}T2.X
-; EG-DAG: MOV {{\** *}}T3.X
-; EG-DAG: MOV {{\** *}}T4.X
-; EG-DAG: MOV {{\** *}}T5.X
-; EG-DAG: MOV {{\** *}}T6.X
-; EG: MOVA_INT
-
+; EG: VTX_READ_32
+; EG: @float_gv
+; EG-NOT: MOVA_INT
+; EG-NOT: MOV
define void @float(float addrspace(1)* %out, i32 %index) {
entry:
%0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
@@ -31,13 +29,10 @@ entry:
; GCN: s_load_dword
-; EG-DAG: MOV {{\** *}}T2.X
-; EG-DAG: MOV {{\** *}}T3.X
-; EG-DAG: MOV {{\** *}}T4.X
-; EG-DAG: MOV {{\** *}}T5.X
-; EG-DAG: MOV {{\** *}}T6.X
-; EG: MOVA_INT
-
+; EG: VTX_READ_32
+; EG: @i32_gv
+; EG-NOT: MOVA_INT
+; EG-NOT: MOV
define void @i32(i32 addrspace(1)* %out, i32 %index) {
entry:
%0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
@@ -54,6 +49,10 @@ entry:
; FUNC-LABEL: {{^}}struct_foo_gv_load:
; GCN: s_load_dword
+; EG: VTX_READ_32
+; EG: @struct_foo_gv
+; EG-NOT: MOVA_INT
+; EG-NOT: MOV
define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
%gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
%load = load i32, i32 addrspace(2)* %gep, align 4
@@ -68,6 +67,11 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
; FUNC-LABEL: {{^}}array_v1_gv_load:
; GCN: s_load_dword
+
+; EG: VTX_READ_32
+; EG: @array_v1_gv
+; EG-NOT: MOVA_INT
+; EG-NOT: MOV
define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
%gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
%load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
@@ -75,6 +79,11 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
ret void
}
+; FUNC-LABEL: {{^}}gv_addressing_in_branch:
+
+; EG: VTX_READ_32
+; EG: @float_gv
+; EG-NOT: MOVA_INT
define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
entry:
%0 = icmp eq i32 0, %a
diff --git a/test/CodeGen/AMDGPU/gv-offset-folding.ll b/test/CodeGen/AMDGPU/gv-offset-folding.ll
new file mode 100644
index 000000000000..c75fdb35dd0e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/gv-offset-folding.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -relocation-model=static < %s | FileCheck %s
+
+@lds = external addrspace(3) global [4 x i32]
+
+; Function Attrs: nounwind
+
+; Offset folding is an optimization done for global variables with relocations,
+; which allows the offset to be stored in the r_addend of the relocation entry.
+; The offset is applied to the variable's address at link time, which eliminates
+; the need to emit shader instructions to do this calculation.
+; We don't use relocations for local memory, so we should never fold offsets
+; for local memory globals.
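+;
+; For contrast, a minimal sketch of the case the optimization does target
+; (illustrative only; @gv is a hypothetical global in the global address
+; space and is not checked by this test): the constant 4-byte offset below
+; could be folded into the relocation addend instead of being materialized
+; with address arithmetic in the shader.
+;
+;   @gv = external addrspace(1) global [4 x i32]
+;   define void @global_with_offset() {
+;     %p = getelementptr [4 x i32], [4 x i32] addrspace(1)* @gv, i32 0, i32 1
+;     store i32 0, i32 addrspace(1)* %p
+;     ret void
+;   }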
+
+; CHECK-LABEL: lds_no_offset:
+; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4
+define void @lds_no_offset() {
+entry:
+ %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1
+ store i32 0, i32 addrspace(3)* %ptr
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
index a02cbf43c400..d21d66176a14 100644
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; half args should be promoted to float
@@ -13,10 +13,11 @@ define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
}
; GCN-LABEL: {{^}}load_v2f16_arg:
-; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
+; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
+; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
+; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
store <2 x half> %arg, <2 x half> addrspace(1)* %out
@@ -42,10 +43,7 @@ define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
store <4 x half> %arg, <4 x half> addrspace(1)* %out
@@ -280,11 +278,11 @@ define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(
}
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
-; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%val = load <2 x half>, <2 x half> addrspace(1)* %in
@@ -318,22 +316,8 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x
}
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
@@ -378,10 +362,10 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace
}
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
-; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]]
+; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
@@ -396,22 +380,18 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]], 32
-; VI: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 32, [[LOAD]]
-; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN-NOT: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
+; GCN-DAG: v_cvt_f32_f16_e32
+; GCN-DAG: v_cvt_f32_f16_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN-NOT: v_cvt_f64_f32_e32
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
%val = load <3 x half>, <3 x half> addrspace(1)* %in
@@ -459,8 +439,9 @@ define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspa
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
-; GCN-DAG: buffer_store_short [[CVT0]]
-; GCN-DAG: buffer_store_short [[CVT1]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
+; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
+; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
%val = load <2 x float>, <2 x float> addrspace(1)* %in
@@ -491,10 +472,7 @@ define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
%val = load <4 x float>, <4 x float> addrspace(1)* %in
@@ -514,14 +492,7 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
%val = load <8 x float>, <8 x float> addrspace(1)* %in
@@ -551,22 +522,8 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_dwordx4
+; GCN-DAG: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
%val = load <16 x float>, <16 x float> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/hsa-default-device.ll b/test/CodeGen/AMDGPU/hsa-default-device.ll
new file mode 100644
index 000000000000..631d6def4442
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-default-device.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+
+; Make sure that with an HSA triple, we don't default to an
+; unsupported device.
+
+; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
+define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+ store float 0.0, float addrspace(1)* %out0
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
new file mode 100644
index 000000000000..36aa6779d382
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_default_ci:
+; GCN: compute_pgm_rsrc1_float_mode = 192
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_default_vi:
+; GCN: compute_pgm_rsrc1_float_mode = 192
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_f64_denormals:
+; GCN: compute_pgm_rsrc1_float_mode = 192
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_f32_denormals:
+; GCN: compute_pgm_rsrc1_float_mode = 48
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_f32_f64_denormals:
+; GCN: compute_pgm_rsrc1_float_mode = 240
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_denormals:
+; GCN: compute_pgm_rsrc1_float_mode = 0
+; GCN: compute_pgm_rsrc1_dx10_clamp = 1
+; GCN: compute_pgm_rsrc1_ieee_mode = 0
+define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="kaveri" }
+attributes #1 = { nounwind "target-cpu"="fiji" }
+attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-denormals" }
+attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-denormals" }
+attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" }
+attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-denormals" }
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
new file mode 100644
index 000000000000..28c8b5d73b02
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
+
+; The SHT_NOTE section contains the output from the .hsa_code_object_*
+; directives.
+
+; ELF: Section {
+; ELF: Name: .text
+; ELF: Type: SHT_PROGBITS (0x1)
+; ELF: Flags [ (0x6)
+; ELF: SHF_ALLOC (0x2)
+; ELF: SHF_EXECINSTR (0x4)
+; ELF: }
+
+; ELF: SHT_NOTE
+; ELF: 0000: 04000000 08000000 01000000 414D4400
+; ELF: 0010: 02000000 01000000 04000000 1B000000
+
+; ELF: 0020: 03000000 414D4400 04000700 07000000
+; ELF: 0030: 00000000 00000000 414D4400 414D4447
+; ELF: 0040: 50550000
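+; (A rough decode of the first note above, assuming the standard ELF note
+; layout: namesz=4, descsz=8, type=1, name="AMD\0", then a two-word
+; descriptor holding the code object version 2,1 that the HSA checks below
+; expect; the second note carries the ISA descriptor whose strings "AMD"
+; and "AMDGPU" end the dump.)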
+
+; ELF: Symbol {
+; ELF: Name: simple
+; ELF: Size: 288
+; ELF: Type: Function (0x2)
+; ELF: }
+
+; HSA: .hsa_code_object_version 2,1
+; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
+; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
+
+; HSA: .text
+
+; HSA-NOT: .amdgpu_hsa_kernel simple
+; HSA: {{^}}simple:
+; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: .end_amd_kernel_code_t
+; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
+
+; Make sure we are setting the ATC bit:
+; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
+; On VI+ we also need to set MTYPE = 2
+; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
+; Make sure we generate flat store for HSA
+; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+
+; HSA: .Lfunc_end0:
+; HSA: .size simple, .Lfunc_end0-simple
+
+define void @simple(i32 addrspace(1)* %out) {
+entry:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll
index 90322ac3dc01..df478fbcf3b5 100644
--- a/test/CodeGen/AMDGPU/hsa-globals.ll
+++ b/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -1,14 +1,11 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=ASM %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s | FileCheck %s --check-prefix=ELF
+@linkonce_odr_global_program = linkonce_odr addrspace(1) global i32 0
+@linkonce_global_program = linkonce addrspace(1) global i32 0
@internal_global_program = internal addrspace(1) global i32 0
@common_global_program = common addrspace(1) global i32 0
@external_global_program = addrspace(1) global i32 0
-@internal_global_agent = internal addrspace(1) global i32 0, section ".hsadata_global_agent"
-@common_global_agent = common addrspace(1) global i32 0, section ".hsadata_global_agent"
-@external_global_agent = addrspace(1) global i32 0, section ".hsadata_global_agent"
-
@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0
@external_readonly = unnamed_addr addrspace(2) constant i32 0
@@ -16,133 +13,38 @@ define void @test() {
ret void
}
-; ASM: .amdgpu_hsa_module_global internal_global
-; ASM: .size internal_global_program, 4
-; ASM: .hsadata_global_program
-; ASM: internal_global_program:
-; ASM: .long 0
-
-; ASM: .amdgpu_hsa_module_global common_global
-; ASM: .size common_global_program, 4
-; ASM: .hsadata_global_program
-; ASM: common_global_program:
+; ASM: .type linkonce_odr_global_program,@object
+; ASM: .section .bss,#alloc,#write
+; ASM: .weak linkonce_odr_global_program
+; ASM: linkonce_odr_global_program:
; ASM: .long 0
+; ASM: .size linkonce_odr_global_program, 4
-; ASM: .amdgpu_hsa_program_global external_global
-; ASM: .size external_global_program, 4
-; ASM: .hsadata_global_program
-; ASM: external_global_program:
+; ASM: .type linkonce_global_program,@object
+; ASM: .weak linkonce_global_program
+; ASM: linkonce_global_program:
; ASM: .long 0
+; ASM: .size linkonce_global_program, 4
-; ASM: .amdgpu_hsa_module_global internal_global
-; ASM: .size internal_global_agent, 4
-; ASM: .hsadata_global_agent
-; ASM: internal_global_agent:
-; ASM: .long 0
+; ASM: .type internal_global_program,@object
+; ASM: .local internal_global_program
+; ASM: .comm internal_global_program,4,2
-; ASM: .amdgpu_hsa_module_global common_global
-; ASM: .size common_global_agent, 4
-; ASM: .hsadata_global_agent
-; ASM: common_global_agent:
-; ASM: .long 0
+; ASM: .type common_global_program,@object
+; ASM: .comm common_global_program,4,2
-; ASM: .amdgpu_hsa_program_global external_global
-; ASM: .size external_global_agent, 4
-; ASM: .hsadata_global_agent
-; ASM: external_global_agent:
+; ASM: external_global_program:
; ASM: .long 0
+; ASM: .size external_global_program, 4
-; ASM: .amdgpu_hsa_module_global internal_readonly
-; ASM: .size internal_readonly, 4
-; ASM: .hsatext
+; ASM: .type internal_readonly,@object
+; ASM: .text
; ASM: internal_readonly:
; ASM: .long 0
+; ASM: .size internal_readonly, 4
-; ASM: .amdgpu_hsa_program_global external_readonly
-; ASM: .size external_readonly, 4
-; ASM: .hsatext
+; ASM: .type external_readonly,@object
+; ASM: .globl external_readonly
; ASM: external_readonly:
; ASM: .long 0
-
-; ELF: Section {
-; ELF: Name: .hsadata_global_program
-; ELF: Type: SHT_PROGBITS (0x1)
-; ELF: Flags [ (0x100003)
-; ELF: SHF_ALLOC (0x2)
-; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000)
-; ELF: SHF_WRITE (0x1)
-; ELF: ]
-; ELF: }
-
-; ELF: Section {
-; ELF: Name: .hsadata_global_agent
-; ELF: Type: SHT_PROGBITS (0x1)
-; ELF: Flags [ (0x900003)
-; ELF: SHF_ALLOC (0x2)
-; ELF: SHF_AMDGPU_HSA_AGENT (0x800000)
-; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000)
-; ELF: SHF_WRITE (0x1)
-; ELF: ]
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: common_global_agent
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Section: .hsadata_global_agent
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: common_global_program
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Section: .hsadata_global_program
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: internal_global_agent
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Type: Object
-; ELF: Section: .hsadata_global_agent
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: internal_global_program
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Type: Object
-; ELF: Section: .hsadata_global_program
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: internal_readonly
-; ELF: Size: 4
-; ELF: Binding: Local
-; ELF: Type: Object
-; ELF: Section: .hsatext
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: external_global_agent
-; ELF: Size: 4
-; ELF: Binding: Global
-; ELF: Type: Object
-; ELF: Section: .hsadata_global_agent
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: external_global_program
-; ELF: Size: 4
-; ELF: Binding: Global
-; ELF: Type: Object
-; ELF: Section: .hsadata_global_program
-; ELF: }
-
-; ELF: Symbol {
-; ELF: Name: external_readonly
-; ELF: Size: 4
-; ELF: Binding: Global
-; ELF: Type: Object
-; ELF: Section: .hsatext
-; ELF: }
+; ASM: .size external_readonly, 4
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index f82e98e79545..1b4a0f3090b8 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-FIJI %s
-; HSA: .hsa_code_object_version 1,0
+; HSA: .hsa_code_object_version 2,1
; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
; HSA-FIJI: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
index c089dfd9a971..82d7da188ca3 100644
--- a/test/CodeGen/AMDGPU/hsa.ll
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -9,34 +9,31 @@
; directives.
; ELF: Section {
-; ELF: Name: .hsatext
+; ELF: Name: .text
; ELF: Type: SHT_PROGBITS (0x1)
-; ELF: Flags [ (0xC00007)
+; ELF: Flags [ (0x6)
; ELF: SHF_ALLOC (0x2)
-; ELF: SHF_AMDGPU_HSA_AGENT (0x800000)
-; ELF: SHF_AMDGPU_HSA_CODE (0x400000)
; ELF: SHF_EXECINSTR (0x4)
-; ELF: SHF_WRITE (0x1)
; ELF: }
; ELF: SHT_NOTE
; ELF: 0000: 04000000 08000000 01000000 414D4400
-; ELF: 0010: 01000000 00000000 04000000 1B000000
+; ELF: 0010: 02000000 01000000 04000000 1B000000
; ELF: 0020: 03000000 414D4400 04000700 07000000
; ELF: 0030: 00000000 00000000 414D4400 414D4447
; ELF: 0040: 50550000
; ELF: Symbol {
; ELF: Name: simple
-; ELF: Size: 296
+; ELF: Size: 288
; ELF: Type: AMDGPU_HSA_KERNEL (0xA)
; ELF: }
-; HSA: .hsa_code_object_version 1,0
+; HSA: .hsa_code_object_version 2,1
; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
-; HSA: .hsatext
+; HSA: .text
; HSA: .amdgpu_hsa_kernel simple
; HSA: {{^}}simple:
@@ -51,12 +48,12 @@
; On VI+ we also need to set MTYPE = 2
; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
; Make sure we generate flat store for HSA
-; HSA: flat_store_dword v{{[0-9]+}}
+; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; HSA: .Lfunc_end0:
; HSA: .size simple, .Lfunc_end0-simple
-define void @simple(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @simple(i32 addrspace(1)* %out) {
entry:
store i32 0, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
index b11a21137642..d6309a2dd5de 100644
--- a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -4,9 +4,8 @@
; SILowerI1Copies was not handling IMPLICIT_DEF
; SI-LABEL: {{^}}br_implicit_def:
; SI: BB#0:
-; SI-NEXT: s_and_saveexec_b64
-; SI-NEXT: s_xor_b64
-; SI-NEXT: BB#1:
+; SI-NEXT: s_and_b64 vcc, exec
+; SI-NEXT: s_cbranch_vccnz
define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
bb:
br i1 undef, label %bb1, label %bb2
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 105cd06b330a..4d50dc2f4023 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -10,9 +10,11 @@
; SI: s_and_saveexec_b64
; SI: s_xor_b64
; SI: s_endpgm
-define void @br_i1_phi(i32 %arg, i1 %arg1) #0 {
+define void @br_i1_phi(i32 %arg) {
bb:
- br i1 %arg1, label %bb2, label %bb3
+ %tidig = call i32 @llvm.r600.read.tidig.x() #0
+ %cmp = trunc i32 %tidig to i1
+ br i1 %cmp, label %bb2, label %bb3
bb2: ; preds = %bb
br label %bb3
@@ -22,9 +24,14 @@ bb3: ; preds = %bb2, %bb
br i1 %tmp, label %bb4, label %bb6
bb4: ; preds = %bb3
- %tmp5 = mul i32 undef, %arg
+ %val = load volatile i32, i32 addrspace(1)* undef
+ %tmp5 = mul i32 %val, %arg
br label %bb6
bb6: ; preds = %bb4, %bb3
ret void
}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll
index 8db9ea4ccf31..674eceee8122 100644
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -23,7 +23,7 @@ entry:
; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64:
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
+; CHECK-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
store i64 -9223372036854775808, i64 addrspace(1) *%out
@@ -31,7 +31,7 @@ define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
}
; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; CHECK: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; CHECK: buffer_store_dword [[REG]]
define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
store i32 -2147483648, i32 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
}
; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32:
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; CHECK: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
; CHECK: buffer_store_dword [[REG]]
define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
store float -0.0, float addrspace(1)* %out
@@ -322,7 +322,7 @@ define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0.0
@@ -333,7 +333,7 @@ define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0.5
@@ -344,7 +344,7 @@ define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -0.5
@@ -355,7 +355,7 @@ define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 1.0
@@ -366,7 +366,7 @@ define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -1.0
@@ -377,7 +377,7 @@ define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 2.0
@@ -388,7 +388,7 @@ define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -2.0
@@ -399,7 +399,7 @@ define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 4.0
@@ -410,7 +410,7 @@ define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, -4.0
@@ -422,7 +422,7 @@ define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_1_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000001
@@ -433,7 +433,7 @@ define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_2_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000002
@@ -444,7 +444,7 @@ define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_16_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000010
@@ -455,7 +455,7 @@ define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xffffffffffffffff
@@ -466,7 +466,7 @@ define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xfffffffffffffffe
@@ -477,7 +477,7 @@ define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0xfffffffffffffff0
@@ -488,7 +488,7 @@ define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_63_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x000000000000003F
@@ -499,7 +499,7 @@ define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}add_inline_imm_64_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]]
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
; CHECK: buffer_store_dwordx2 [[REG]]
define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
%y = fadd double %x, 0x0000000000000040
@@ -510,7 +510,7 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64:
; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
-; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0
+; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], v[[LO_VREG]]{{$}}
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
store double 0.0, double addrspace(1)* %out
@@ -520,7 +520,7 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64:
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
+; CHECK-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
store double -0.0, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index e40cac22725c..66cec88e760c 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
@@ -87,13 +87,34 @@ entry:
; CHECK: s_cbranch_execnz
define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
entry:
- %id = call i32 @llvm.r600.read.tidig.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -512
%value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
store i32 %value, i32 addrspace(1)* %out
ret void
}
+; CHECK-LABEL: {{^}}extract_undef_offset_sgpr:
+define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = extractelement <4 x i32> %ld, i32 undef
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK-NEXT: v_movreld_b32
+define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
; CHECK-LABEL: {{^}}insert_w_offset:
; CHECK: s_mov_b32 m0
; CHECK-NEXT: v_movreld_b32_e32
@@ -152,7 +173,7 @@ entry:
; CHECK: s_cbranch_execnz
define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
- %id = call i32 @llvm.r600.read.tidig.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -512
%value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
store <4 x i32> %value, <4 x i32> addrspace(1)* %out
@@ -167,12 +188,304 @@ entry:
; CHECK: s_cbranch_execnz
define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
entry:
- %id = call i32 @llvm.r600.read.tidig.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
%index = add i32 %id, -16
%value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
store <4 x i32> %value, <4 x i32> addrspace(1)* %out
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #1
+; When the block is split to insert the loop, make sure any other
+; places that need to be expanded in the same block are also handled.
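+;
+; The loop in question is the readfirstlane waterfall sequence the checks
+; below walk through: read the VGPR index from the first active lane into a
+; scalar, restrict exec to the lanes sharing that index, perform the movrel
+; with the now-uniform index, then xor those lanes out of exec and branch
+; back until no active lanes remain. (This is a reading of the CHECK lines
+; below, not an authoritative description of the lowering pass.)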
+
+; CHECK-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
+
+; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
+; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
+; CHECK: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dword [[MOVREL0]]
+; CHECK: buffer_store_dword [[MOVREL1]]
+define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+ %idx0 = load volatile i32, i32 addrspace(1)* %gep
+ %idx1 = add i32 %idx0, 1
+ %val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
+ %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={SGPR4}" ()
+ %val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
+ store volatile i32 %val0, i32 addrspace(1)* %out0
+ store volatile i32 %val1, i32 addrspace(1)* %out0
+ %cmp = icmp eq i32 %id, 0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 %live.out.reg, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]]
+; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
+; CHECK-DAG: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL0:[0-9]+]], [[INS0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: v_mov_b32_e32 [[INS1:v[0-9]+]], 63
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL1:[0-9]+]], [[INS1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]:
+
+; CHECK: buffer_store_dword [[INS0]]
+define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+ %idx0 = load volatile i32, i32 addrspace(1)* %gep
+ %idx1 = add i32 %idx0, 1
+ %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
+ %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
+ %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
+ store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+ %cmp = icmp eq i32 %id, 0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 %live.out.val, i32 addrspace(1)* undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}extract_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @extract_adjacent_blocks(i32 %arg) #0 {
+bb:
+ %tmp = icmp eq i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb4
+
+bb1:
+ %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+ br label %bb7
+
+bb4:
+ %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+ br label %bb7
+
+bb7:
+ %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+ store volatile float %tmp8, float addrspace(1)* undef
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+bb:
+ %tmp = icmp eq i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb4
+
+bb1: ; preds = %bb
+ %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
+ br label %bb7
+
+bb4: ; preds = %bb
+ %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
+ br label %bb7
+
+bb7: ; preds = %bb4, %bb1
+ %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+ store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
+ ret void
+}
+
+; FIXME: Should be able to fold zero input to movreld to inline imm?
+
+; CHECK-LABEL: {{^}}multi_same_block:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; CHECK-DAG: s_add_i32 m0, [[ARG]], -16
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, [[ZERO]]
+
+; CHECK: s_add_i32 m0, [[ARG]], -14
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; CHECK: s_mov_b32 m0, -1
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
+; CHECK: s_endpgm
+define void @multi_same_block(i32 %arg) #0 {
+bb:
+ %tmp1 = add i32 %arg, -16
+ %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 0.000000e+00, i32 %tmp1
+ %tmp3 = add i32 %arg, -16
+ %tmp4 = insertelement <6 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000>, float 0x3FB99999A0000000, i32 %tmp3
+ %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
+ %tmp6 = extractelement <6 x i32> %tmp5, i32 1
+ %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
+ %tmp8 = extractelement <6 x i32> %tmp7, i32 5
+ store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
+ store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
+ ret void
+}
+
+; An offset that puts the index outside of the superregister boundaries is clamped to the 1st element.
+; CHECK-LABEL: {{^}}extract_largest_inbounds_offset:
+; CHECK: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
+; CHECK: s_load_dword [[IDX:s[0-9]+]]
+; CHECK: s_mov_b32 m0, [[IDX]]
+; CHECK-NEXT: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
+; CHECK: buffer_store_dword [[EXTRACT]]
+define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %offset = add i32 %idx, 3
+ %value = extractelement <4 x i32> %ld, i32 %offset
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}extract_out_of_bounds_offset:
+; CHECK: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
+; CHECK: s_load_dword [[IDX:s[0-9]+]]
+; CHECK: s_add_i32 m0, [[IDX]], 4
+; CHECK-NEXT: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
+; CHECK: buffer_store_dword [[EXTRACT]]
+define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %offset = add i32 %idx, 4
+ %value = extractelement <4 x i32> %ld, i32 %offset
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test that the or is folded into the base address register instead of
+; added to m0
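+; (A sketch of the expected folding, read off the checks below: with
+;   %idx = or (shl %idx.in, 2), 1
+; the constant 1 contributed by the `or` should be absorbed by picking the
+; next element's register as the movrel base operand, so m0 only needs to
+; hold the shifted index rather than shifted-index + 1.)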
+
+; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %idx.shl = shl i32 %idx.in, 2
+ %idx = or i32 %idx.shl, 1
+ %value = extractelement <4 x i32> %ld, i32 %idx
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+ %idx.shl = shl i32 %idx.in, 2
+ %idx = or i32 %idx.shl, 1
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-undef.mir b/test/CodeGen/AMDGPU/indirect-addressing-undef.mir
new file mode 100644
index 000000000000..7cd35d41f30c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/indirect-addressing-undef.mir
@@ -0,0 +1,327 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-lower-control-flow -o - %s | FileCheck %s
+# Getting an undef value that is specifically in a VGPR is tricky to produce from IR
+
+# CHECK-LABEL: name: extract_undef_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %vgpr0 = V_MOVRELS_B32_e32 %vgpr0, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr4, %sgpr5, %sgpr6, %sgpr7, %sgpr4_sgpr5, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+
+--- |
+ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+ define void @extract_undef_offset_vgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = extractelement <4 x i32> %ld, i32 undef
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @extract_undef_neg_offset_vgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ entry:
+ %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = extractelement <4 x i32> %ld, i32 undef
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @insert_undef_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+ }
+
+ define void @insert_undef_neg_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+ }
+
+ define void @insert_undef_value_offset_vgpr(<4 x i32> addrspace(1)*%out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+ entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %value = insertelement <4 x i32> %ld, i32 undef, i32 %idx
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+ }
+
+...
+---
+name: extract_undef_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9
+ S_WAITCNT 3952
+ %vgpr0, dead %sgpr0_sgpr1 = SI_INDIRECT_SRC_V4 killed %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, 0, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: extract_undef_neg_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %m0 = S_ADD_I32 %m0, -7, implicit-def %scc
+# CHECK: %vgpr0 = V_MOVRELS_B32_e32 %vgpr0, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr4, %sgpr5, %sgpr6, %sgpr7, %sgpr4_sgpr5, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %sgpr0_sgpr1, %sgpr0, %sgpr1
+
+name: extract_undef_neg_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9
+ S_WAITCNT 3952
+ %vgpr0, dead %sgpr0_sgpr1 = SI_INDIRECT_SRC_V4 killed %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, -7, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: insert_undef_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 %vgpr4, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1
+
+name: insert_undef_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ %vgpr4 = V_MOV_B32_e32 5, implicit %exec
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ S_WAITCNT 3952
+ %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, 0, killed %vgpr4, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: insert_undef_neg_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %m0 = S_ADD_I32 %m0, -7, implicit-def %scc
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 %vgpr4, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+name: insert_undef_neg_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ %vgpr4 = V_MOV_B32_e32 5, implicit %exec
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ S_WAITCNT 3952
+ %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, -7, killed %vgpr4, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+ S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: insert_undef_value_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 %vgpr4, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 undef %vgpr10, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+name: insert_undef_value_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ %vgpr4 = V_MOV_B32_e32 2, implicit %exec
+ S_WAITCNT 127
+ %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+ S_WAITCNT 3952
+ %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, killed %vgpr4, 0, undef %vgpr10, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+ S_WAITCNT 127
+ BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+ S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
index 2a3b29f54fa9..1f851f9de535 100644
--- a/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -1,24 +1,31 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s
-
-declare void @llvm.AMDGPU.barrier.local() convergent nounwind
+declare void @llvm.amdgcn.s.barrier() #0
; SI-LABEL: {{^}}private_access_f64_alloca:
-; SI-ALLOCA: buffer_store_dwordx2
-; SI-ALLOCA: buffer_load_dwordx2
+; SI-ALLOCA16: buffer_store_dwordx2
+; SI-ALLOCA16: buffer_load_dwordx2
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_read_b64
-define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
+; CI-PROMOTE: ds_write_b64
+; CI-PROMOTE: ds_read_b64
+define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
%val = load double, double addrspace(1)* %in, align 8
- %array = alloca double, i32 16, align 8
- %ptr = getelementptr double, double* %array, i32 %b
+ %array = alloca [16 x double], align 8
+ %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
store double %val, double* %ptr, align 8
- call void @llvm.AMDGPU.barrier.local() convergent nounwind
+ call void @llvm.amdgcn.s.barrier()
%result = load double, double* %ptr, align 8
store double %result, double addrspace(1)* %out, align 8
ret void
@@ -26,19 +33,30 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
; SI-LABEL: {{^}}private_access_v2f64_alloca:
-; SI-ALLOCA: buffer_store_dwordx4
-; SI-ALLOCA: buffer_load_dwordx4
+; SI-ALLOCA16: buffer_store_dwordx4
+; SI-ALLOCA16: buffer_load_dwordx4
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_read_b64
; SI-PROMOTE: ds_read_b64
-define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
+; CI-PROMOTE: ds_write2_b64
+; CI-PROMOTE: ds_read2_b64
+define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
- %array = alloca <2 x double>, i32 16, align 16
- %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
+ %array = alloca [8 x <2 x double>], align 16
+ %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
store <2 x double> %val, <2 x double>* %ptr, align 16
- call void @llvm.AMDGPU.barrier.local() convergent nounwind
+ call void @llvm.amdgcn.s.barrier()
%result = load <2 x double>, <2 x double>* %ptr, align 16
store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
ret void
@@ -46,17 +64,25 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
; SI-LABEL: {{^}}private_access_i64_alloca:
-; SI-ALLOCA: buffer_store_dwordx2
-; SI-ALLOCA: buffer_load_dwordx2
+; SI-ALLOCA16: buffer_store_dwordx2
+; SI-ALLOCA16: buffer_load_dwordx2
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_read_b64
-define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
+; CI-PROMOTE: ds_write_b64
+; CI-PROMOTE: ds_read_b64
+define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
%val = load i64, i64 addrspace(1)* %in, align 8
- %array = alloca i64, i32 16, align 8
- %ptr = getelementptr i64, i64* %array, i32 %b
+ %array = alloca [8 x i64], align 8
+ %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b
store i64 %val, i64* %ptr, align 8
- call void @llvm.AMDGPU.barrier.local() convergent nounwind
+ call void @llvm.amdgcn.s.barrier()
%result = load i64, i64* %ptr, align 8
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
@@ -64,20 +90,35 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
; SI-LABEL: {{^}}private_access_v2i64_alloca:
-; SI-ALLOCA: buffer_store_dwordx4
-; SI-ALLOCA: buffer_load_dwordx4
+; SI-ALLOCA16: buffer_store_dwordx4
+; SI-ALLOCA16: buffer_load_dwordx4
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_write_b64
; SI-PROMOTE: ds_read_b64
; SI-PROMOTE: ds_read_b64
-define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
+; CI-PROMOTE: ds_write2_b64
+; CI-PROMOTE: ds_read2_b64
+define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
- %array = alloca <2 x i64>, i32 16, align 16
- %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b
+ %array = alloca [8 x <2 x i64>], align 16
+ %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
store <2 x i64> %val, <2 x i64>* %ptr, align 16
- call void @llvm.AMDGPU.barrier.local() convergent nounwind
+ call void @llvm.amdgcn.s.barrier()
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
ret void
}
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="2" "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index 9c8d3534f8ad..1f5b8be15e2e 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; CHECK: {{^}}inline_asm:
+; CHECK-LABEL: {{^}}inline_asm:
; CHECK: s_endpgm
; CHECK: s_endpgm
define void @inline_asm(i32 addrspace(1)* %out) {
@@ -11,13 +11,175 @@ entry:
ret void
}
-; CHECK: {{^}}inline_asm_shader:
+; CHECK-LABEL: {{^}}inline_asm_shader:
; CHECK: s_endpgm
; CHECK: s_endpgm
-define void @inline_asm_shader() #0 {
+define amdgpu_ps void @inline_asm_shader() {
entry:
call void asm sideeffect "s_endpgm", ""()
ret void
}
-attributes #0 = { "ShaderType"="0" }
+
+; CHECK: {{^}}branch_on_asm:
+; Make sure inline assembly is treated as divergent.
+; CHECK: s_mov_b32 s{{[0-9]+}}, 0
+; CHECK: s_and_saveexec_b64
+define void @branch_on_asm(i32 addrspace(1)* %out) {
+ %zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
+ %cmp = icmp eq i32 %zero, 0
+ br i1 %cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v_cmp_asm:
+; CHECK: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; CHECK: v_cmp_ne_i32_e64 s{{\[}}[[MASK_LO:[0-9]+]]:[[MASK_HI:[0-9]+]]{{\]}}, 0, [[SRC]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
+ %sgpr = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %in)
+ store i64 %sgpr, i64 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm:
+; CHECK: codeLenInByte = 12
+define void @code_size_inline_asm(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "v_nop_e64", ""()
+ ret void
+}
+
+; All inlineasm instructions are assumed to be the maximum size
+; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
+; CHECK: codeLenInByte = 12
+define void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "v_nop_e32", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "
+ v_nop_e64
+ v_nop_e64
+ ", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "
+ v_nop_e64
+
+ v_nop_e64
+ ", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; comment", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "
+; comment", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; comment
+", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; first comment ; second comment", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
+; CHECK: codeLenInByte = 4
+define void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; first comment;second comment", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; comment
+ v_nop_e64 ; inline comment
+; separate comment
+ v_nop_e64
+
+ ; trailing comment
+ ; extra comment
+ ", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "v_nop_e64 ; inline comment
+; separate comment
+ v_nop_e64
+
+ ; trailing comment
+ ; extra comment
+ ", ""()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
+; CHECK: codeLenInByte = 20
+define void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
+entry:
+ call void asm sideeffect "; comment
+ v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
+; separate comment
+ v_bfrev_b32_e32 v0, 1
+
+ ; trailing comment
+ ; extra comment
+ ", ""()
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/input-mods.ll b/test/CodeGen/AMDGPU/input-mods.ll
index 1c4d285cbcb1..720790df7e16 100644
--- a/test/CodeGen/AMDGPU/input-mods.ll
+++ b/test/CodeGen/AMDGPU/input-mods.ll
@@ -9,18 +9,16 @@
;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = call float @llvm.fabs.f32(float %r0)
%r2 = fsub float -0.000000e+00, %r1
%r3 = call float @llvm.exp2.f32(float %r2)
%vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @llvm.exp2.f32(float) readnone
declare float @llvm.fabs.f32(float) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 7f9579e59782..367e7f734556 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
@@ -9,168 +9,296 @@
; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?
-; SI-LABEL: {{^}}insertelement_v4f32_0:
-; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]:
-; v_mov_b32_e32
-; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
-; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
-; buffer_store_dwordx4 v{{[}}[[LOW_REG]]:
+; GCN-LABEL: {{^}}insertelement_v4f32_0:
+; GCN: s_load_dwordx4
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}insertelement_v4f32_1:
+; GCN-LABEL: {{^}}insertelement_v4f32_1:
define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}insertelement_v4f32_2:
+; GCN-LABEL: {{^}}insertelement_v4f32_2:
define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}insertelement_v4f32_3:
+; GCN-LABEL: {{^}}insertelement_v4f32_3:
define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}insertelement_v4i32_0:
+; GCN-LABEL: {{^}}insertelement_v4i32_0:
define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
%vecins = insertelement <4 x i32> %a, i32 999, i32 0
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v2f32:
-; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-LABEL: {{^}}insertelement_v3f32_1:
+define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+ %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
+ store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v3f32_2:
+define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+ %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
+ store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v3f32_3:
+define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+ %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
+ store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_to_sgpr:
+; GCN-NOT: v_readfirstlane
+define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {
+ %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef
+ %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
+ %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %tmp2
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
+; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v4f32:
-; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
+; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-DAG: buffer_store_dword v
+define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
+ store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
+; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
-; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
%vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
-; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v16f32:
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
%vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v2i32:
-; SI: buffer_store_dwordx2
+; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
+; GCN: v_movreld_b32
+; GCN: buffer_store_dwordx2
define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i32> %a, i32 5, i32 %b
store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v4i32:
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
+; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 5
+; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-DAG: buffer_store_dword v
+define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
+ store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
+; GCN: v_movreld_b32
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i32> %a, i32 5, i32 %b
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v8i32:
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
+; GCN: v_movreld_b32
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <8 x i32> %a, i32 5, i32 %b
store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v16i32:
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
-; FIXMESI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v16i32:
+; GCN: v_movreld_b32
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <16 x i32> %a, i32 5, i32 %b
store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
ret void
}
-
-; SI-LABEL: {{^}}dynamic_insertelement_v2i16:
-; FIXMESI: buffer_store_dwordx2
+; GCN-LABEL: {{^}}dynamic_insertelement_v2i16:
define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i16> %a, i16 5, i32 %b
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v4i16:
-; FIXMESI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
+define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
+ store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v4i16:
+; GCN: buffer_load_ushort v{{[0-9]+}}, off
+; GCN: buffer_load_ushort v{{[0-9]+}}, off
+; GCN: buffer_load_ushort v{{[0-9]+}}, off
+; GCN: buffer_load_ushort v{{[0-9]+}}, off
+
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:6
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i16> %a, i16 5, i32 %b
- store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16
+ store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
ret void
}
+; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-; SI-LABEL: {{^}}dynamic_insertelement_v2i8:
-; FIXMESI: BUFFER_STORE_USHORT
+; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_short v{{[0-9]+}}, off
define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v4i8:
-; FIXMESI: buffer_store_dword
+; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
+define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
+ store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off
+
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:3
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_dword v{{[0-9]+}}, off
define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
- store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16
+ store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v8i8:
-; FIXMESI: buffer_store_dwordx2
+; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <8 x i8> %a, i8 5, i32 %b
- store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16
+ store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v16i8:
-; FIXMESI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
%vecins = insertelement <16 x i8> %a, i8 5, i32 %b
store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
@@ -179,7 +307,7 @@ define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8>
; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
; the compiler doesn't crash.
-; SI-LABEL: {{^}}insert_split_bb:
+; GCN-LABEL: {{^}}insert_split_bb:
define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
entry:
%0 = insertelement <2 x i32> undef, i32 %a, i32 0
@@ -203,30 +331,30 @@ endif:
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
-; SI: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
-; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
-; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}
+; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
+; GCN: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
+; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; SI: s_mov_b32 m0, [[SCALEDIDX]]
-; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
+; GCN: s_mov_b32 m0, [[SCALEDIDX]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
; Increment to next element.
; FIXME: Should be able to manipulate m0 directly instead of add and
; copy.
-; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
-; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
-; SI-DAG: s_mov_b32 m0, [[IDX1]]
-; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
+; FIXME: Should avoid resetting m0 to same value
+; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
+; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
%vecins = insertelement <2 x double> %a, double 8.0, i32 %b
store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
@@ -234,44 +362,52 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d
}
; FIXME: Inline immediate should be folded into v_movreld_b32.
-; SI-LABEL: {{^}}dynamic_insertelement_v2i64:
+; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:
-; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
-; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}
-; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
-; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
+; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
+; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
%vecins = insertelement <2 x i64> %a, i64 5, i32 %b
store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
ret void
}
+; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
+define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
+ %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
+ store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
+ ret void
+}
+
; FIXME: Should be able to do without stack access. The used stack
; space is also 2x what should be required.
-; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
-; SI: SCRATCH_RSRC_DWORD
+; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
+; GCN: SCRATCH_RSRC_DWORD
; Stack store
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
; Write element
-; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; Stack reload
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; Store result
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-; SI: ScratchSize: 64
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
+; GCN: ScratchSize: 64
define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
%vecins = insertelement <4 x double> %a, double 8.0, i32 %b
@@ -279,29 +415,31 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d
ret void
}
-; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
-; SI: SCRATCH_RSRC_DWORD
+; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
+; GCN: SCRATCH_RSRC_DWORD
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
-; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}
-; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
-; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
-; SI: ScratchSize: 128
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
+; GCN: ScratchSize: 128
define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
%vecins = insertelement <8 x double> %a, double 8.0, i32 %b
store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
ret void
}
+
+declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
new file mode 100644
index 000000000000..c29434f5eca2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -0,0 +1,8 @@
+; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
+define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %stof
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 2a01a621fc42..347170f79e32 100644
--- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -7,8 +7,8 @@
; from constant/invariant memory.
; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_global_pointer_load:
-; GCN: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
+; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
; GCN: buffer_store_dword [[K]], [[PTR]]
define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
%ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0
@@ -21,7 +21,7 @@ define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 add
; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_constant_pointer_load:
; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
-; GCN: buffer_store_dword [[K]], s{{\[}}[[SPTR_LO]]:
+; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]:
define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
%ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
%ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
@@ -32,4 +32,4 @@ define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 a
!0 = !{}
-attributes #0 = { nounwind } \ No newline at end of file
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/jump-address.ll b/test/CodeGen/AMDGPU/jump-address.ll
index f55912e37401..9fde31f922cd 100644
--- a/test/CodeGen/AMDGPU/jump-address.ll
+++ b/test/CodeGen/AMDGPU/jump-address.ll
@@ -4,7 +4,7 @@
; CHECK: EXPORT
; CHECK-NOT: EXPORT
-define void @main() #0 {
+define amdgpu_ps void @main() {
main_body:
%0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
%1 = extractelement <4 x float> %0, i32 0
@@ -36,7 +36,7 @@ ENDIF: ; preds = %IF13, %ELSE, %main_
%17 = insertelement <4 x float> %16, float %temp1.0, i32 1
%18 = insertelement <4 x float> %17, float %temp2.0, i32 2
%19 = insertelement <4 x float> %18, float %temp3.0, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %19, i32 0, i32 0)
ret void
IF13: ; preds = %ELSE
@@ -47,6 +47,4 @@ IF13: ; preds = %ELSE
br label %ENDIF
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/kcache-fold.ll b/test/CodeGen/AMDGPU/kcache-fold.ll
index 7e2291cfdc35..43448fbd7b33 100644
--- a/test/CodeGen/AMDGPU/kcache-fold.ll
+++ b/test/CodeGen/AMDGPU/kcache-fold.ll
@@ -36,15 +36,15 @@ main_body:
%29 = extractelement <4 x float> %28, i32 3
%30 = fcmp ogt float %25, 0.000000e+00
%31 = select i1 %30, float %27, float %29
- %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
- %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
- %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
+ %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
+ %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00)
+ %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00)
%36 = insertelement <4 x float> undef, float %32, i32 0
%37 = insertelement <4 x float> %36, float %33, i32 1
%38 = insertelement <4 x float> %37, float %34, i32 2
%39 = insertelement <4 x float> %38, float %35, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0)
ret void
}
@@ -84,17 +84,17 @@ main_body:
%29 = extractelement <4 x float> %28, i32 2
%30 = fcmp ogt float %25, 0.000000e+00
%31 = select i1 %30, float %27, float %29
- %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
- %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
- %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
+ %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
+ %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00)
+ %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00)
%36 = insertelement <4 x float> undef, float %32, i32 0
%37 = insertelement <4 x float> %36, float %33, i32 1
%38 = insertelement <4 x float> %37, float %34, i32 2
%39 = insertelement <4 x float> %38, float %35, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0)
ret void
}
-declare float @llvm.AMDIL.clamp.(float, float, float) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) readnone
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
new file mode 100644
index 000000000000..21c92dbc9098
--- /dev/null
+++ b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -0,0 +1,44 @@
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Test that the alignment of kernel arguments does not impact the
+; alignment of the stack
+
+; CHECK-LABEL: {{^}}no_args:
+; CHECK: ScratchSize: 8{{$}}
+define void @no_args() {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
+
+; CHECK-LABEL: {{^}}force_align32:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align32(<8 x i32>) {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
+
+; CHECK-LABEL: {{^}}force_align64:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align64(<16 x i32>) {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
+
+; CHECK-LABEL: {{^}}force_align128:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align128(<32 x i32>) {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
+
+; CHECK-LABEL: {{^}}force_align256:
+; CHECK: ScratchSize: 8{{$}}
+define void @force_align256(<64 x i32>) {
+ %alloca = alloca i8
+ store volatile i8 0, i8* %alloca
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index e9d98ac89e72..7567b38e0cea 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
@@ -475,3 +475,55 @@ entry:
; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
; ret void
; }
+
+; FUNC-LABEL: {{^}}i1_arg:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+ store i1 %x, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_zext_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = zext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_zext_i64:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = zext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_sext_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = sext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_sext_i64:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32
+; SI: v_ashrrev_i32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = sext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 84380b421051..099f0639b34c 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
@@ -7,17 +7,19 @@
; ALL-LABEL: {{^}}large_alloca_compute_shader:
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-DAG: ; fixup A - offset: 4, value: SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN-DAG: ; fixup A - offset: 4, value: SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
+; CI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe8f000
+; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000
; GCNHSA: .amd_kernel_code_t
; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
-; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
+; GCNHSA: compute_pgm_rsrc2_user_sgpr = 8
; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
@@ -29,7 +31,7 @@
; GCNHSA: enable_sgpr_queue_ptr = 0
; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
; GCNHSA: enable_sgpr_dispatch_id = 0
-; GCNHSA: enable_sgpr_flat_scratch_init = 0
+; GCNHSA: enable_sgpr_flat_scratch_init = 1
; GCNHSA: enable_sgpr_private_segment_size = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
@@ -39,8 +41,8 @@
; GCNHSA: .end_amd_kernel_code_t
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
; Scratch size = alloca size + emergency stack slot
; ALL: ; ScratchSize: 32772
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index b6f8093313cb..fb0e15eb0cb9 100644
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -2,17 +2,17 @@
; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; ALL-LABEL: {{^}}large_alloca_pixel_shader:
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s10, -1
+; CI-DAG: s_mov_b32 s11, 0xe8f000
+; VI-DAG: s_mov_b32 s11, 0xe80000
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
; ALL: ; ScratchSize: 32772
-define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
+define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
%large = alloca [8192 x i32], align 4
%gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
store volatile i32 %x, i32* %gep
@@ -23,17 +23,17 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
}
; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s10, -1
+; CI-DAG: s_mov_b32 s11, 0xe8f000
+; VI-DAG: s_mov_b32 s11, 0xe80000
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
; ALL: ; ScratchSize: 32772
-define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
+define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 {
%large = alloca [8192 x i32], align 4
%gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
store volatile i32 %x, i32* %gep
@@ -44,4 +44,3 @@ define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
new file mode 100644
index 000000000000..f661939214c0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -0,0 +1,117 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @occupancy_0(
+; CHECK: alloca [5 x i32]
+define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @occupancy_max(
+; CHECK: alloca [5 x i32]
+define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
+attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="3" "amdgpu-max-work-group-size"="256" }
+attributes #2 = { nounwind "amdgpu-max-waves-per-eu"="1" "amdgpu-max-work-group-size"="1600" }
+attributes #3 = { nounwind "amdgpu-max-waves-per-eu"="0" }
+attributes #4 = { nounwind "amdgpu-max-waves-per-eu"="-1" }
diff --git a/test/CodeGen/AMDGPU/large-work-group-registers.ll b/test/CodeGen/AMDGPU/large-work-group-registers.ll
new file mode 100644
index 000000000000..468633da56d8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/large-work-group-registers.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -post-RA-scheduler=0 < %s | FileCheck %s
+
+; CHECK: NumVgprs: 64
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
+main_body:
+ %8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
+ %9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
+ %10 = extractelement <3 x i32> %7, i32 0
+ %11 = extractelement <3 x i32> %7, i32 1
+ %12 = mul i32 %10, %11
+ %bc = bitcast <3 x i32> %7 to <3 x float>
+ %13 = extractelement <3 x float> %bc, i32 1
+ %14 = insertelement <512 x float> undef, float %13, i32 %12
+ call void @llvm.amdgcn.s.barrier()
+ %15 = extractelement <3 x i32> %6, i32 0
+ %16 = extractelement <3 x i32> %7, i32 0
+ %17 = shl i32 %15, 5
+ %18 = add i32 %17, %16
+ %19 = shl i32 %18, 4
+ %20 = extractelement <3 x i32> %7, i32 1
+ %21 = shl i32 %20, 2
+ %22 = sext i32 %21 to i64
+ %23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
+ %24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
+ %25 = load i32, i32 addrspace(3)* %24, align 4
+ %26 = extractelement <512 x float> %14, i32 %25
+ %27 = insertelement <4 x float> undef, float %26, i32 0
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
+
+attributes #0 = { "amdgpu-max-work-group-size"="1024" }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/lds-alignment.ll b/test/CodeGen/AMDGPU/lds-alignment.ll
new file mode 100644
index 000000000000..99334585e589
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -0,0 +1,268 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s
+
+@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
+@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
+
+@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
+@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32
+
+@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
+@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef
+
+declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #0
+
+
+; HSA-LABEL: {{^}}test_no_round_size_1:
+; HSA: workgroup_group_segment_byte_size = 38
+define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
+ ret void
+}
+
+; There are two objects, so one requires padding to be correctly
+; aligned after the other.
+
+; (38 -> 48) + 38 = 86
+
+; I don't think it is necessary to add padding after the last object,
+; since if there were a dynamically sized LDS kernel argument, the
+; runtime should add any alignment padding it needs.
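+;
+; Illustrative arithmetic (not an extra check): the second object must start
+; at round-up(38, 16) = 48, so the total is 48 + 38 = 86 bytes.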
+
+; HSA-LABEL: {{^}}test_round_size_2:
+; HSA: workgroup_group_segment_byte_size = 86
+; HSA: group_segment_alignment = 4
+define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
+
+ %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.1.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.1.bc, i32 38, i32 4, i1 false)
+
+ ret void
+}
+
+; 38 + (10 pad) + 38
+; HSA-LABEL: {{^}}test_round_size_2_align_8:
+; HSA: workgroup_group_segment_byte_size = 86
+; HSA: group_segment_alignment = 4
+define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
+; HSA: workgroup_group_segment_byte_size = 38
+; HSA: group_segment_alignment = 4
+define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
+ ret void
+}
+
+; HSA-LABEL: {{^}}test_round_lds_arg:
+; HSA: workgroup_group_segment_byte_size = 0
+; HSA: group_segment_alignment = 4
+define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
+ ret void
+}
+
+; FIXME: Parameter alignment not considered
+; HSA-LABEL: {{^}}test_high_align_lds_arg:
+; HSA: workgroup_group_segment_byte_size = 0
+; HSA: group_segment_alignment = 4
+define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false)
+ ret void
+}
+
+; (7 * 8) + (39 * 4) = 212
+; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
+; HSA: workgroup_group_segment_byte_size = 212
+; HSA: group_segment_alignment = 4
+define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
+
+ %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
+
+ ret void
+}
+
+; (39 * 4) + (4 pad) + (7 * 8) = 216
+; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
+; HSA: workgroup_group_segment_byte_size = 216
+; HSA: group_segment_alignment = 4
+define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
+
+ %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
+
+ ret void
+}
+; Test how the size needed for padding changes based on when the
+; global is encountered during lowering. There should be a consistent
+; order to minimize padding waste.
+;
+; The way global addresses are lowered now, this is the inverse of
+; first-use order, which isn't great.
+;
+; This should be the optimal order for these globals. If sorted to
+; minimize padding, the order giving the minimum possible size is:
+; align 32, align 8, align 16.
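+;
+; Illustrative arithmetic for that minimum (it matches the 118 bytes seen in
+; test_round_size_3_order3 below): 38 + (2 pad) + 38 + (2 pad) + 38 = 118.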
+
+
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
+; HSA-LABEL: {{^}}test_round_size_3_order0:
+; HSA: workgroup_group_segment_byte_size = 134
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 32, 8, 16
+; 38 + (2 pad) + 38 + (18 pad) + 38 = 134
+; HSA-LABEL: {{^}}test_round_size_3_order1:
+; HSA: workgroup_group_segment_byte_size = 134
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 16, 32, 8
+; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
+; HSA-LABEL: {{^}}test_round_size_3_order2:
+; HSA: workgroup_group_segment_byte_size = 150
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 16, 8, 32
+; 38 + (2 pad) + 38 + (2 pad) + 38 = 118
+; HSA-LABEL: {{^}}test_round_size_3_order3:
+; HSA: workgroup_group_segment_byte_size = 118
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 8, 32, 16
+; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
+; HSA-LABEL: {{^}}test_round_size_3_order4:
+; HSA: workgroup_group_segment_byte_size = 142
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+; align 8, 16, 32
+; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
+; HSA-LABEL: {{^}}test_round_size_3_order5:
+; HSA: workgroup_group_segment_byte_size = 126
+; HSA: group_segment_alignment = 4
+define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+ %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
+
+ %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
+ call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
+
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind }
+attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll
index bf8df63be9fd..9875814b03d3 100644
--- a/test/CodeGen/AMDGPU/lds-initializer.ll
+++ b/test/CodeGen/AMDGPU/lds-initializer.ll
@@ -1,7 +1,7 @@
; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported initializer for address space in load_init_lds_global
+; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space
@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
diff --git a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
new file mode 100644
index 000000000000..0c734c6d99dd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure that m0 is not reinitialized in the loop.
+
+; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
+; GCN: s_cbranch_scc1 BB0_3
+
+; Initialize in preheader
+; GCN: s_mov_b32 m0, -1
+
+; GCN: BB0_2:
+; GCN: ds_read_b32
+; GCN: buffer_store_dword
+
+; GCN: s_cbranch_vccz BB0_2
+
+; GCN: BB0_3:
+; GCN-NEXT: s_endpgm
+define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
+bb:
+ %tmp = icmp sgt i32 %n, 0
+ br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %bb
+ br label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %bb
+ ret void
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %i.01 = phi i32 [ %tmp4, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %i.01
+ %tmp2 = load i32, i32 addrspace(3)* %tmp1, align 4
+ %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %indvars.iv
+ store i32 %tmp2, i32 addrspace(1)* %tmp3, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %tmp4 = add nuw nsw i32 %i.01, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/lds-output-queue.ll b/test/CodeGen/AMDGPU/lds-output-queue.ll
index 44ffc36af149..abe472e423fc 100644
--- a/test/CodeGen/AMDGPU/lds-output-queue.ll
+++ b/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
;
; This test checks that the lds input queue will be empty at the end of
; the ALU clause.
@@ -14,7 +14,7 @@ define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32
entry:
%0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
%1 = load i32, i32 addrspace(3)* %0
- call void @llvm.AMDGPU.barrier.local()
+ call void @llvm.r600.group.barrier()
; This will start a new clause for the vertex fetch
%2 = load i32, i32 addrspace(1)* %in
@@ -23,7 +23,7 @@ entry:
ret void
}
-declare void @llvm.AMDGPU.barrier.local()
+declare void @llvm.r600.group.barrier() nounwind convergent
; The machine scheduler does not do proper alias analysis and assumes that
; loads from global values (Note that a global value is different than a
diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll
index 3e8328659fdb..1607713090e3 100644
--- a/test/CodeGen/AMDGPU/lds-size.ll
+++ b/test/CodeGen/AMDGPU/lds-size.ll
@@ -1,11 +1,17 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
; This test makes sure we do not double count global values when they are
; used in different basic blocks.
-; CHECK: .long 166120
-; CHECK-NEXT: .long 1
-; CHECK-LABEL: {{^}}test:
+; GCN: .long 47180
+; GCN-NEXT: .long 32900
+
+; EG: .long 166120
+; EG-NEXT: .long 1
+; ALL: {{^}}test:
+
+; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only)
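+;
+; Illustrative: @lds is a single i32, so the LDS footprint should be 4 bytes
+; even though the global is used from more than one basic block.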
@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
define void @test(i32 addrspace(1)* %out, i32 %cond) {
diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index fb51bc0e50c2..cb5d73fb0d8b 100644
--- a/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -1,7 +1,7 @@
; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global
+; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space
@lds = addrspace(3) global [256 x i32] zeroinitializer
diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll
index 9d2320cb2d19..82fbb7f46186 100644
--- a/test/CodeGen/AMDGPU/literals.ll
+++ b/test/CodeGen/AMDGPU/literals.ll
@@ -54,11 +54,11 @@ entry:
; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
define void @inline_literal_dot4(float addrspace(1)* %out) {
entry:
- %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ %0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store float %0, float addrspace(1)* %out
ret void
}
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/liveness.mir b/test/CodeGen/AMDGPU/liveness.mir
new file mode 100644
index 000000000000..ce49294d5b36
--- /dev/null
+++ b/test/CodeGen/AMDGPU/liveness.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=amdgcn -run-pass liveintervals -verify-machineinstrs -o /dev/null -debug-only=regalloc %s 2>&1 | FileCheck %s
+# REQUIRES: asserts
+# We currently maintain a main liveness range which operates like a superset of
+# all subregister liveranges. We may need to create additional SSA values at
+# merge points in this main liverange even though none of the subregister
+# liveranges needed it.
+#
+# Should see three distinct value numbers:
+# CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}}
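+#
+# Roughly (inferred from the blocks below): value 0 is the def of sub0 in
+# bb.0, value 1 the def of sub1 in bb.1, and value 2 the merge (phi) of the
+# main range at bb.2.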
+--- |
+ define void @test0() { ret void }
+...
+---
+name: test0
+registers:
+ - { id: 0, class: sreg_64 }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ S_NOP 0, implicit-def undef %0:sub0
+ S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+ S_NOP 0, implicit-def %0:sub1
+ S_NOP 0, implicit %0:sub1
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit %0:sub0
+...
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
deleted file mode 100644
index ca8ddbae9fbc..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone
-
-; Legacy name
-declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}s_abs_i32:
-; SI: s_abs_i32
-
-; EG: SUB_INT
-; EG: MAX_INT
-define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind {
- %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone
- store i32 %abs, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_abs_i32:
-; SI: v_sub_i32_e32
-; SI: v_max_i32_e32
-; SI: s_endpgm
-
-; EG: SUB_INT
-; EG: MAX_INT
-define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
- %val = load i32, i32 addrspace(1)* %src, align 4
- %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone
- store i32 %abs, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}abs_i32_legacy_amdil:
-; SI: v_sub_i32_e32
-; SI: v_max_i32_e32
-; SI: s_endpgm
-
-; EG: SUB_INT
-; EG: MAX_INT
-define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
- %val = load i32, i32 addrspace(1)* %src, align 4
- %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone
- store i32 %abs, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
deleted file mode 100644
index db883972d646..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_barrier_global:
-; EG: GROUP_BARRIER
-; SI: buffer_store_dword
-; SI: s_waitcnt
-; SI: s_barrier
-
-define void @test_barrier_global(i32 addrspace(1)* %out) {
-entry:
- %0 = call i32 @llvm.r600.read.tidig.x()
- %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
- store i32 %0, i32 addrspace(1)* %1
- call void @llvm.AMDGPU.barrier.global()
- %2 = call i32 @llvm.r600.read.local.size.x()
- %3 = sub i32 %2, 1
- %4 = sub i32 %3, %0
- %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
- %6 = load i32, i32 addrspace(1)* %5
- store i32 %6, i32 addrspace(1)* %1
- ret void
-}
-
-declare void @llvm.AMDGPU.barrier.global()
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare i32 @llvm.r600.read.local.size.x() #0
-
-attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
deleted file mode 100644
index 48fb2e0b1a8d..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_barrier_local:
-; EG: GROUP_BARRIER
-
-; SI: buffer_store_dword
-; SI: s_waitcnt
-; SI: s_barrier
-
-define void @test_barrier_local(i32 addrspace(1)* %out) {
-entry:
- %0 = call i32 @llvm.r600.read.tidig.x()
- %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
- store i32 %0, i32 addrspace(1)* %1
- call void @llvm.AMDGPU.barrier.local()
- %2 = call i32 @llvm.r600.read.local.size.x()
- %3 = sub i32 %2, 1
- %4 = sub i32 %3, %0
- %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
- %6 = load i32, i32 addrspace(1)* %5
- store i32 %6, i32 addrspace(1)* %1
- ret void
-}
-
-declare void @llvm.AMDGPU.barrier.local()
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare i32 @llvm.r600.read.local.size.x() #0
-
-attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll
deleted file mode 100644
index 517a55abc098..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfi_arg_arg_arg:
-; SI: v_bfi_b32
-; EG: BFI_INT
-define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
- store i32 %bfi, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfi_arg_arg_imm:
-; SI: v_bfi_b32
-; EG: BFI_INT
-define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone
- store i32 %bfi, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfi_arg_imm_arg:
-; SI: v_bfi_b32
-; EG: BFI_INT
-define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
- %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone
- store i32 %bfi, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfi_imm_arg_arg:
-; SI: v_bfi_b32
-; EG: BFI_INT
-define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
- %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone
- store i32 %bfi, i32 addrspace(1)* %out, align 4
- ret void
-}
-
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll
deleted file mode 100644
index 50492289d744..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfm_arg_arg:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; EG: BFM_INT
-define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone
- store i32 %bfm, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_arg_imm:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b
-; EG: BFM_INT
-define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
- %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone
- store i32 %bfm, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_imm_arg:
-; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}}
-; EG: BFM_INT
-define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
- %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone
- store i32 %bfm, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_imm_imm:
-; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8
-; EG: BFM_INT
-define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind {
- %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone
- store i32 %bfm, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_pattern:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) {
- %a = shl i32 1, %x
- %b = sub i32 %a, 1
- %c = shl i32 %b, %y
- store i32 %c, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}bfm_pattern_simple:
-; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
-define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) {
- %a = shl i32 1, %x
- %b = sub i32 %a, 1
- store i32 %b, i32 addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
index 11ec963ab314..cfe4cc00ee81 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
@@ -1,10 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare float @llvm.fabs.f32(float) nounwind readnone
declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
-declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone
; FUNC-LABEL: {{^}}clamp_0_1_f32:
; SI: s_load_dword [[ARG:s[0-9]+]],
@@ -55,13 +54,3 @@ define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounw
store float %clamp, float addrspace(1)* %out, align 4
ret void
}
-
-; FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
- %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
- store float %clamp, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
index e95a51093cb7..78b88122229b 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
@@ -1,59 +1,57 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; CHECK: {{^}}cube:
+; CHECK-LABEL: {{^}}cube:
; CHECK: CUBE T{{[0-9]}}.X
; CHECK: CUBE T{{[0-9]}}.Y
; CHECK: CUBE T{{[0-9]}}.Z
; CHECK: CUBE * T{{[0-9]}}.W
-define void @cube() #0 {
+define amdgpu_ps void @cube() {
main_body:
- %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %1 = extractelement <4 x float> %0, i32 3
- %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %3 = extractelement <4 x float> %2, i32 0
- %4 = fdiv float %3, %1
- %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %6 = extractelement <4 x float> %5, i32 1
- %7 = fdiv float %6, %1
- %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
- %9 = extractelement <4 x float> %8, i32 2
- %10 = fdiv float %9, %1
- %11 = insertelement <4 x float> undef, float %4, i32 0
- %12 = insertelement <4 x float> %11, float %7, i32 1
- %13 = insertelement <4 x float> %12, float %10, i32 2
- %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3
- %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14)
- %16 = extractelement <4 x float> %15, i32 0
- %17 = extractelement <4 x float> %15, i32 1
- %18 = extractelement <4 x float> %15, i32 2
- %19 = extractelement <4 x float> %15, i32 3
- %20 = call float @fabs(float %18)
- %21 = fdiv float 1.000000e+00, %20
- %22 = fmul float %16, %21
- %23 = fadd float %22, 1.500000e+00
- %24 = fmul float %17, %21
- %25 = fadd float %24, 1.500000e+00
- %26 = insertelement <4 x float> undef, float %25, i32 0
- %27 = insertelement <4 x float> %26, float %23, i32 1
- %28 = insertelement <4 x float> %27, float %19, i32 2
- %29 = insertelement <4 x float> %28, float %25, i32 3
- %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4)
- call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0)
+ %tmp = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp1 = extractelement <4 x float> %tmp, i32 3
+ %tmp2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp3 = extractelement <4 x float> %tmp2, i32 0
+ %tmp4 = fdiv float %tmp3, %tmp1
+ %tmp5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp6 = extractelement <4 x float> %tmp5, i32 1
+ %tmp7 = fdiv float %tmp6, %tmp1
+ %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %tmp9 = extractelement <4 x float> %tmp8, i32 2
+ %tmp10 = fdiv float %tmp9, %tmp1
+ %tmp11 = insertelement <4 x float> undef, float %tmp4, i32 0
+ %tmp12 = insertelement <4 x float> %tmp11, float %tmp7, i32 1
+ %tmp13 = insertelement <4 x float> %tmp12, float %tmp10, i32 2
+ %tmp14 = insertelement <4 x float> %tmp13, float 1.000000e+00, i32 3
+ %tmp15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp14)
+ %tmp16 = extractelement <4 x float> %tmp15, i32 0
+ %tmp17 = extractelement <4 x float> %tmp15, i32 1
+ %tmp18 = extractelement <4 x float> %tmp15, i32 2
+ %tmp19 = extractelement <4 x float> %tmp15, i32 3
+ %tmp20 = call float @llvm.fabs.f32(float %tmp18)
+ %tmp21 = fdiv float 1.000000e+00, %tmp20
+ %tmp22 = fmul float %tmp16, %tmp21
+ %tmp23 = fadd float %tmp22, 1.500000e+00
+ %tmp24 = fmul float %tmp17, %tmp21
+ %tmp25 = fadd float %tmp24, 1.500000e+00
+ %tmp26 = insertelement <4 x float> undef, float %tmp25, i32 0
+ %tmp27 = insertelement <4 x float> %tmp26, float %tmp23, i32 1
+ %tmp28 = insertelement <4 x float> %tmp27, float %tmp19, i32 2
+ %tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 3
+ %tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp31, i32 0, i32 0)
ret void
}
; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
-; Function Attrs: readnone
-declare float @fabs(float) #1
+; Function Attrs: nounwind readnone
+declare float @llvm.fabs.f32(float) #0
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll
deleted file mode 100644
index 8b32f696449e..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone
-declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone
-declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone
-declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone
-
-; SI-LABEL: {{^}}test_unpack_byte0_to_float:
-; SI: v_cvt_f32_ubyte0
-define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
- %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}test_unpack_byte1_to_float:
-; SI: v_cvt_f32_ubyte1
-define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
- %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}test_unpack_byte2_to_float:
-; SI: v_cvt_f32_ubyte2
-define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
- %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}test_unpack_byte3_to_float:
-; SI: v_cvt_f32_ubyte3
-define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
- %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone
- store float %cvt, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll
deleted file mode 100644
index 6049dca04012..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-
-declare double @llvm.fabs.f64(double %Val)
-declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone
-
-; FUNC-LABEL: {{^}}fract_f64:
-; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
-; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
-; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
-; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
-; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
-; CI: buffer_store_dwordx2 [[FRC]]
-define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
- %val = load double, double addrspace(1)* %src, align 4
- %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone
- store double %fract, double addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f64_neg:
-; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
-; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
-; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
-; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
-; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
-; CI: buffer_store_dwordx2 [[FRC]]
-define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
- %val = load double, double addrspace(1)* %src, align 4
- %neg = fsub double 0.0, %val
- %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
- store double %fract, double addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f64_neg_abs:
-; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
-; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
-; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
-; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
-; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]]
-; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]]
-; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
-; CI: buffer_store_dwordx2 [[FRC]]
-define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
- %val = load double, double addrspace(1)* %src, align 4
- %abs = call double @llvm.fabs.f64(double %val)
- %neg = fsub double 0.0, %abs
- %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
- store double %fract, double addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll
deleted file mode 100644
index 7501b4b75465..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.fabs.f32(float %Val)
-declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone
-
-; Legacy name
-declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}fract_f32:
-; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
-; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
-; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
-; GCN: buffer_store_dword [[RESULT]]
-; EG: FRACT
-define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
- %val = load float, float addrspace(1)* %src, align 4
- %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone
- store float %fract, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f32_legacy_amdil:
-; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
-; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
-; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
-; GCN: buffer_store_dword [[RESULT]]
-; EG: FRACT
-define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
- %val = load float, float addrspace(1)* %src, align 4
- %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone
- store float %fract, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f32_neg:
-; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
-; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]]
-; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]]
-; GCN: buffer_store_dword [[RESULT]]
-; EG: FRACT
-define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
- %val = load float, float addrspace(1)* %src, align 4
- %neg = fsub float 0.0, %val
- %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
- store float %fract, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}fract_f32_neg_abs:
-; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
-; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
-; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]]
-; GCN: buffer_store_dword [[RESULT]]
-; EG: FRACT
-define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
- %val = load float, float addrspace(1)* %src, align 4
- %abs = call float @llvm.fabs.f32(float %val)
- %neg = fsub float 0.0, %abs
- %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
- store float %fract, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll
deleted file mode 100644
index 42102e30f071..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-; FIXME: Store of i32 seems to be broken pre-EG somehow?
-
-declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_imad24:
-; SI: v_mad_i32_i24
-; CM: MULADD_INT24
-; R600: MULLO_INT
-; R600: ADD_INT
-define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
- store i32 %mad, i32 addrspace(1)* %out, align 4
- ret void
-}
-
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll
deleted file mode 100644
index 46662f96c290..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}vector_imax:
-; SI: v_max_i32_e32
-define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
-main_body:
- %load = load i32, i32 addrspace(1)* %in, align 4
- %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load)
- %bc = bitcast i32 %max to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}scalar_imax:
-; SI: s_max_i32
-define void @scalar_imax(i32 %p0, i32 %p1) #0 {
-entry:
- %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1)
- %bc = bitcast i32 %max to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; Function Attrs: readnone
-declare i32 @llvm.AMDGPU.imax(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll
deleted file mode 100644
index 34b454e23755..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}vector_imin:
-; SI: v_min_i32_e32
-define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
-main_body:
- %load = load i32, i32 addrspace(1)* %in, align 4
- %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load)
- %bc = bitcast i32 %min to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}scalar_imin:
-; SI: s_min_i32
-define void @scalar_imin(i32 %p0, i32 %p1) #0 {
-entry:
- %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1)
- %bc = bitcast i32 %min to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; Function Attrs: readnone
-declare i32 @llvm.AMDGPU.imin(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll
deleted file mode 100644
index fdc1172260b9..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_imul24:
-; SI: v_mul_i32_i24
-; CM: MUL_INT24
-; R600: MULLO_INT
-define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone
- store i32 %mul, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
index 057708e7b5cc..59997d27683d 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
@@ -5,7 +5,7 @@
; SI-NOT: v_cmpx_le_f32
; SI: s_mov_b64 exec, 0
-define void @kill_gs_const() #0 {
+define amdgpu_gs void @kill_gs_const() {
main_body:
%0 = icmp ule i32 0, 3
%1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
@@ -21,7 +21,7 @@ main_body:
; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 {
+define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
entry:
%tmp0 = fcmp olt float %13, 0.0
call void @llvm.AMDGPU.kill(float %14)
@@ -33,7 +33,4 @@ entry:
declare void @llvm.AMDGPU.kill(float)
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="2" }
-attributes #1 = { "ShaderType"="0" }
-
!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll
deleted file mode 100644
index a59c0ce6d675..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone
-declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone
-
-; SI-LABEL: {{^}}test_ldexp_f32:
-; SI: v_ldexp_f32
-; SI: s_endpgm
-define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
- %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone
- store float %result, float addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}test_ldexp_f64:
-; SI: v_ldexp_f64
-; SI: s_endpgm
-define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
- %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone
- store double %result, double addrspace(1)* %out, align 8
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll
deleted file mode 100644
index 4cafd563685e..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}rsq_legacy_f32:
-; SI: v_rsq_legacy_f32_e32
-; EG: RECIPSQRT_IEEE
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
- %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone
- store float %rsq, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll
deleted file mode 100644
index 83b56a5029d3..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = extractelement <4 x float> %reg0, i32 1
- %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1)
- %vec = insertelement <4 x float> undef, float %r2, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
- ret void
-}
-
-declare float @llvm.AMDGPU.mul(float ,float ) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll
deleted file mode 100644
index d2a655bf909c..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
-declare double @llvm.sqrt.f64(double) nounwind readnone
-
-; FUNC-LABEL: {{^}}rcp_f64:
-; SI: v_rcp_f64_e32
-define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
- %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
- store double %rcp, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}rcp_pat_f64:
-; SI: v_rcp_f64_e32
-define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
- %rcp = fdiv double 1.0, %src
- store double %rcp, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}rsq_rcp_pat_f64:
-; SI-UNSAFE: v_rsq_f64_e32
-; SI-SAFE-NOT: v_rsq_f64_e32
-; SI-SAFE: v_sqrt_f64
-; SI-SAFE: v_rcp_f64
-define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
- %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
- %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
- store double %rcp, double addrspace(1)* %out, align 8
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll
deleted file mode 100644
index edd6e9a72f1b..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
-
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
-declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
-
-declare float @llvm.sqrt.f32(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}rcp_f32:
-; SI: v_rcp_f32_e32
-; EG: RECIP_IEEE
-define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind {
- %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone
- store float %rcp, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FIXME: Evergreen only ever does unsafe fp math.
-; FUNC-LABEL: {{^}}rcp_pat_f32:
-
-; SI-SAFE: v_rcp_f32_e32
-; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32
-
-; EG: RECIP_IEEE
-
-define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
- %rcp = fdiv float 1.0, %src
- store float %rcp, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}rsq_rcp_pat_f32:
-; SI-UNSAFE: v_rsq_f32_e32
-; SI-SAFE: v_sqrt_f32_e32
-; SI-SAFE: v_rcp_f32_e32
-
-; EG: RECIPSQRT_IEEE
-define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
- %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
- %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
- store float %rcp, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
deleted file mode 100644
index 67f1d22c7178..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
-
-declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
-
-; FUNC-LABEL: {{^}}rsq_clamped_f64:
-; SI: v_rsq_clamp_f64_e32
-
-; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
-; TODO: this constant should be folded:
-; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
-; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
-; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
-; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
-; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
-; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
-; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
-
-define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
- %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
- store double %rsq_clamped, double addrspace(1)* %out, align 8
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll
deleted file mode 100644
index eeff2536b232..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-
-declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}rsq_clamped_f32:
-; SI: v_rsq_clamp_f32_e32
-
-; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}}
-; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
-; TODO: this constant should be folded:
-; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff
-; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]]
-
-; EG: RECIPSQRT_CLAMPED
-
-define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
- %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone
- store float %rsq_clamped, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll
deleted file mode 100644
index 36b72f14db19..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
-
-; FUNC-LABEL: {{^}}rsq_f32:
-; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-; EG: RECIPSQRT_IEEE
-define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
- %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
- store float %rsq, float addrspace(1)* %out, align 4
- ret void
-}
-
-; TODO: Really these should be constant folded
-; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
-; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
-; EG: RECIPSQRT_IEEE
-define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind {
- %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone
- store float %rsq, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
-; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
-; EG: RECIPSQRT_IEEE
-define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind {
- %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone
- store float %rsq, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
deleted file mode 100644
index 10206609bb57..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
-;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
-
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
- %addr = load <4 x float>, <4 x float> addrspace(1)* %in
- %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1)
- %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2)
- %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3)
- %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4)
- %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5)
- %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6)
- %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, i32 0, i32 7)
- %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8)
- %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9)
- %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10)
- %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11)
- %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12)
- %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13)
- %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14)
- %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15)
- %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16)
- store <4 x float> %res16, <4 x float> addrspace(1)* %out
- ret void
-}
-
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
deleted file mode 100644
index a30a8e083eb6..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
-
-; R600: {{^}}amdgpu_trunc:
-; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI: {{^}}amdgpu_trunc:
-; SI: v_trunc_f32
-
-define void @amdgpu_trunc(float addrspace(1)* %out, float %x) {
-entry:
- %0 = call float @llvm.AMDGPU.trunc(float %x)
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-declare float @llvm.AMDGPU.trunc(float ) readnone
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll
deleted file mode 100644
index 77a073b0cb03..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-
-; FUNC-LABEL: {{^}}test_umad24:
-; SI: v_mad_u32_u24
-; EG: MULADD_UINT24
-; R600: MULLO_UINT
-; R600: ADD_INT
-define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
- store i32 %mad, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}commute_umad24:
-; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
-; SI: buffer_store_dword [[RESULT]]
-define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1
-
- %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4
- %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4
- %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
- store i32 %mad, i32 addrspace(1)* %out.gep, align 4
- ret void
-}
-
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll
deleted file mode 100644
index a97d103016d3..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}vector_umax:
-; SI: v_max_u32_e32
-define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
-main_body:
- %load = load i32, i32 addrspace(1)* %in, align 4
- %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load)
- %bc = bitcast i32 %max to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}scalar_umax:
-; SI: s_max_u32
-define void @scalar_umax(i32 %p0, i32 %p1) #0 {
-entry:
- %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1)
- %bc = bitcast i32 %max to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}trunc_zext_umax:
-; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
-; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
-; SI-NOT: and
-; SI: buffer_store_short [[RESULT]],
-define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
- %tmp5 = load i8, i8 addrspace(1)* %src, align 1
- %tmp2 = zext i8 %tmp5 to i32
- %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone
- %tmp4 = trunc i32 %tmp3 to i8
- %tmp6 = zext i8 %tmp4 to i16
- store i16 %tmp6, i16 addrspace(1)* %out, align 2
- ret void
-}
-
-; Function Attrs: readnone
-declare i32 @llvm.AMDGPU.umax(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll
deleted file mode 100644
index 2acd10e0c631..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}vector_umin:
-; SI: v_min_u32_e32
-define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
-main_body:
- %load = load i32, i32 addrspace(1)* %in, align 4
- %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load)
- %bc = bitcast i32 %min to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}scalar_umin:
-; SI: s_min_u32
-define void @scalar_umin(i32 %p0, i32 %p1) #0 {
-entry:
- %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1)
- %bc = bitcast i32 %min to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
- ret void
-}
-
-; SI-LABEL: {{^}}trunc_zext_umin:
-; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
-; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
-; SI-NOT: and
-; SI: buffer_store_short [[RESULT]],
-define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
- %tmp5 = load i8, i8 addrspace(1)* %src, align 1
- %tmp2 = zext i8 %tmp5 to i32
- %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone
- %tmp4 = trunc i32 %tmp3 to i8
- %tmp6 = zext i8 %tmp4 to i16
- store i16 %tmp6, i16 addrspace(1)* %out, align 2
- ret void
-}
-
-; Function Attrs: readnone
-declare i32 @llvm.AMDGPU.umin(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll
deleted file mode 100644
index 76624a078b3a..000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_umul24:
-; SI: v_mul_u32_u24
-; R600: MUL_UINT24
-; R600: MULLO_UINT
-define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone
- store i32 %mul, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
index fdc324087015..ca1faebb77e7 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
@@ -10,7 +10,7 @@
;GCN: v_interp_p1_f32
;GCN: v_interp_p2_f32
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 {
+define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) {
main_body:
%5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
%6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4)
@@ -25,7 +25,7 @@ main_body:
; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug:
; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
-define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
+define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
main_body:
%22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7)
%23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
@@ -42,19 +42,18 @@ main_body:
}
; Function Attrs: readnone
-declare float @fabs(float) #2
+declare float @fabs(float) #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
+declare i32 @llvm.SI.packf16(float, float) #0
; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.constant(i32, i32, i32) #1
+declare float @llvm.SI.fs.constant(i32, i32, i32) #0
; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
index 275cb580bc9b..aef9f660436e 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
@@ -2,10 +2,10 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}gather4_v2:
-;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_v2() #0 {
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_v2() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -15,10 +15,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4:
-;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4() #0 {
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -28,10 +28,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_cl:
-;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_cl() #0 {
+;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_cl() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -41,10 +41,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_l:
-;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_l() #0 {
+;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_l() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -54,10 +54,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b:
-;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b() #0 {
+;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -67,10 +67,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_cl:
-;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_cl() #0 {
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_cl() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -80,10 +80,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_cl_v8:
-;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_cl_v8() #0 {
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_cl_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -93,10 +93,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_lz_v2:
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_lz_v2() #0 {
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_lz_v2() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -106,10 +106,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_lz:
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_lz() #0 {
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_lz() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -121,10 +121,10 @@ main_body:
;CHECK-LABEL: {{^}}gather4_o:
-;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_o() #0 {
+;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -134,10 +134,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_cl_o:
-;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_cl_o() #0 {
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_cl_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -147,10 +147,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_cl_o_v8:
-;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_cl_o_v8() #0 {
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_cl_o_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -160,10 +160,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_l_o:
-;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_l_o() #0 {
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_l_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -173,10 +173,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_l_o_v8:
-;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_l_o_v8() #0 {
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_l_o_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -186,10 +186,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_o:
-;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_o() #0 {
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -199,10 +199,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_o_v8:
-;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_o_v8() #0 {
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_o_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -212,10 +212,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_b_cl_o:
-;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_b_cl_o() #0 {
+;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_b_cl_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -225,10 +225,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_lz_o:
-;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_lz_o() #0 {
+;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_lz_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -240,10 +240,10 @@ main_body:
;CHECK-LABEL: {{^}}gather4_c:
-;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c() #0 {
+;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -253,10 +253,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_cl:
-;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_cl() #0 {
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_cl() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -266,10 +266,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_cl_v8:
-;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_cl_v8() #0 {
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_cl_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -279,10 +279,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_l:
-;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_l() #0 {
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_l() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -292,10 +292,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_l_v8:
-;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_l_v8() #0 {
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_l_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -305,10 +305,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b:
-;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b() #0 {
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -318,10 +318,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b_v8:
-;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b_v8() #0 {
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -331,10 +331,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b_cl:
-;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b_cl() #0 {
+;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b_cl() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -344,10 +344,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_lz:
-;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_lz() #0 {
+;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_lz() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -359,10 +359,10 @@ main_body:
;CHECK-LABEL: {{^}}gather4_c_o:
-;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_o() #0 {
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -372,10 +372,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_o_v8:
-;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_o_v8() #0 {
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_o_v8() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -385,10 +385,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_cl_o:
-;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_cl_o() #0 {
+;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_cl_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -398,10 +398,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_l_o:
-;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_l_o() #0 {
+;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_l_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -411,10 +411,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b_o:
-;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b_o() #0 {
+;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -424,10 +424,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_b_cl_o:
-;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_b_cl_o() #0 {
+;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_b_cl_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -437,10 +437,10 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_lz_o:
-;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_lz_o() #0 {
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_lz_o() {
main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
%r2 = extractelement <4 x float> %r, i32 2
@@ -450,60 +450,76 @@ main_body:
}
;CHECK-LABEL: {{^}}gather4_c_lz_o_v8:
-;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @gather4_c_lz_o_v8() #0 {
-main_body:
- %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
- %r0 = extractelement <4 x float> %r, i32 0
- %r1 = extractelement <4 x float> %r, i32 1
- %r2 = extractelement <4 x float> %r, i32 2
- %r3 = extractelement <4 x float> %r, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
- ret void
-}
-
-
-
-declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
+define amdgpu_ps void @gather4_c_lz_o_v8() {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_sgpr_bug:
+;
+; This crashed at some point due to a bug in FixSGPRCopies. Derived from the
+; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877
+;
+;CHECK: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
+;CHECK: s_waitcnt lgkmcnt(0)
+;CHECK: s_mov_b32 s[[LO]], 0
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8
+define amdgpu_ps float @gather4_sgpr_bug() {
+main_body:
+ %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef, align 16
+ %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
+ %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp4 = extractelement <4 x float> %tmp2, i32 1
+ %tmp9 = fadd float undef, %tmp4
+ ret float %tmp9
+}
+
+declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
index 06ee98e91b31..ac34d31b97c1 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
@@ -2,10 +2,10 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}getlod:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @getlod() #0 {
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
+define amdgpu_ps void @getlod() {
main_body:
- %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
@@ -13,10 +13,10 @@ main_body:
}
;CHECK-LABEL: {{^}}getlod_v2:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @getlod_v2() #0 {
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
+define amdgpu_ps void @getlod_v2() {
main_body:
- %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
@@ -24,10 +24,10 @@ main_body:
}
;CHECK-LABEL: {{^}}getlod_v4:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @getlod_v4() #0 {
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
+define amdgpu_ps void @getlod_v4() {
main_body:
- %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
%r1 = extractelement <4 x float> %r, i32 1
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
@@ -35,11 +35,10 @@ main_body:
}
-declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getlod.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.ll b/test/CodeGen/AMDGPU/llvm.SI.image.ll
index 0fac8d799562..50341e3e207f 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.image.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.ll
@@ -2,8 +2,8 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}image_load:
-;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @image_load() #0 {
+;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @image_load() {
main_body:
%r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -15,8 +15,8 @@ main_body:
}
;CHECK-LABEL: {{^}}image_load_mip:
-;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @image_load_mip() #0 {
+;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @image_load_mip() {
main_body:
%r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -28,8 +28,8 @@ main_body:
}
;CHECK-LABEL: {{^}}getresinfo:
-;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @getresinfo() #0 {
+;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @getresinfo() {
main_body:
%r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -40,11 +40,10 @@ main_body:
ret void
}
-declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll
index ce9558cbf81d..7cdd9559994e 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll
@@ -2,11 +2,11 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s
; CHECK-LABEL: {{^}}v1:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13
-define void @v1(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xd
+define amdgpu_ps void @v1(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 0
%3 = extractelement <4 x float> %1, i32 2
%4 = extractelement <4 x float> %1, i32 3
@@ -15,11 +15,11 @@ entry:
}
; CHECK-LABEL: {{^}}v2:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11
-define void @v2(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xb
+define amdgpu_ps void @v2(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 0
%3 = extractelement <4 x float> %1, i32 1
%4 = extractelement <4 x float> %1, i32 3
@@ -28,11 +28,11 @@ entry:
}
; CHECK-LABEL: {{^}}v3:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
-define void @v3(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xe
+define amdgpu_ps void @v3(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 1
%3 = extractelement <4 x float> %1, i32 2
%4 = extractelement <4 x float> %1, i32 3
@@ -41,11 +41,11 @@ entry:
}
; CHECK-LABEL: {{^}}v4:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7
-define void @v4(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x7
+define amdgpu_ps void @v4(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 0
%3 = extractelement <4 x float> %1, i32 1
%4 = extractelement <4 x float> %1, i32 2
@@ -54,11 +54,11 @@ entry:
}
; CHECK-LABEL: {{^}}v5:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
-define void @v5(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xa
+define amdgpu_ps void @v5(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 1
%3 = extractelement <4 x float> %1, i32 3
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
@@ -66,11 +66,11 @@ entry:
}
; CHECK-LABEL: {{^}}v6:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6
-define void @v6(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x6
+define amdgpu_ps void @v6(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 1
%3 = extractelement <4 x float> %1, i32 2
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
@@ -78,19 +78,17 @@ entry:
}
; CHECK-LABEL: {{^}}v7:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9
-define void @v7(i32 %a1) #0 {
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x9
+define amdgpu_ps void @v7(i32 %a1) {
entry:
%0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%2 = extractelement <4 x float> %1, i32 0
%3 = extractelement <4 x float> %1, i32 3
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
ret void
}
-declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+declare <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
index 4bc638a28063..60077dc218fd 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
@@ -3,8 +3,8 @@
;CHECK-LABEL: {{^}}sample:
;CHECK: s_wqm
-;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample() #0 {
+;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -17,8 +17,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cl:
;CHECK: s_wqm
-;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cl() #0 {
+;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -31,8 +31,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_d:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_d() #0 {
+;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_d() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -45,8 +45,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_d_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_d_cl() #0 {
+;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_d_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -59,8 +59,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_l:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_l() #0 {
+;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_l() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -73,8 +73,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_b:
;CHECK: s_wqm
-;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_b() #0 {
+;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_b() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -87,8 +87,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_b_cl:
;CHECK: s_wqm
-;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_b_cl() #0 {
+;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_b_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -101,8 +101,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_lz:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_lz() #0 {
+;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_lz() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -115,8 +115,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cd:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cd() #0 {
+;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cd() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -129,8 +129,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cd_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cd_cl() #0 {
+;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cd_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -143,8 +143,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c:
;CHECK: s_wqm
-;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c() #0 {
+;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -157,8 +157,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cl:
;CHECK: s_wqm
-;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cl() #0 {
+;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -171,8 +171,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_d:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_d() #0 {
+;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_d() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -185,8 +185,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_d_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_d_cl() #0 {
+;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_d_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -199,8 +199,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_l:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_l() #0 {
+;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_l() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -213,8 +213,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_b:
;CHECK: s_wqm
-;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_b() #0 {
+;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_b() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -227,8 +227,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_b_cl:
;CHECK: s_wqm
-;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_b_cl() #0 {
+;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_b_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -241,8 +241,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_lz:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_lz() #0 {
+;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_lz() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -255,8 +255,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cd:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cd() #0 {
+;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cd() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -269,8 +269,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cd_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cd_cl() #0 {
+;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cd_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -282,29 +282,28 @@ main_body:
}
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
index 9d8935414ed9..34d4f6825690 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
@@ -3,8 +3,8 @@
;CHECK-LABEL: {{^}}sample:
;CHECK: s_wqm
-;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample() #0 {
+;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -17,8 +17,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cl:
;CHECK: s_wqm
-;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cl() #0 {
+;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -31,8 +31,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_d:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_d() #0 {
+;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_d() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -45,8 +45,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_d_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_d_cl() #0 {
+;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_d_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -59,8 +59,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_l:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_l() #0 {
+;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_l() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -73,8 +73,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_b:
;CHECK: s_wqm
-;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_b() #0 {
+;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_b() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -87,8 +87,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_b_cl:
;CHECK: s_wqm
-;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_b_cl() #0 {
+;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_b_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -101,8 +101,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_lz:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_lz() #0 {
+;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_lz() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -115,8 +115,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cd:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cd() #0 {
+;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cd() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -129,8 +129,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_cd_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_cd_cl() #0 {
+;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_cd_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -143,8 +143,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c:
;CHECK: s_wqm
-;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c() #0 {
+;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -157,8 +157,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cl:
;CHECK: s_wqm
-;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cl() #0 {
+;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -171,8 +171,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_d:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_d() #0 {
+;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_d() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -185,8 +185,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_d_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_d_cl() #0 {
+;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_d_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -199,8 +199,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_l:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_l() #0 {
+;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_l() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -213,8 +213,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_b:
;CHECK: s_wqm
-;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_b() #0 {
+;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_b() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -227,8 +227,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_b_cl:
;CHECK: s_wqm
-;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_b_cl() #0 {
+;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_b_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -241,8 +241,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_lz:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_lz() #0 {
+;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_lz() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -255,8 +255,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cd:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cd() #0 {
+;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cd() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -269,8 +269,8 @@ main_body:
;CHECK-LABEL: {{^}}sample_c_cd_cl:
;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @sample_c_cd_cl() #0 {
+;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @sample_c_cd_cl() {
main_body:
%r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%r0 = extractelement <4 x float> %r, i32 0
@@ -282,29 +282,28 @@ main_body:
}
-declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.imageload.ll b/test/CodeGen/AMDGPU/llvm.SI.imageload.ll
deleted file mode 100644
index b67716c3b665..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.imageload.ll
+++ /dev/null
@@ -1,132 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
-;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
-;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
-;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1
-
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
- %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
- %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
- %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
- %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
- %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
- %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
- %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
- %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
- %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
- %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
- %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1,
- <32 x i8> undef, i32 1)
- %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2,
- <32 x i8> undef, i32 2)
- %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3,
- <32 x i8> undef, i32 3)
- %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4,
- <32 x i8> undef, i32 4)
- %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5,
- <32 x i8> undef, i32 5)
- %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v6,
- <32 x i8> undef, i32 6)
- %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10,
- <32 x i8> undef, i32 10)
- %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11,
- <32 x i8> undef, i32 11)
- %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15,
- <32 x i8> undef, i32 15)
- %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16,
- <32 x i8> undef, i32 16)
- %e1 = extractelement <4 x i32> %res1, i32 0
- %e2 = extractelement <4 x i32> %res2, i32 1
- %e3 = extractelement <4 x i32> %res3, i32 2
- %e4 = extractelement <4 x i32> %res4, i32 3
- %t0 = extractelement <4 x i32> %res5, i32 0
- %t1 = extractelement <4 x i32> %res5, i32 1
- %e5 = add i32 %t0, %t1
- %t2 = extractelement <4 x i32> %res6, i32 0
- %t3 = extractelement <4 x i32> %res6, i32 2
- %e6 = add i32 %t2, %t3
- %t10 = extractelement <4 x i32> %res10, i32 2
- %t11 = extractelement <4 x i32> %res10, i32 3
- %e10 = add i32 %t10, %t11
- %t12 = extractelement <4 x i32> %res11, i32 0
- %t13 = extractelement <4 x i32> %res11, i32 1
- %t14 = extractelement <4 x i32> %res11, i32 2
- %t15 = add i32 %t12, %t13
- %e11 = add i32 %t14, %t15
- %t28 = extractelement <4 x i32> %res15, i32 0
- %t29 = extractelement <4 x i32> %res15, i32 1
- %t30 = extractelement <4 x i32> %res15, i32 2
- %t31 = extractelement <4 x i32> %res15, i32 3
- %t32 = add i32 %t28, %t29
- %t33 = add i32 %t30, %t31
- %e15 = add i32 %t32, %t33
- %e16 = extractelement <4 x i32> %res16, i32 3
- %s1 = add i32 %e1, %e2
- %s2 = add i32 %s1, %e3
- %s3 = add i32 %s2, %e4
- %s4 = add i32 %s3, %e5
- %s5 = add i32 %s4, %e6
- %s9 = add i32 %s5, %e10
- %s10 = add i32 %s9, %e11
- %s14 = add i32 %s10, %e15
- %s15 = add i32 %s14, %e16
- %s16 = bitcast i32 %s15 to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16)
- ret void
-}
-
-; Test that coordinates are stored in vgprs and not sgprs
-; CHECK: vgpr_coords
-; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
-define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-main_body:
- %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0
- %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2
- %22 = getelementptr float, float addrspace(2)* %21, i32 0
- %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1
- %24 = getelementptr float, float addrspace(2)* %21, i32 1
- %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1
- %26 = getelementptr float, float addrspace(2)* %21, i32 4
- %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1
- %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
- %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2
- %30 = bitcast float %27 to i32
- %31 = bitcast float %23 to i32
- %32 = bitcast float %25 to i32
- %33 = insertelement <4 x i32> undef, i32 %31, i32 0
- %34 = insertelement <4 x i32> %33, i32 %32, i32 1
- %35 = insertelement <4 x i32> %34, i32 %30, i32 2
- %36 = insertelement <4 x i32> %35, i32 undef, i32 3
- %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2)
- %38 = extractelement <4 x i32> %37, i32 0
- %39 = extractelement <4 x i32> %37, i32 1
- %40 = extractelement <4 x i32> %37, i32 2
- %41 = extractelement <4 x i32> %37, i32 3
- %42 = bitcast i32 %38 to float
- %43 = bitcast i32 %39 to float
- %44 = bitcast i32 %40 to float
- %45 = bitcast i32 %41 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45)
- ret void
-}
-
-declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone
-; Function Attrs: nounwind readnone
-declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-
-!0 = !{!"const", null}
-!1 = !{}
-!2 = !{!0, !0, i64 0, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
index f6c258539d5b..d0cc00d81b4e 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
@@ -7,14 +7,14 @@
; FIXME: Out of bounds immediate offset crashes
; CHECK-LABEL: {{^}}main:
-; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc
; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc
-define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 {
+define amdgpu_vs void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) {
main_body:
%tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1
%tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -40,14 +40,13 @@ main_body:
}
; Function Attrs: nounwind readonly
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
; Function Attrs: nounwind readonly
-declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1
+declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="1" }
-attributes #1 = { nounwind readonly }
+attributes #0 = { nounwind readonly }
!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
index 0155757632d4..6984b4cf488a 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
@@ -6,7 +6,7 @@
; GCN: v_cvt_pkrtz_f16_f32
; GCN-NOT: v_cvt_pkrtz_f16_f32
-define void @main(float %src) #0 {
+define amdgpu_ps void @main(float %src) {
main_body:
%p1 = call i32 @llvm.SI.packf16(float undef, float %src)
%p2 = call i32 @llvm.SI.packf16(float %src, float undef)
@@ -21,9 +21,8 @@ main_body:
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
+declare i32 @llvm.SI.packf16(float, float) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll b/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll
deleted file mode 100644
index ac95fd0b83a2..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll
+++ /dev/null
@@ -1,111 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
-; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1
-
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8,
- i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) {
- %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1)
- %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2)
- %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3)
- %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4)
- %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5)
- %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6)
- %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7)
- %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8)
- %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9)
- %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10)
- %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11)
- %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12)
- %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13)
- %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14)
- %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15)
- %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16)
- %e1 = extractelement <4 x i32> %res1, i32 0
- %e2 = extractelement <4 x i32> %res2, i32 1
- %e3 = extractelement <4 x i32> %res3, i32 2
- %e4 = extractelement <4 x i32> %res4, i32 3
- %t0 = extractelement <4 x i32> %res5, i32 0
- %t1 = extractelement <4 x i32> %res5, i32 1
- %e5 = add i32 %t0, %t1
- %t2 = extractelement <4 x i32> %res6, i32 0
- %t3 = extractelement <4 x i32> %res6, i32 2
- %e6 = add i32 %t2, %t3
- %t4 = extractelement <4 x i32> %res7, i32 0
- %t5 = extractelement <4 x i32> %res7, i32 3
- %e7 = add i32 %t4, %t5
- %t6 = extractelement <4 x i32> %res8, i32 1
- %t7 = extractelement <4 x i32> %res8, i32 2
- %e8 = add i32 %t6, %t7
- %t8 = extractelement <4 x i32> %res9, i32 1
- %t9 = extractelement <4 x i32> %res9, i32 3
- %e9 = add i32 %t8, %t9
- %t10 = extractelement <4 x i32> %res10, i32 2
- %t11 = extractelement <4 x i32> %res10, i32 3
- %e10 = add i32 %t10, %t11
- %t12 = extractelement <4 x i32> %res11, i32 0
- %t13 = extractelement <4 x i32> %res11, i32 1
- %t14 = extractelement <4 x i32> %res11, i32 2
- %t15 = add i32 %t12, %t13
- %e11 = add i32 %t14, %t15
- %t16 = extractelement <4 x i32> %res12, i32 0
- %t17 = extractelement <4 x i32> %res12, i32 1
- %t18 = extractelement <4 x i32> %res12, i32 3
- %t19 = add i32 %t16, %t17
- %e12 = add i32 %t18, %t19
- %t20 = extractelement <4 x i32> %res13, i32 0
- %t21 = extractelement <4 x i32> %res13, i32 2
- %t22 = extractelement <4 x i32> %res13, i32 3
- %t23 = add i32 %t20, %t21
- %e13 = add i32 %t22, %t23
- %t24 = extractelement <4 x i32> %res14, i32 1
- %t25 = extractelement <4 x i32> %res14, i32 2
- %t26 = extractelement <4 x i32> %res14, i32 3
- %t27 = add i32 %t24, %t25
- %e14 = add i32 %t26, %t27
- %t28 = extractelement <4 x i32> %res15, i32 0
- %t29 = extractelement <4 x i32> %res15, i32 1
- %t30 = extractelement <4 x i32> %res15, i32 2
- %t31 = extractelement <4 x i32> %res15, i32 3
- %t32 = add i32 %t28, %t29
- %t33 = add i32 %t30, %t31
- %e15 = add i32 %t32, %t33
- %e16 = extractelement <4 x i32> %res16, i32 3
- %s1 = add i32 %e1, %e2
- %s2 = add i32 %s1, %e3
- %s3 = add i32 %s2, %e4
- %s4 = add i32 %s3, %e5
- %s5 = add i32 %s4, %e6
- %s6 = add i32 %s5, %e7
- %s7 = add i32 %s6, %e8
- %s8 = add i32 %s7, %e9
- %s9 = add i32 %s8, %e10
- %s10 = add i32 %s9, %e11
- %s11 = add i32 %s10, %e12
- %s12 = add i32 %s11, %e13
- %s13 = add i32 %s12, %e14
- %s14 = add i32 %s13, %e15
- %s15 = add i32 %s14, %e16
- %s16 = bitcast i32 %s15 to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16)
- ret void
-}
-
-declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.sample.ll
deleted file mode 100644
index 509c45f588b8..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.sample.ll
+++ /dev/null
@@ -1,160 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3
-;CHECK-DAG: image_sample {{v[0-9]+}}, 2
-;CHECK-DAG: image_sample {{v[0-9]+}}, 1
-;CHECK-DAG: image_sample {{v[0-9]+}}, 4
-;CHECK-DAG: image_sample {{v[0-9]+}}, 8
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11
-;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13
-;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
-;CHECK-DAG: image_sample {{v[0-9]+}}, 8
-
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
- %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
- %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
- %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
- %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
- %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
- %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
- %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2
- %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3
- %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0
- %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
- %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
- %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3
- %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0
- %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1
- %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
- %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
- %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1,
- <32 x i8> undef, <16 x i8> undef, i32 1)
- %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2,
- <32 x i8> undef, <16 x i8> undef, i32 2)
- %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3,
- <32 x i8> undef, <16 x i8> undef, i32 3)
- %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4,
- <32 x i8> undef, <16 x i8> undef, i32 4)
- %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5,
- <32 x i8> undef, <16 x i8> undef, i32 5)
- %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6,
- <32 x i8> undef, <16 x i8> undef, i32 6)
- %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7,
- <32 x i8> undef, <16 x i8> undef, i32 7)
- %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8,
- <32 x i8> undef, <16 x i8> undef, i32 8)
- %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9,
- <32 x i8> undef, <16 x i8> undef, i32 9)
- %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10,
- <32 x i8> undef, <16 x i8> undef, i32 10)
- %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11,
- <32 x i8> undef, <16 x i8> undef, i32 11)
- %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12,
- <32 x i8> undef, <16 x i8> undef, i32 12)
- %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13,
- <32 x i8> undef, <16 x i8> undef, i32 13)
- %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14,
- <32 x i8> undef, <16 x i8> undef, i32 14)
- %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15,
- <32 x i8> undef, <16 x i8> undef, i32 15)
- %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16,
- <32 x i8> undef, <16 x i8> undef, i32 16)
- %e1 = extractelement <4 x float> %res1, i32 0
- %e2 = extractelement <4 x float> %res2, i32 1
- %e3 = extractelement <4 x float> %res3, i32 2
- %e4 = extractelement <4 x float> %res4, i32 3
- %t0 = extractelement <4 x float> %res5, i32 0
- %t1 = extractelement <4 x float> %res5, i32 1
- %e5 = fadd float %t0, %t1
- %t2 = extractelement <4 x float> %res6, i32 0
- %t3 = extractelement <4 x float> %res6, i32 2
- %e6 = fadd float %t2, %t3
- %t4 = extractelement <4 x float> %res7, i32 0
- %t5 = extractelement <4 x float> %res7, i32 3
- %e7 = fadd float %t4, %t5
- %t6 = extractelement <4 x float> %res8, i32 1
- %t7 = extractelement <4 x float> %res8, i32 2
- %e8 = fadd float %t6, %t7
- %t8 = extractelement <4 x float> %res9, i32 1
- %t9 = extractelement <4 x float> %res9, i32 3
- %e9 = fadd float %t8, %t9
- %t10 = extractelement <4 x float> %res10, i32 2
- %t11 = extractelement <4 x float> %res10, i32 3
- %e10 = fadd float %t10, %t11
- %t12 = extractelement <4 x float> %res11, i32 0
- %t13 = extractelement <4 x float> %res11, i32 1
- %t14 = extractelement <4 x float> %res11, i32 2
- %t15 = fadd float %t12, %t13
- %e11 = fadd float %t14, %t15
- %t16 = extractelement <4 x float> %res12, i32 0
- %t17 = extractelement <4 x float> %res12, i32 1
- %t18 = extractelement <4 x float> %res12, i32 3
- %t19 = fadd float %t16, %t17
- %e12 = fadd float %t18, %t19
- %t20 = extractelement <4 x float> %res13, i32 0
- %t21 = extractelement <4 x float> %res13, i32 2
- %t22 = extractelement <4 x float> %res13, i32 3
- %t23 = fadd float %t20, %t21
- %e13 = fadd float %t22, %t23
- %t24 = extractelement <4 x float> %res14, i32 1
- %t25 = extractelement <4 x float> %res14, i32 2
- %t26 = extractelement <4 x float> %res14, i32 3
- %t27 = fadd float %t24, %t25
- %e14 = fadd float %t26, %t27
- %t28 = extractelement <4 x float> %res15, i32 0
- %t29 = extractelement <4 x float> %res15, i32 1
- %t30 = extractelement <4 x float> %res15, i32 2
- %t31 = extractelement <4 x float> %res15, i32 3
- %t32 = fadd float %t28, %t29
- %t33 = fadd float %t30, %t31
- %e15 = fadd float %t32, %t33
- %e16 = extractelement <4 x float> %res16, i32 3
- %s1 = fadd float %e1, %e2
- %s2 = fadd float %s1, %e3
- %s3 = fadd float %s2, %e4
- %s4 = fadd float %s3, %e5
- %s5 = fadd float %s4, %e6
- %s6 = fadd float %s5, %e7
- %s7 = fadd float %s6, %e8
- %s8 = fadd float %s7, %e9
- %s9 = fadd float %s8, %e10
- %s10 = fadd float %s9, %e11
- %s11 = fadd float %s10, %e12
- %s12 = fadd float %s11, %e13
- %s13 = fadd float %s12, %e14
- %s14 = fadd float %s13, %e15
- %s15 = fadd float %s14, %e16
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15)
- ret void
-}
-
-; CHECK: {{^}}v1:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
-define void @v1(i32 %a1) #0 {
-entry:
- %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
- %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
- %2 = extractelement <4 x float> %1, i32 0
- %3 = extractelement <4 x float> %1, i32 1
- %4 = extractelement <4 x float> %1, i32 2
- %5 = extractelement <4 x float> %1, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5)
- ret void
-}
-
-
-declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
-
-declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sampled.ll b/test/CodeGen/AMDGPU/llvm.SI.sampled.ll
deleted file mode 100644
index f2badff2a99c..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.sampled.ll
+++ /dev/null
@@ -1,143 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11
-;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13
-;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14
-;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
-
-define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
- %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
- %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
- %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
- %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
- %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
- %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
- %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2
- %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3
- %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0
- %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
- %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
- %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3
- %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0
- %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1
- %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
- %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
- %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1,
- <32 x i8> undef, <16 x i8> undef, i32 1)
- %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2,
- <32 x i8> undef, <16 x i8> undef, i32 2)
- %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3,
- <32 x i8> undef, <16 x i8> undef, i32 3)
- %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4,
- <32 x i8> undef, <16 x i8> undef, i32 4)
- %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5,
- <32 x i8> undef, <16 x i8> undef, i32 5)
- %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6,
- <32 x i8> undef, <16 x i8> undef, i32 6)
- %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7,
- <32 x i8> undef, <16 x i8> undef, i32 7)
- %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8,
- <32 x i8> undef, <16 x i8> undef, i32 8)
- %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9,
- <32 x i8> undef, <16 x i8> undef, i32 9)
- %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10,
- <32 x i8> undef, <16 x i8> undef, i32 10)
- %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11,
- <32 x i8> undef, <16 x i8> undef, i32 11)
- %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12,
- <32 x i8> undef, <16 x i8> undef, i32 12)
- %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13,
- <32 x i8> undef, <16 x i8> undef, i32 13)
- %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14,
- <32 x i8> undef, <16 x i8> undef, i32 14)
- %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15,
- <32 x i8> undef, <16 x i8> undef, i32 15)
- %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16,
- <32 x i8> undef, <16 x i8> undef, i32 16)
- %e1 = extractelement <4 x float> %res1, i32 0
- %e2 = extractelement <4 x float> %res2, i32 1
- %e3 = extractelement <4 x float> %res3, i32 2
- %e4 = extractelement <4 x float> %res4, i32 3
- %t0 = extractelement <4 x float> %res5, i32 0
- %t1 = extractelement <4 x float> %res5, i32 1
- %e5 = fadd float %t0, %t1
- %t2 = extractelement <4 x float> %res6, i32 0
- %t3 = extractelement <4 x float> %res6, i32 2
- %e6 = fadd float %t2, %t3
- %t4 = extractelement <4 x float> %res7, i32 0
- %t5 = extractelement <4 x float> %res7, i32 3
- %e7 = fadd float %t4, %t5
- %t6 = extractelement <4 x float> %res8, i32 1
- %t7 = extractelement <4 x float> %res8, i32 2
- %e8 = fadd float %t6, %t7
- %t8 = extractelement <4 x float> %res9, i32 1
- %t9 = extractelement <4 x float> %res9, i32 3
- %e9 = fadd float %t8, %t9
- %t10 = extractelement <4 x float> %res10, i32 2
- %t11 = extractelement <4 x float> %res10, i32 3
- %e10 = fadd float %t10, %t11
- %t12 = extractelement <4 x float> %res11, i32 0
- %t13 = extractelement <4 x float> %res11, i32 1
- %t14 = extractelement <4 x float> %res11, i32 2
- %t15 = fadd float %t12, %t13
- %e11 = fadd float %t14, %t15
- %t16 = extractelement <4 x float> %res12, i32 0
- %t17 = extractelement <4 x float> %res12, i32 1
- %t18 = extractelement <4 x float> %res12, i32 3
- %t19 = fadd float %t16, %t17
- %e12 = fadd float %t18, %t19
- %t20 = extractelement <4 x float> %res13, i32 0
- %t21 = extractelement <4 x float> %res13, i32 2
- %t22 = extractelement <4 x float> %res13, i32 3
- %t23 = fadd float %t20, %t21
- %e13 = fadd float %t22, %t23
- %t24 = extractelement <4 x float> %res14, i32 1
- %t25 = extractelement <4 x float> %res14, i32 2
- %t26 = extractelement <4 x float> %res14, i32 3
- %t27 = fadd float %t24, %t25
- %e14 = fadd float %t26, %t27
- %t28 = extractelement <4 x float> %res15, i32 0
- %t29 = extractelement <4 x float> %res15, i32 1
- %t30 = extractelement <4 x float> %res15, i32 2
- %t31 = extractelement <4 x float> %res15, i32 3
- %t32 = fadd float %t28, %t29
- %t33 = fadd float %t30, %t31
- %e15 = fadd float %t32, %t33
- %e16 = extractelement <4 x float> %res16, i32 3
- %s1 = fadd float %e1, %e2
- %s2 = fadd float %s1, %e3
- %s3 = fadd float %s2, %e4
- %s4 = fadd float %s3, %e5
- %s5 = fadd float %s4, %e6
- %s6 = fadd float %s5, %e7
- %s7 = fadd float %s6, %e8
- %s8 = fadd float %s7, %e9
- %s9 = fadd float %s8, %e10
- %s10 = fadd float %s9, %e11
- %s11 = fadd float %s10, %e12
- %s12 = fadd float %s11, %e13
- %s13 = fadd float %s12, %e14
- %s14 = fadd float %s13, %e15
- %s15 = fadd float %s14, %e16
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15)
- ret void
-}
-
-declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
index 2198590f2dfe..2d4987643a2b 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
@@ -1,20 +1,17 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
-; BOTH-LABEL: {{^}}main:
-; BOTH: s_mov_b32 m0, s0
+; GCN-LABEL: {{^}}main:
+; GCN: s_mov_b32 m0, s0
; VI-NEXT: s_nop 0
-; BOTH-NEXT: s_sendmsg Gs_done(nop)
-; BOTH-NEXT: s_endpgm
+; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GCN-NEXT: s_endpgm
-define void @main(i32 inreg %a) #0 {
-main_body:
+define amdgpu_gs void @main(i32 inreg %a) #0 {
call void @llvm.SI.sendmsg(i32 3, i32 %a)
ret void
}
-; Function Attrs: nounwind
-declare void @llvm.SI.sendmsg(i32, i32) #1
+declare void @llvm.SI.sendmsg(i32, i32) #0
-attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" }
-attributes #1 = { nounwind }
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
index 09675d503355..c4bb27676e7d 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
@@ -4,10 +4,10 @@
; CHECK-LABEL: {{^}}main:
; CHECK: s_mov_b32 m0, 0
; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg Gs(emit stream 0)
-; CHECK: s_sendmsg Gs(cut stream 1)
-; CHECK: s_sendmsg Gs(emit-cut stream 2)
-; CHECK: s_sendmsg Gs_done(nop)
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
define void @main() {
main_body:
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
index 71f51548a5f8..645c6a6b8d7e 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
@@ -3,7 +3,7 @@
;CHECK-LABEL: {{^}}test1:
;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test1(i32 %a1, i32 %vaddr) #0 {
+define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
%vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
@@ -13,7 +13,7 @@ define void @test1(i32 %a1, i32 %vaddr) #0 {
;CHECK-LABEL: {{^}}test2:
;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test2(i32 %a1, i32 %vaddr) #0 {
+define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
%vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
@@ -23,7 +23,7 @@ define void @test2(i32 %a1, i32 %vaddr) #0 {
;CHECK-LABEL: {{^}}test3:
;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test3(i32 %a1, i32 %vaddr) #0 {
+define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
%vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
@@ -33,7 +33,7 @@ define void @test3(i32 %a1, i32 %vaddr) #0 {
;CHECK-LABEL: {{^}}test4:
;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
-define void @test4(i32 %vdata, i32 %vaddr) #0 {
+define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) {
call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
i32 1, i32 0)
@@ -43,5 +43,3 @@ define void @test4(i32 %vdata, i32 %vaddr) #0 {
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tid.ll b/test/CodeGen/AMDGPU/llvm.SI.tid.ll
deleted file mode 100644
index f6e6d7050ba7..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.tid.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
-
-;GCN: v_mbcnt_lo_u32_b32_e64
-;SI: v_mbcnt_hi_u32_b32_e32
-;VI: v_mbcnt_hi_u32_b32_e64
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
-main_body:
- %4 = call i32 @llvm.SI.tid()
- %5 = bitcast i32 %4 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
- ret void
-}
-
-declare i32 @llvm.SI.tid() readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
new file mode 100644
index 000000000000..93911d4a91f1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -0,0 +1,387 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
+define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32:
+; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
+; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
+ %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
+; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+ %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ store i32 %result, i32 addrspace(4)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32:
+; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+ ret void
+}
+
+; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
+ %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
+ %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ store i32 %result, i32 addrspace(4)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
+ %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ store i64 %result, i64 addrspace(4)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+ ret void
+}
+
+; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
+ %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
+ %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ store i64 %result, i64 addrspace(4)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
+ %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ ret void
+}
+
+@lds0 = addrspace(3) global [512 x i32] undef
+
+; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i32 %val0, i32 addrspace(1)* %out
+ ret void
+}
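+
+; Note: the address is @lds0 + (%tid.x + 2) * 4 bytes. The constant part
+; (2 * 4 = 8) is folded into the DS immediate (offset:8) and only the shifted
+; workitem id (v_lshlrev_b32 ... 2, ...) is used as the address operand.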
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
+define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
+define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
+define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
+define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
+; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
+ %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
+; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+ %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ ret void
+}
+
+@lds1 = addrspace(3) global [512 x i64] undef, align 8
+
+; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
+define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
+ %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i64 %val0, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
+
+
+
+
+
+
+
+
+
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
new file mode 100644
index 000000000000..181d68c8ea75
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -0,0 +1,383 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
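+
+; Note: the GEP advances the pointer by four i32 elements (16 bytes), which
+; is folded into the 16-bit DS immediate as offset:16.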
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
+define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
+define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
+; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
+ %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ store i32 %result, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
+; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+ %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+ ret void
+}
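+
+; Note: for the per-thread (addr64) cases, CI keeps the buffer form and folds
+; the constant 5 * 4 = 20 bytes into offset:20, while VI, which dropped the
+; addr64 addressing mode, selects flat_atomic instead and adds the offset
+; into the address.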
+
+@lds0 = addrspace(3) global [512 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i32 %val0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
+define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
+define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
+define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
+define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
+; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
+ %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ store i64 %result, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
+; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+ %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+ store i32 %result, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ store i32 %result, i32 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
+ %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
+ %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ store i32 %result, i32 addrspace(4)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
+ %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
+ %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+ ret void
+}
+
+@lds1 = addrspace(3) global [512 x i64] undef, align 8
+
+; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
+define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
+ %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i64 %val0, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
+
+
+
+
+
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+ store i64 %result, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ store i64 %result, i64 addrspace(4)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
+ %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
+ %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ store i64 %result, i64 addrspace(4)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
+define void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
+ %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
+ %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
new file mode 100644
index 000000000000..98f7058b5ef8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -0,0 +1,126 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
+;CHECK-LABEL: {{^}}test1:
+;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
+;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
+;CHECK-DAG: s_waitcnt vmcnt(0)
+;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
+;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
+define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
+main_body:
+ %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+ %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
+ %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
+ %ofs.5 = add i32 %voffset, 42
+ %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
+ %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
+ %unused = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+ %out = bitcast i32 %o6 to float
+ ret float %out
+}
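+
+; Note: the first five swaps cover the plain, idxen, offen, idxen+offen and
+; offen+imm addressing modes. The sixth uses a constant offset of 8192, which
+; does not fit the 12-bit immediate (max 4095): SICI puts it in a VGPR and
+; uses offen, while VI materializes 0x1fff (8191) into an SGPR soffset and
+; keeps offset:1, i.e. 8191 + 1 = 8192.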
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
+define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
+main_body:
+ %t1 = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t7 = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t8 = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %out = bitcast i32 %t9 to float
+ ret float %out
+}
+
+; Ideally, we would teach tablegen & friends that cmpswap only modifies the
+; first vgpr. Since we don't do that yet, the register allocator will have to
+; create copies which we don't bother to track here.
+;
+;CHECK-LABEL: {{^}}test3:
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
+;CHECK: s_waitcnt vmcnt(0)
+;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc
+;CHECK-DAG: s_waitcnt vmcnt(0)
+;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
+;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc
+define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
+main_body:
+ %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+ %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
+ %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
+ %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
+ %ofs.5 = add i32 %voffset, 42
+ %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
+ %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
+
+; Detecting the no-return variant doesn't work right now because of how the
+; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
+; Since there probably isn't a reasonable use-case of cmpswap that discards
+; the return value, that seems okay.
+;
+; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+ %out = bitcast i32 %o6 to float
+ ret float %out
+}
+
+;CHECK-LABEL: {{^}}test4:
+;CHECK: buffer_atomic_add v0,
+define amdgpu_ps float @test4() {
+main_body:
+ %v = call i32 @llvm.amdgcn.buffer.atomic.add(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false)
+ %v.float = bitcast i32 %v to float
+ ret float %v.float
+}
+
+declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
new file mode 100644
index 000000000000..67c7baba3e14
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
@@ -0,0 +1,133 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
+;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+ %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+ %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+ %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+ %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+ %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+ ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x103c
+;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen
+;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
+;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095
+;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093
+;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
+;VI: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+ %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0)
+ %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0)
+ %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0)
+ %d.3 = fadd <4 x float> %d.0, %d.1
+ %data = fadd <4 x float> %d.2, %d.3
+ ret <4 x float> %data
+}
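+
+; Note: on VI each oversized constant offset is split into an soffset plus a
+; <= 4095 immediate: 61 + 4095 = 4156, 0x7fff (32767) + 4093 = 36860 and
+; 0x8fff (36863) + 1 = 36864. SICI instead moves the full offset into a VGPR
+; and uses offen.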
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse:
+;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xfff
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65
+;VI-NOT: s_mov
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81
+;VI: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) {
+main_body:
+ %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0)
+ %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0)
+ %data = fadd <4 x float> %d.0, %d.1
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_idx:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+ %ofs = add i32 %1, 58
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both:
+;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both_reversed:
+;CHECK: v_mov_b32_e32 v2, v0
+;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
+ ret <4 x float> %data
+}
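+
+; Note: with both an index and an offset, the vaddr operand must be a
+; consecutive VGPR pair with the index first, so the reversed arguments force
+; a copy (v_mov_b32_e32 v2, v0) to build the v[1:2] pair.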
+
+;CHECK-LABEL: {{^}}buffer_load_x:
+;CHECK: buffer_load_format_x v0, off, s[0:3], 0
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+ ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_xy:
+;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+ ret <2 x float> %data
+}
+
+declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
new file mode 100644
index 000000000000..010ad276da10
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -0,0 +1,119 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+ %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+ %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+ %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+ %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+ %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+ ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
+;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
+;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_idx:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+ %ofs = add i32 %1, 58
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both:
+;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both_reversed:
+;CHECK: v_mov_b32_e32 v2, v0
+;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+main_body:
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
+ ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1:
+;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+ %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+ ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x2:
+;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+ %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+ ret <2 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_negative_offset:
+;CHECK: v_add_i32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
+;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen
+define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
+main_body:
+ %ofs.1 = add i32 %ofs, -16
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0)
+ ret <4 x float> %data
+}
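+
+; Note: the MUBUF immediate offset is unsigned, so the -16 cannot be folded;
+; it is applied with v_add_i32 and the load uses offen with no offset field.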
+
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
new file mode 100644
index 000000000000..555a1d23ebe9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@@ -0,0 +1,95 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_store:
+;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0
+;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both_reversed:
+;CHECK: v_mov_b32_e32 v6, v4
+;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
+ ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
+ ret void
+}
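+
+; Note: the load is assigned v[0:3], the same registers that hold the store's
+; data, so the s_waitcnt ... expcnt(0) is needed before they can be
+; overwritten; a different register assignment would make the wait
+; unnecessary, which is what the comment above refers to.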
+
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
new file mode 100644
index 000000000000..5ae255c7a26c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -0,0 +1,95 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_store:
+;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both_reversed:
+;CHECK: v_mov_b32_e32 v6, v4
+;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
+ ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+ %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
+main_body:
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 80eb3b93f8e5..668c669e41e8 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
-declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i1 @llvm.amdgcn.class.f32(float, i32) #1
+declare i1 @llvm.amdgcn.class.f64(double, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
@@ -15,7 +15,7 @@ declare double @llvm.fabs.f64(double) #1
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -31,7 +31,7 @@ define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
; SI: s_endpgm
define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
- %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -47,7 +47,7 @@ define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
; SI: s_endpgm
define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fneg = fsub float -0.0, %a
- %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -64,7 +64,7 @@ define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%a.fneg.fabs = fsub float -0.0, %a.fabs
- %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -77,7 +77,7 @@ define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b)
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -90,7 +90,7 @@ define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -105,7 +105,7 @@ define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -119,7 +119,7 @@ define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -133,12 +133,12 @@ define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -151,12 +151,12 @@ define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%b = load i32, i32 addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float 1.0, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -171,12 +171,12 @@ define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%b = load i32, i32 addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float 1024.0, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -187,11 +187,11 @@ define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i3
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -202,12 +202,12 @@ define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
- %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -218,12 +218,12 @@ define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fneg = fsub double -0.0, %a
- %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -234,13 +234,13 @@ define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%a.fneg.fabs = fsub double -0.0, %a.fabs
- %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -250,7 +250,7 @@ define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b)
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
; SI: s_endpgm
define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -260,7 +260,7 @@ define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
; SI: s_endpgm
define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -276,7 +276,7 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -291,12 +291,12 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load double, double addrspace(1)* %in
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -307,12 +307,12 @@ define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace
; SI: v_cmp_class_f64_e32 vcc,
; SI: s_endpgm
define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%b = load i32, i32 addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -322,12 +322,12 @@ define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: s_endpgm
define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%b = load i32, i32 addrspace(1)* %gep.in
- %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double 1024.0, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %gep.out, align 4
ret void
@@ -339,13 +339,13 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 3) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
@@ -359,14 +359,14 @@ define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
- %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
+ %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
%or.0 = or i1 %class0, %class1
%or.1 = or i1 %or.0, %class2
@@ -382,21 +382,21 @@ define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
- %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
- %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
- %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1
- %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1
- %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
- %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1
- %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1
- %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 2) #1
+ %class2 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
+ %class3 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
+ %class4 = call i1 @llvm.amdgcn.class.f32(float %a, i32 16) #1
+ %class5 = call i1 @llvm.amdgcn.class.f32(float %a, i32 32) #1
+ %class6 = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
+ %class7 = call i1 @llvm.amdgcn.class.f32(float %a, i32 128) #1
+ %class8 = call i1 @llvm.amdgcn.class.f32(float %a, i32 256) #1
+ %class9 = call i1 @llvm.amdgcn.class.f32(float %a, i32 512) #1
%or.0 = or i1 %class0, %class1
%or.1 = or i1 %or.0, %class2
%or.2 = or i1 %or.1, %class3
@@ -417,13 +417,13 @@ define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float ad
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 8) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
@@ -437,13 +437,13 @@ define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)
; SI-NOT: v_cmp_class
; SI: s_endpgm
define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %a, i32 7) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
@@ -457,13 +457,13 @@ define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)
; SI: s_or_b64
; SI: s_endpgm
define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.in
- %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
- %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
+ %class0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4) #1
+ %class1 = call i1 @llvm.amdgcn.class.f32(float %b, i32 8) #1
%or = or i1 %class0, %class1
%sext = sext i1 %or to i32
@@ -477,7 +477,7 @@ define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1
+ %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
@@ -489,7 +489,19 @@ define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
- %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1
+ %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: Why is the extension still here?
+; SI-LABEL: {{^}}test_class_undef_f32:
+; SI-NOT: v_cmp_class
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1,
+; SI: buffer_store_dword
+define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+ %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
new file mode 100644
index 000000000000..f6495d8155f7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+declare float @llvm.amdgcn.cos.f32(float) #0
+
+; GCN-LABEL: {{^}}v_cos_f32:
+; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
+ %cos = call float @llvm.amdgcn.cos.f32(float %src) #0
+ store float %cos, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
new file mode 100644
index 000000000000..22bed45ee30f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubeid(float, float, float) #0
+
+; GCN-LABEL: {{^}}test_cubeid:
+; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
new file mode 100644
index 000000000000..565f22c5d5b6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubema(float, float, float) #0
+
+; GCN-LABEL: {{^}}test_cubema:
+; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
new file mode 100644
index 000000000000..a3ba32745814
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubesc(float, float, float) #0
+
+; GCN-LABEL: {{^}}test_cubesc:
+; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
new file mode 100644
index 000000000000..d3c0f2851ead
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare float @llvm.amdgcn.cubetc(float, float, float) #0
+
+; GCN-LABEL: {{^}}test_cubetc:
+; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+ %result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index d96ea743f6ed..2e8625256f13 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; ERROR: error: unsupported hsa intrinsic without hsa target in test
+; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_dispatch_ptr = 1
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index 55ca9c7536e5..f9b390eca0c2 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -1,8 +1,8 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
-declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
-declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
+declare float @llvm.amdgcn.div.fixup.f32(float, float, float) nounwind readnone
+declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone
; GCN-LABEL: {{^}}test_div_fixup_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
@@ -17,7 +17,7 @@ declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readn
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
- %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -25,7 +25,7 @@ define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, fl
; GCN-LABEL: {{^}}test_div_fixup_f64:
; GCN: v_div_fixup_f64
define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
- %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
+ %result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 7dc094ed1b4b..efea3eb707a1 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
; FIXME: Enable for VI.
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
-declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone
+declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
; GCN-LABEL: {{^}}test_div_fmas_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
@@ -21,7 +21,7 @@ declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind re
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -35,7 +35,7 @@ define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, flo
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -45,11 +45,11 @@ define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a,
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -63,7 +63,7 @@ define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a,
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -71,7 +71,7 @@ define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a,
; GCN-LABEL: {{^}}test_div_fmas_f64:
; GCN: v_div_fmas_f64
define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
- %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
+ %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
}
@@ -81,7 +81,7 @@ define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b,
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
%cmp = icmp eq i32 %i, 0
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -90,7 +90,7 @@ define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, f
; SI: s_mov_b64 vcc, 0
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -99,7 +99,7 @@ define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, f
; SI: s_mov_b64 vcc, -1
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
@@ -115,21 +115,21 @@ define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, fl
; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
; SI: s_endpgm
define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
- %a = load float, float addrspace(1)* %gep.a
- %b = load float, float addrspace(1)* %gep.b
- %c = load float, float addrspace(1)* %gep.c
+ %a = load volatile float, float addrspace(1)* %gep.a
+ %b = load volatile float, float addrspace(1)* %gep.b
+ %c = load volatile float, float addrspace(1)* %gep.c
%cmp0 = icmp eq i32 %tid, 0
%cmp1 = icmp ne i32 %d, 0
%and = and i1 %cmp0, %cmp1
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
store float %result, float addrspace(1)* %gep.out, align 4
ret void
}
@@ -146,13 +146,13 @@ define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, flo
; SI: BB9_2:
; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_i32_e32 vcc, 0, v0
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword
; SI: s_endpgm
define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
entry:
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
@@ -172,7 +172,7 @@ bb:
exit:
%cond = phi i1 [false, %entry], [%cmp1, %bb]
- %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
+ %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
store float %result, float addrspace(1)* %gep.out, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index de830de039c7..38e4b8440d32 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -1,235 +1,235 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
-declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) nounwind readnone
+declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
-; SI-LABEL @test_div_scale_f32_1:
+; SI-LABEL: {{^}}test_div_scale_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_2:
+; SI-LABEL: {{^}}test_div_scale_f32_2:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f64_1:
+; SI-LABEL: {{^}}test_div_scale_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %a = load double, double addrspace(1)* %gep.0, align 8
- %b = load double, double addrspace(1)* %gep.1, align 8
+ %a = load volatile double, double addrspace(1)* %gep.0, align 8
+ %b = load volatile double, double addrspace(1)* %gep.1, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_1:
+; SI-LABEL: {{^}}test_div_scale_f64_2:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %a = load double, double addrspace(1)* %gep.0, align 8
- %b = load double, double addrspace(1)* %gep.1, align 8
+ %a = load volatile double, double addrspace(1)* %gep.0, align 8
+ %b = load volatile double, double addrspace(1)* %gep.1, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f32_scalar_num_1:
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_1:
; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
; SI-DAG: s_load_dword [[A:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
%b = load float, float addrspace(1)* %gep, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_scalar_num_2:
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_2:
; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
; SI-DAG: s_load_dword [[A:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
%b = load float, float addrspace(1)* %gep, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_scalar_den_1:
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: s_load_dword [[B:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_scalar_den_2:
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_2:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: s_load_dword [[B:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f64_scalar_num_1:
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_1:
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%b = load double, double addrspace(1)* %gep, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_scalar_num_2:
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_2:
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%b = load double, double addrspace(1)* %gep, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_scalar_den_1:
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%a = load double, double addrspace(1)* %gep, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_scalar_den_2:
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_2:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%a = load double, double addrspace(1)* %gep, align 8
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f32_all_scalar_1:
+; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
@@ -237,13 +237,13 @@ define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double a
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_all_scalar_2:
+; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
@@ -251,13 +251,13 @@ define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a,
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f64_all_scalar_1:
+; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1:
; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
@@ -266,13 +266,13 @@ define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a,
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f64_all_scalar_2:
+; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2:
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
@@ -281,83 +281,83 @@ define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
- %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL @test_div_scale_f32_inline_imm_num:
+; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_num:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep.0, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_inline_imm_den:
+; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_den:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%a = load float, float addrspace(1)* %gep.0, align 4
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_fabs_num:
+; SI-LABEL: {{^}}test_div_scale_f32_fabs_num:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]|
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
}
-; SI-LABEL @test_div_scale_f32_fabs_den:
+; SI-LABEL: {{^}}test_div_scale_f32_fabs_den:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
- %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
new file mode 100644
index 000000000000..92d3fc8b107e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
+
+; CHECK-LABEL: {{^}}ds_bpermute:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+ %index = add i32 %base_index, 4
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
new file mode 100644
index 000000000000..6d9c94191535
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ds.permute(i32, i32) #0
+
+; CHECK-LABEL: {{^}}ds_permute:
+; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+ %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ds_permute_imm_offset:
+; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+ %index = add i32 %base_index, 4
+ %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
+ store i32 %bpermute, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
new file mode 100644
index 000000000000..ef3cb00024bb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0
+
+; CHECK-LABEL: {{^}}ds_swizzle:
+; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
+ %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
+ store i32 %swizzle, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
new file mode 100644
index 000000000000..1cca9eb6a77a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+declare float @llvm.amdgcn.fract.f32(float) #0
+declare double @llvm.amdgcn.fract.f64(double) #0
+
+; GCN-LABEL: {{^}}v_fract_f32:
+; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
+ %fract = call float @llvm.amdgcn.fract.f32(float %src)
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fract_f64:
+; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
+ %fract = call double @llvm.amdgcn.fract.f64(double %src)
+ store double %fract, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fract_undef_f32:
+; GCN-NOT: v_fract_f32
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
+ %fract = call float @llvm.amdgcn.fract.f32(float undef)
+ store float %fract, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
new file mode 100644
index 000000000000..728a6b5cf26b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare float @llvm.fabs.f32(float) #0
+declare double @llvm.fabs.f64(double) #0
+declare i32 @llvm.amdgcn.frexp.exp.f32(float) #0
+declare i32 @llvm.amdgcn.frexp.exp.f64(double) #0
+
+; GCN-LABEL: {{^}}s_test_frexp_exp_f32:
+; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f32(float %src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32:
+; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
+define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f32(float %fabs.src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32:
+; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
+define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %fneg.fabs.src = fsub float -0.0, %fabs.src
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f32(float %fneg.fabs.src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_frexp_exp_f64:
+; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f64(double %src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64:
+; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}|
+define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f64(double %fabs.src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64:
+; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}|
+define void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %fneg.fabs.src = fsub double -0.0, %fabs.src
+ %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.f64(double %fneg.fabs.src)
+ store i32 %frexp.exp, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
new file mode 100644
index 000000000000..b8d63defffed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare float @llvm.fabs.f32(float) #0
+declare double @llvm.fabs.f64(double) #0
+declare float @llvm.amdgcn.frexp.mant.f32(float) #0
+declare double @llvm.amdgcn.frexp.mant.f64(double) #0
+
+; GCN-LABEL: {{^}}s_test_frexp_mant_f32:
+; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+ %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src)
+ store float %frexp.mant, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32:
+; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
+define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src)
+ store float %frexp.mant, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32:
+; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
+define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %fneg.fabs.src = fsub float -0.0, %fabs.src
+ %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src)
+ store float %frexp.mant, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_frexp_mant_f64:
+; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+ %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src)
+ store double %frexp.mant, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64:
+; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}|
+define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src)
+ store double %frexp.mant, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64:
+; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}|
+define void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %fneg.fabs.src = fsub double -0.0, %fabs.src
+ %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src)
+ store double %frexp.mant, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll
new file mode 100644
index 000000000000..cf6d1ab237cd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticgroup.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+
+@lds0 = addrspace(3) global [512 x float] undef, align 4
+@lds1 = addrspace(3) global [256 x float] undef, align 4
+
+; CHECK-LABEL: {{^}}get_groupstaticsize_test0:
+; CHECK: s_movk_i32 s{{[0-9]+}}, 0x800
+define void @get_groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 64
+ %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
+ store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ store float %val0, float addrspace(1)* %out, align 4
+
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}groupstaticsize_test1:
+; CHECK: s_movk_i32 s{{[0-9]+}}, 0xc00
+define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
+entry:
+ %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
+ store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %idx.0 = add nsw i32 %tid.x, 64
+ %tmp = icmp eq i32 %cond, 0
+ br i1 %tmp, label %if, label %else
+
+if: ; preds = %entry
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ store float %val0, float addrspace(1)* %out, align 4
+ br label %endif
+
+else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ store float %val1, float addrspace(1)* %out, align 4
+ br label %endif
+
+endif: ; preds = %else, %if
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.groupstaticsize() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
new file mode 100644
index 000000000000..87d838727882
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
@@ -0,0 +1,123 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
+
+;CHECK-LABEL: {{^}}image_atomic_swap:
+;SI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x04,0x00,0x00]
+;VI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_swap(<8 x i32> inreg, <4 x i32>, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.swap.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_swap_v2i32:
+;SI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x02,0x00,0x00]
+;VI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x02,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_swap_v2i32(<8 x i32> inreg, <2 x i32>, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.swap.v2i32(i32 %2, <2 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_swap_i32:
+;SI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x01,0x00,0x00]
+;VI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x01,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_swap_i32(<8 x i32> inreg, i32, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.swap.i32(i32 %2, i32 %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_cmpswap:
+;SI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x40,0xf0,0x00,0x04,0x00,0x00]
+;VI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: v_mov_b32_e32 v0, v4
+define amdgpu_ps float @image_atomic_cmpswap(<8 x i32> inreg, <4 x i32>, i32, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.v4i32(i32 %2, i32 %3, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_add:
+;SI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x44,0xf0,0x00,0x04,0x00,0x00]
+;VI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_add(<8 x i32> inreg, <4 x i32>, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.add.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_sub:
+;SI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00]
+;VI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4c,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_sub(<8 x i32> inreg, <4 x i32>, i32) {
+main_body:
+ %orig = call i32 @llvm.amdgcn.image.atomic.sub.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %orig.f = bitcast i32 %orig to float
+ ret float %orig.f
+}
+
+;CHECK-LABEL: {{^}}image_atomic_unchanged:
+;CHECK: image_atomic_smin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x50,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_umin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x54,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_smax v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x58,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_umax v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x5c,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_and v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x60,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_or v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x64,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_xor v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x68,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_inc v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x6c,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_atomic_dec v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x70,0xf0,0x00,0x04,0x00,0x00]
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_atomic_unchanged(<8 x i32> inreg, <4 x i32>, i32) {
+main_body:
+ %t0 = call i32 @llvm.amdgcn.image.atomic.smin.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t1 = call i32 @llvm.amdgcn.image.atomic.umin.v4i32(i32 %t0, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t2 = call i32 @llvm.amdgcn.image.atomic.smax.v4i32(i32 %t1, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t3 = call i32 @llvm.amdgcn.image.atomic.umax.v4i32(i32 %t2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t4 = call i32 @llvm.amdgcn.image.atomic.and.v4i32(i32 %t3, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t5 = call i32 @llvm.amdgcn.image.atomic.or.v4i32(i32 %t4, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t6 = call i32 @llvm.amdgcn.image.atomic.xor.v4i32(i32 %t5, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t7 = call i32 @llvm.amdgcn.image.atomic.inc.v4i32(i32 %t6, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %t8 = call i32 @llvm.amdgcn.image.atomic.dec.v4i32(i32 %t7, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0)
+ %out = bitcast i32 %t8 to float
+ ret float %out
+}
+
+declare i32 @llvm.amdgcn.image.atomic.swap.i32(i32, i32, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.swap.v2i32(i32, <2 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.swap.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+
+declare i32 @llvm.amdgcn.image.atomic.cmpswap.v4i32(i32, i32, <4 x i32>, <8 x i32>,i1, i1, i1) #0
+
+declare i32 @llvm.amdgcn.image.atomic.add.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.sub.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.smin.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.umin.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.smax.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.umax.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.and.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.or.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.xor.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.inc.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+declare i32 @llvm.amdgcn.image.atomic.dec.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
new file mode 100644
index 000000000000..f0d23b93119d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -0,0 +1,110 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}image_load_v4i32:
+;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_v2i32:
+;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_i32:
+;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_mip:
+;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+;CHECK-LABEL: {{^}}image_load_1:
+;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
+;CHECK: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %elt = extractelement <4 x float> %tex, i32 0
+; Only first component used, test that dmask etc. is changed accordingly
+ ret float %elt
+}
+
+;CHECK-LABEL: {{^}}image_store_v4i32:
+;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+main_body:
+ call void @llvm.amdgcn.image.store.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}image_store_v2i32:
+;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) {
+main_body:
+ call void @llvm.amdgcn.image.store.v2i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}image_store_i32:
+;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) {
+main_body:
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}image_store_mip:
+;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+main_body:
+ call void @llvm.amdgcn.image.store.mip.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}image_store_wait:
+;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) {
+main_body:
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0)
+ %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.v2i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+declare void @llvm.amdgcn.image.store.mip.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
index a28e1b1eb241..911d7d9b74d9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -6,7 +6,7 @@
;GCN: s_mov_b32 m0, s{{[0-9]+}}
;GCN: v_interp_p1_f32
;GCN: v_interp_p2_f32
-define void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 {
+define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) {
main_body:
%i = extractelement <2 x i32> %4, i32 0
%j = extractelement <2 x i32> %4, i32 1
@@ -19,12 +19,11 @@ main_body:
}
; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p1(i32, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p1(i32, i32, i32, i32) #0
; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p2(float, i32, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, i32, i32, i32, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
new file mode 100644
index 000000000000..07650d990f3c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=MESA -check-prefix=ALL %s
+
+; ALL-LABEL: {{^}}test:
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xa
+
+; MESA: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
+define void @test(i32 addrspace(1)* %out) #1 {
+ %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
+ %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
+ %value = load i32, i32 addrspace(2)* %gep
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}test_implicit:
+; 10 (gep index) + 9 (36 prepended implicit bytes) + 2 (out pointer) = 21 dwords = 0x15
+; MESA: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
+define void @test_implicit(i32 addrspace(1)* %out) #1 {
+ %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+ %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
+ %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
+ %value = load i32, i32 addrspace(2)* %gep
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
new file mode 100644
index 000000000000..a23defd742a8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.amdgcn.ldexp.f32(float, i32) nounwind readnone
+declare double @llvm.amdgcn.ldexp.f64(double, i32) nounwind readnone
+
+; SI-LABEL: {{^}}test_ldexp_f32:
+; SI: v_ldexp_f32
+; SI: s_endpgm
+define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+ %result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_ldexp_f64:
+; SI: v_ldexp_f64
+; SI: s_endpgm
+define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+ %result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}test_ldexp_undef_f32:
+; SI-NOT: v_ldexp_f32
+define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
+ %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
new file mode 100644
index 000000000000..014369b45015
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0
+
+; GCN-LABEL: {{^}}v_lerp:
+; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
+  %result = call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
new file mode 100644
index 000000000000..f78257f1d226
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+; ERR: intrinsic not supported on subtarget
+
+declare float @llvm.amdgcn.log.clamp.f32(float) #0
+
+; GCN-LABEL: {{^}}v_log_clamp_f32:
+; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
+ %log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0
+ store float %log.clamp, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
index 02ee2039542a..4825c3a479c1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@@ -6,7 +6,7 @@
;SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
;VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]]
-define void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
main_body:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
new file mode 100644
index 000000000000..a85fc7e13fd8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -0,0 +1,66 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s
+
+; FIXME: The register allocator / scheduler should be able to avoid these hazards.
+
+; VI-LABEL: {{^}}dpp_test:
+; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
+ %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+ store i32 %tmp0, i32 addrspace(1)* %out
+ ret void
+}
+
+; VI-LABEL: {{^}}dpp_wait_states:
+; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+ %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
+ %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+ store i32 %tmp1, i32 addrspace(1)* %out
+ ret void
+}
+
+; VI-LABEL: {{^}}dpp_first_in_bb:
+; VI: ; %endif
+; VI-OPT: s_mov_b32
+; VI-OPT: s_mov_b32
+; VI-NOOPT: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI: s_nop 1
+; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
+ %cmp = fcmp oeq float %cond, 0.0
+ br i1 %cmp, label %if, label %else
+
+if:
+ %out_val = load float, float addrspace(1)* %out
+ %if_val = fadd float %a, %out_val
+ br label %endif
+
+else:
+ %in_val = load float, float addrspace(1)* %in
+ %else_val = fadd float %b, %in_val
+ br label %endif
+
+endif:
+ %val = phi float [%if_val, %if], [%else_val, %else]
+ %val_i32 = bitcast float %val to i32
+ %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %val_i32, i32 1, i32 1, i32 1, i1 1) #0
+ %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
+ %tmp2 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp1, i32 1, i32 1, i32 1, i1 1) #0
+ %tmp_float = bitcast i32 %tmp2 to float
+ store float %tmp_float, float addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
new file mode 100644
index 000000000000..fd1a463fd3e9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+;
+; Note: We could generate better code here if we recognized earlier that
+; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
+; the expectation is that the intrinsic will be used in non-trivial shaders,
+; so such an optimization doesn't seem worth the effort.
+define amdgpu_ps float @test1() {
+ %live = call i1 @llvm.amdgcn.ps.live()
+ %live.32 = zext i1 %live to i32
+ %r = bitcast i32 %live.32 to float
+ ret float %r
+}
+
+; CHECK-LABEL: {{^}}test2:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK: image_sample v0, [[VAR]],
+define amdgpu_ps float @test2() {
+ %live = call i1 @llvm.amdgcn.ps.live()
+ %live.32 = zext i1 %live to i32
+
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %r = extractelement <4 x float> %t, i32 0
+ ret float %r
+}
+
+; CHECK-LABEL: {{^}}test3:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
+; CHECK: ; %dead
+define amdgpu_ps float @test3(i32 %in) {
+entry:
+ %live = call i1 @llvm.amdgcn.ps.live()
+ br i1 %live, label %end, label %dead
+
+dead:
+ %tc.dead = mul i32 %in, 2
+ br label %end
+
+end:
+ %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %r = extractelement <4 x float> %t, i32 0
+ ret float %r
+}
+
+declare i1 @llvm.amdgcn.ps.live() #0
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
new file mode 100644
index 000000000000..6bf871543ca2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define void @test(i32 addrspace(1)* %out) {
+ %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+ %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
+ %value = load i32, i32 addrspace(2)* %header_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
new file mode 100644
index 000000000000..825231bf8680
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.amdgcn.rcp.f32(float) #0
+declare double @llvm.amdgcn.rcp.f64(double) #0
+
+declare double @llvm.sqrt.f64(double) #0
+declare float @llvm.sqrt.f32(float) #0
+
+; FUNC-LABEL: {{^}}rcp_undef_f32:
+; SI-NOT: v_rcp_f32
+define void @rcp_undef_f32(float addrspace(1)* %out) #1 {
+ %rcp = call float @llvm.amdgcn.rcp.f32(float undef)
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_no_fp32_denormals_rcp_f32:
+; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_f32_denormals_rcp_pat_f32:
+; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dword [[RESULT]]
+define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32:
+; SI: v_div_scale_f32
+define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
+; SI: v_sqrt_f32_e32
+; SI: v_rcp_f32_e32
+define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
+ %sqrt = call float @llvm.sqrt.f32(float %src)
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32:
+; SI: v_rsq_f32_e32
+define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
+ %sqrt = call float @llvm.sqrt.f32(float %src)
+ %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_f64:
+; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %src)
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_rcp_f64:
+; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %src)
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_pat_f64:
+; SI: v_div_scale_f64
+define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+ %rcp = fdiv double 1.0, %src
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_rcp_pat_f64:
+; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+ %rcp = fdiv double 1.0, %src
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f64:
+; SI-NOT: v_rsq_f64_e32
+; SI: v_sqrt_f64
+; SI: v_rcp_f64
+define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+ %sqrt = call double @llvm.sqrt.f64(double %src)
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f64:
+; SI: v_rsq_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; SI-NOT: [[RESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+ %sqrt = call double @llvm.sqrt.f64(double %src)
+ %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" "target-features"="-fp32-denormals" }
+attributes #2 = { nounwind "unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
+attributes #3 = { nounwind "unsafe-fp-math"="false" "target-features"="+fp32-denormals" }
+attributes #4 = { nounwind "unsafe-fp-math"="true" "target-features"="+fp32-denormals" }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.read.workdim.ll
index 2e299e30b8c7..76a5757e4c20 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.read.workdim.ll
@@ -1,23 +1,19 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}read_workdim:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].Z
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA %s
+; GCN-LABEL: {{^}}read_workdim:
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @read_workdim(i32 addrspace(1)* %out) {
entry:
- %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+ %0 = call i32 @llvm.amdgcn.read.workdim() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}read_workdim_known_bits:
+; GCN-LABEL: {{^}}read_workdim_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
; GCN-NOT: 0xff
@@ -25,13 +21,26 @@ entry:
; GCN: buffer_store_dword [[VVAL]]
define void @read_workdim_known_bits(i32 addrspace(1)* %out) {
entry:
- %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+ %dim = call i32 @llvm.amdgcn.read.workdim() #0
%shl = shl i32 %dim, 24
%shr = lshr i32 %shl, 24
store i32 %shr, i32 addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}legacy_read_workdim:
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+define void @legacy_read_workdim(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+ store i32 %dim, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.read.workdim() #0
declare i32 @llvm.AMDGPU.read.workdim() #0
-attributes #0 = { readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
new file mode 100644
index 000000000000..73a5c54e175e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+
+declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
+declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
+
+; FUNC-LABEL: {{^}}rsq_clamp_f32:
+; SI: v_rsq_clamp_f32_e32
+
+; VI: s_load_dword [[SRC:s[0-9]+]]
+; VI-DAG: v_rsq_f32_e32 [[RSQ:v[0-9]+]], [[SRC]]
+; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
+; TODO: this constant should be folded:
+; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff7fffff
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[K]]
+; VI: buffer_store_dword [[RESULT]]
+define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
+ store float %rsq_clamp, float addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}rsq_clamp_f64:
+; SI: v_rsq_clamp_f64_e32
+
+; TODO: this constant should be folded:
+; VI-DAG: s_mov_b32 s[[LOW1:[0-9]+]], -1
+; VI-DAG: s_mov_b32 s[[HIGH1:[0-9]+]], 0x7fefffff
+; VI-DAG: s_mov_b32 s[[HIGH2:[0-9]+]], 0xffefffff
+; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
+; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
+; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]]
+define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
+ store double %rsq_clamp, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_clamp_undef_f32:
+; SI-NOT: v_rsq_clamp_f32
+define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
+ store float %rsq_clamp, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
new file mode 100644
index 000000000000..47bd0d82b834
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.amdgcn.rsq.legacy(float) #0
+
+; FUNC-LABEL: {{^}}rsq_legacy_f32:
+; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0
+; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0
+define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0
+; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
+define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_legacy_undef_f32:
+; SI-NOT: v_rsq_legacy_f32
+define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float undef)
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
new file mode 100644
index 000000000000..012f6cd82925
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.amdgcn.rsq.f32(float) #0
+declare double @llvm.amdgcn.rsq.f64(double) #0
+
+; FUNC-LABEL: {{^}}rsq_f32:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
+define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
+define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f64:
+; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
+ %rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0
+ store double %rsq, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_f64_constant_4.0
+; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0
+define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
+ %rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0
+ store double %rsq, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f64_constant_100.0
+; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000
+; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
+ %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0
+ store double %rsq, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_undef_f32:
+; SI-NOT: v_rsq_f32
+define void @rsq_undef_f32(float addrspace(1)* %out) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.f32(float undef)
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
new file mode 100644
index 000000000000..132e476d5e29
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_barrier:
+; GCN: buffer_store_dword
+; GCN: s_waitcnt
+; GCN: s_barrier
+define void @test_barrier(i32 addrspace(1)* %out) #0 {
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
+ store i32 %tmp, i32 addrspace(1)* %tmp1
+ call void @llvm.amdgcn.s.barrier()
+ %tmp2 = call i32 @llvm.r600.read.local.size.x()
+ %tmp3 = sub i32 %tmp2, 1
+ %tmp4 = sub i32 %tmp3, %tmp
+ %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
+ %tmp6 = load i32, i32 addrspace(1)* %tmp5
+ store i32 %tmp6, i32 addrspace(1)* %tmp1
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare i32 @llvm.r600.read.local.size.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
index f8af67c17ec2..ecd4ac6824cc 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare void @llvm.amdgcn.s.dcache.inv() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
; GCN-LABEL: {{^}}test_s_dcache_inv:
; GCN-NEXT: ; BB#0:
@@ -15,10 +16,11 @@ define void @test_s_dcache_inv() #0 {
; GCN-LABEL: {{^}}test_s_dcache_inv_insert_wait:
; GCN-NEXT: ; BB#0:
-; GCN-NEXT: s_dcache_inv
-; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; GCN: s_dcache_inv
+; GCN: s_waitcnt lgkmcnt(0) ; encoding
define void @test_s_dcache_inv_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv()
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
index a8502a7c5033..097f35d42c4f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare void @llvm.amdgcn.s.dcache.inv.vol() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
; GCN-LABEL: {{^}}test_s_dcache_inv_vol:
; GCN-NEXT: ; BB#0:
@@ -16,9 +17,10 @@ define void @test_s_dcache_inv_vol() #0 {
; GCN-LABEL: {{^}}test_s_dcache_inv_vol_insert_wait:
; GCN-NEXT: ; BB#0:
; GCN-NEXT: s_dcache_inv_vol
-; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; GCN: s_waitcnt lgkmcnt(0) ; encoding
define void @test_s_dcache_inv_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv.vol()
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
index f9ae09b391aa..9ecce7463f6b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
declare void @llvm.amdgcn.s.dcache.wb() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
; VI-LABEL: {{^}}test_s_dcache_wb:
; VI-NEXT: ; BB#0:
@@ -14,9 +15,10 @@ define void @test_s_dcache_wb() #0 {
; VI-LABEL: {{^}}test_s_dcache_wb_insert_wait:
; VI-NEXT: ; BB#0:
; VI-NEXT: s_dcache_wb
-; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; VI: s_waitcnt lgkmcnt(0) ; encoding
define void @test_s_dcache_wb_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb()
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
index d9145458a1f6..943f8c67a2e3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
declare void @llvm.amdgcn.s.dcache.wb.vol() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
; VI-LABEL: {{^}}test_s_dcache_wb_vol:
; VI-NEXT: ; BB#0:
@@ -14,9 +15,10 @@ define void @test_s_dcache_wb_vol() #0 {
; VI-LABEL: {{^}}test_s_dcache_wb_vol_insert_wait:
; VI-NEXT: ; BB#0:
; VI-NEXT: s_dcache_wb_vol
-; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; VI: s_waitcnt lgkmcnt(0) ; encoding
define void @test_s_dcache_wb_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb.vol()
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
new file mode 100644
index 000000000000..251eec656edc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+
+; FUNC-LABEL: {{^}}s_getreg_test:
+; CHECK: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
+define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+ %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #0
+ %lds_size_bytes = shl i32 %lds_size_64dwords, 8
+ store i32 %lds_size_bytes, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.s.getreg(i32) #0
+
+attributes #0 = { nounwind readonly}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
new file mode 100644
index 000000000000..372cba6eb67b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.s.memrealtime() #0
+
+; GCN-LABEL: {{^}}test_s_memrealtime:
+; GCN-DAG: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: s_load_dwordx2
+; GCN: lgkmcnt
+; GCN: buffer_store_dwordx2
+; GCN-NOT: lgkmcnt
+; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2
+define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
+ %cycle0 = call i64 @llvm.amdgcn.s.memrealtime()
+ store volatile i64 %cycle0, i64 addrspace(1)* %out
+
+ %cycle1 = call i64 @llvm.amdgcn.s.memrealtime()
+ store volatile i64 %cycle1, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
new file mode 100644
index 000000000000..8ce2d48733c6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.s.memtime() #0
+
+; GCN-LABEL: {{^}}test_s_memtime:
+; GCN-DAG: s_memtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: s_load_dwordx2
+; GCN: lgkmcnt
+; GCN: buffer_store_dwordx2
+; GCN-NOT: lgkmcnt
+; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2
+define void @test_s_memtime(i64 addrspace(1)* %out) #0 {
+ %cycle0 = call i64 @llvm.amdgcn.s.memtime()
+ store volatile i64 %cycle0, i64 addrspace(1)* %out
+
+ %cycle1 = call i64 @llvm.amdgcn.s.memtime()
+ store volatile i64 %cycle1, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
new file mode 100644
index 000000000000..870aa48a3417
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare void @llvm.amdgcn.s.sleep(i32) #0
+
+; GCN-LABEL: {{^}}test_s_sleep:
+; GCN: s_sleep 0{{$}}
+; GCN: s_sleep 1{{$}}
+; GCN: s_sleep 2{{$}}
+; GCN: s_sleep 3{{$}}
+; GCN: s_sleep 4{{$}}
+; GCN: s_sleep 5{{$}}
+; GCN: s_sleep 6{{$}}
+; GCN: s_sleep 7{{$}}
+; GCN: s_sleep 8{{$}}
+; GCN: s_sleep 9{{$}}
+; GCN: s_sleep 10{{$}}
+; GCN: s_sleep 11{{$}}
+; GCN: s_sleep 12{{$}}
+; GCN: s_sleep 13{{$}}
+; GCN: s_sleep 14{{$}}
+; GCN: s_sleep 15{{$}}
+define void @test_s_sleep(i32 %x) #0 {
+ call void @llvm.amdgcn.s.sleep(i32 0)
+ call void @llvm.amdgcn.s.sleep(i32 1)
+ call void @llvm.amdgcn.s.sleep(i32 2)
+ call void @llvm.amdgcn.s.sleep(i32 3)
+ call void @llvm.amdgcn.s.sleep(i32 4)
+ call void @llvm.amdgcn.s.sleep(i32 5)
+ call void @llvm.amdgcn.s.sleep(i32 6)
+ call void @llvm.amdgcn.s.sleep(i32 7)
+
+ ; Values that might only work on VI
+ call void @llvm.amdgcn.s.sleep(i32 8)
+ call void @llvm.amdgcn.s.sleep(i32 9)
+ call void @llvm.amdgcn.s.sleep(i32 10)
+ call void @llvm.amdgcn.s.sleep(i32 11)
+ call void @llvm.amdgcn.s.sleep(i32 12)
+ call void @llvm.amdgcn.s.sleep(i32 13)
+ call void @llvm.amdgcn.s.sleep(i32 14)
+ call void @llvm.amdgcn.s.sleep(i32 15)
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
new file mode 100644
index 000000000000..c2d48f99aac5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: image_store
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}
+; CHECK-NEXT: image_store
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) {
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+ ret void
+}
+
+; Test that the intrinsic is merged with automatically generated waits and
+; emitted as late as possible.
+;
+; CHECK-LABEL: {{^}}test2:
+; CHECK: image_load
+; CHECK-NOT: s_waitcnt vmcnt(0){{$}}
+; CHECK: s_waitcnt
+; CHECK-NEXT: image_store
+define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) {
+ %t = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00
+ %c.1 = mul i32 %c, 2
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
+
+declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
new file mode 100644
index 000000000000..9dc4554b88a4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+declare float @llvm.amdgcn.sin.f32(float) #0
+
+; GCN-LABEL: {{^}}v_sin_f32:
+; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
+ %sin = call float @llvm.amdgcn.sin.f32(float %src) #0
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index 6b546a7e17c1..7757e411553b 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
+declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
; SI-LABEL: {{^}}test_trig_preop_f64:
; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]]
@@ -12,7 +12,7 @@ declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
%a = load double, double addrspace(1)* %aptr, align 8
%b = load i32, i32 addrspace(1)* %bptr, align 4
- %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone
+ %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
}
@@ -24,7 +24,7 @@ define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)*
; SI: s_endpgm
define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
%a = load double, double addrspace(1)* %aptr, align 8
- %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone
+ %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone
store double %result, double addrspace(1)* %out, align 8
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
new file mode 100644
index 000000000000..c22eac7e271c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HSA -check-prefix=CI-HSA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HSA -check-prefix=VI-HSA %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+
+; ALL-LABEL {{^}}test_workgroup_id_x:
+
+; HSA: .amd_kernel_code_t
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; HSA: .end_amd_kernel_code_t
+
+; MESA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s2{{$}}
+; HSA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s6{{$}}
+
+; ALL-NOT: [[VCOPY]]
+; ALL: {{buffer|flat}}_store_dword {{.*}}[[VCOPY]]
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; ALL-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; ALL: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workgroup.id.x()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL {{^}}test_workgroup_id_y:
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 1
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; MESA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s3{{$}}
+; HSA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s7{{$}}
+
+; ALL-NOT: [[VCOPY]]
+; ALL: {{buffer|flat}}_store_dword {{.*}}[[VCOPY]]
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; ALL-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; ALL: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workgroup.id.y()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL {{^}}test_workgroup_id_z:
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 1
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: enable_sgpr_dispatch_id = 0
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: enable_sgpr_private_segment_size = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; MESA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s3{{$}}
+; HSA: v_mov_b32_e32 [[VCOPY:v[0-9]+]], s7{{$}}
+
+; ALL-NOT: [[VCOPY]]
+; ALL: {{buffer|flat}}_store_dword {{.*}}[[VCOPY]]
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; ALL-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; ALL: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workgroup.id.z()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
new file mode 100644
index 000000000000..28ef7b82ef84
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HSA -check-prefix=CI-HSA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HSA -check-prefix=VI-HSA %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+
+; MESA: .section .AMDGPU.config
+; MESA: .long 47180
+; MESA-NEXT: .long 132{{$}}
+
+; ALL-LABEL {{^}}test_workitem_id_x:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+
+; ALL-NOT: v0
+; ALL: {{buffer|flat}}_store_dword {{.*}}v0
+define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+; MESA: .section .AMDGPU.config
+; MESA: .long 47180
+; MESA-NEXT: .long 2180{{$}}
+
+; ALL-LABEL {{^}}test_workitem_id_y:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
+
+; ALL-NOT: v1
+; ALL: {{buffer|flat}}_store_dword {{.*}}v1
+define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.y()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+; MESA: .section .AMDGPU.config
+; MESA: .long 47180
+; MESA-NEXT: .long 4228{{$}}
+
+; ALL-LABEL {{^}}test_workitem_id_z:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
+
+; ALL-NOT: v2
+; ALL: {{buffer|flat}}_store_dword {{.*}}v2
+define void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.z()
+ store i32 %id, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
index 42df6db1ccfd..6b865d8076e6 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
@@ -3,7 +3,7 @@
; SI-LABEL: {{^}}kilp_gs_const:
; SI: s_mov_b64 exec, 0
-define void @kilp_gs_const() #0 {
+define amdgpu_gs void @kilp_gs_const() {
main_body:
%0 = icmp ule i32 0, 3
%1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
@@ -16,6 +16,4 @@ main_body:
declare void @llvm.AMDGPU.kilp(float)
-attributes #0 = { "ShaderType"="2" }
-
!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
deleted file mode 100644
index 0c3e4ecaa1a0..000000000000
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_lrp:
-; SI: v_mad_f32
-; SI: v_mac_f32_e32
-define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
- %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
- store float %mad, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.cos.ll b/test/CodeGen/AMDGPU/llvm.cos.ll
index c65df8b3e8da..eb7dcbbf2346 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
-;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
;FUNC-LABEL: test
;EG: MULADD_IEEE *
@@ -37,5 +37,3 @@ define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
declare float @llvm.cos.f32(float) readnone
declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index b01f8ab2bdf9..1a37ba311606 100644
--- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -14,17 +14,16 @@ entry:
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !12}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 244715) (llvm/trunk 244718)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 244715) (llvm/trunk 244718)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "/tmp/test_debug_value.cl", directory: "/Users/matt/src/llvm/build_debug")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test_debug_value", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, variables: !9)
+!4 = distinct !DISubprogram(name: "test_debug_value", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !9)
!5 = !DISubroutineType(types: !6)
!6 = !{null, !7}
!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 32)
diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll
index d83ab562b718..8398309d7520 100644
--- a/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -6,77 +6,77 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8
+
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
+; SI-DAG: ds_write_b8
; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
@@ -87,41 +87,41 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %
}
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
-; SI: ds_write_b16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+; SI-DAG: ds_read_u16
+
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
+; SI-DAG: ds_write_b16
; SI: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
@@ -153,15 +153,11 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %
; FIXME: Use 64-bit ops
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_read_b64
+; SI: ds_read2_b64
+; SI: ds_read2_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_write2_b64
+; SI: ds_write2_b64
; SI-DAG: s_endpgm
define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
diff --git a/test/CodeGen/AMDGPU/llvm.pow.ll b/test/CodeGen/AMDGPU/llvm.pow.ll
index c4ae652619c2..3f203ddf93b8 100644
--- a/test/CodeGen/AMDGPU/llvm.pow.ll
+++ b/test/CodeGen/AMDGPU/llvm.pow.ll
@@ -5,12 +5,12 @@
;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
-define void @test1(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test1(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = call float @llvm.pow.f32( float %r0, float %r1)
%vec = insertelement <4 x float> undef, float %r2, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
@@ -27,14 +27,12 @@ define void @test1(<4 x float> inreg %reg0) #0 {
;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
-define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_ps void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
%vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1)
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @llvm.pow.f32(float ,float ) readonly
declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
index 036cd2ca82a6..4db29c58385e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
@@ -1,11 +1,11 @@
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone
define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
%src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
%src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
- %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
+ %dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
store float %dp4, float addrspace(1)* %out, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
new file mode 100644
index 000000000000..e4e6dd8e1069
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+
+; EG-LABEL: {{^}}test_group_barrier:
+; EG: GROUP_BARRIER
+define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
+entry:
+ %tmp = call i32 @llvm.r600.read.tidig.x()
+ %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
+ store i32 %tmp, i32 addrspace(1)* %tmp1
+ call void @llvm.r600.group.barrier()
+ %tmp2 = call i32 @llvm.r600.read.local.size.x()
+ %tmp3 = sub i32 %tmp2, 1
+ %tmp4 = sub i32 %tmp3, %tmp
+ %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
+ %tmp6 = load i32, i32 addrspace(1)* %tmp5
+ store i32 %tmp6, i32 addrspace(1)* %tmp1
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.r600.group.barrier() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll
new file mode 100644
index 000000000000..2f5947395c43
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.read.workdim.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+
+; EG-LABEL: {{^}}read_workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV * [[VAL]], KC0[2].Z
+define void @read_workdim(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.r600.read.workdim() #0
+ store i32 %dim, i32 addrspace(1)* %out
+ ret void
+}
+
+; EG-LABEL: {{^}}read_workdim_known_bits:
+define void @read_workdim_known_bits(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.r600.read.workdim() #0
+ %shl = shl i32 %dim, 24
+ %shr = lshr i32 %shl, 24
+ store i32 %shr, i32 addrspace(1)* %out
+ ret void
+}
+
+; EG-LABEL: {{^}}legacy_read_workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV * [[VAL]], KC0[2].Z
+define void @legacy_read_workdim(i32 addrspace(1)* %out) {
+entry:
+ %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+ store i32 %dim, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.workdim() #0
+declare i32 @llvm.AMDGPU.read.workdim() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
new file mode 100644
index 000000000000..1c6e7950e9b7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+
+declare float @llvm.r600.recipsqrt.clamped.f32(float) nounwind readnone
+
+; EG-LABEL: {{^}}rsq_clamped_f32:
+; EG: RECIPSQRT_CLAMPED
+define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rsq_clamped = call float @llvm.r600.recipsqrt.clamped.f32(float %src)
+ store float %rsq_clamped, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
new file mode 100644
index 000000000000..1d6bff01e662
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+
+declare float @llvm.r600.recipsqrt.ieee.f32(float) nounwind readnone
+
+; EG-LABEL: {{^}}recipsqrt.ieee_f32:
+; EG: RECIPSQRT_IEEE
+define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
+ %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone
+ store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0
+; EG: RECIPSQRT_IEEE
+define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind {
+ %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone
+ store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0
+; EG: RECIPSQRT_IEEE
+define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind {
+ %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone
+ store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.r600.tex.ll b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
new file mode 100644
index 000000000000..409037f3e976
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
@@ -0,0 +1,65 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+bb:
+ %addr = load <4 x float>, <4 x float> addrspace(1)* %in
+ %tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp1 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp3 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp5 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp6 = shufflevector <4 x float> %tmp5, <4 x float> %tmp5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp7 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp8 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp9 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1)
+ %tmp10 = shufflevector <4 x float> %tmp9, <4 x float> %tmp9, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %tmp11 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp12 = shufflevector <4 x float> %tmp11, <4 x float> %tmp11, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %tmp13 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp14 = shufflevector <4 x float> %tmp13, <4 x float> %tmp13, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %tmp15 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1)
+ %tmp16 = shufflevector <4 x float> %tmp15, <4 x float> %tmp15, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
+ %tmp17 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ %tmp18 = shufflevector <4 x float> %tmp17, <4 x float> %tmp17, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp19 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ %tmp20 = shufflevector <4 x float> %tmp19, <4 x float> %tmp19, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+ %tmp21 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ %tmp22 = shufflevector <4 x float> %tmp21, <4 x float> %tmp21, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp23 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ %tmp24 = shufflevector <4 x float> %tmp23, <4 x float> %tmp23, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp25 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp26 = shufflevector <4 x float> %tmp25, <4 x float> %tmp25, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp27 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp28 = shufflevector <4 x float> %tmp27, <4 x float> %tmp27, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp29 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp30 = shufflevector <4 x float> %tmp29, <4 x float> %tmp29, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp31 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1)
+ store <4 x float> %tmp31, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.texc(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.rint.ll b/test/CodeGen/AMDGPU/llvm.rint.ll
index 661db51ad032..cf7c0e4c6fb6 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; FUNC-LABEL: {{^}}rint_f32:
; R600: RNDNE
@@ -43,18 +43,6 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32:
-; R600: RNDNE
-
-; SI: v_rndne_f32_e32
-define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) {
-entry:
- %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-declare float @llvm.AMDIL.round.nearest.f32(float) #0
declare float @llvm.rint.f32(float) #0
declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0
declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0
diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 98afbeee93e6..a8024b713261 100644
--- a/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -13,7 +13,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
; FUNC-LABEL: {{^}}v_round_f64:
; SI: buffer_load_dwordx2
-; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
+; SI-DAG: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
; SI-DAG: v_not_b32_e32
; SI-DAG: v_not_b32_e32
@@ -27,7 +27,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
%out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
%x = load double, double addrspace(1)* %gep
@@ -60,7 +60,7 @@ define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
declare double @llvm.round.f64(double) #1
declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index d0e49243ffa7..9b7bb00d3c38 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -5,9 +5,9 @@
; FUNC-LABEL: {{^}}round_f32:
; SI-DAG: s_load_dword [[SX:s[0-9]+]]
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff
-; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
-; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
-; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; SI-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; SI-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
; SI: v_cmp_le_f32_e64 vcc, 0.5, |[[SUB]]|
; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
diff --git a/test/CodeGen/AMDGPU/llvm.sin.ll b/test/CodeGen/AMDGPU/llvm.sin.ll
index 3bb245c2e249..04754396a0f7 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -1,8 +1,5 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: sin_f32
; EG: MULADD_IEEE *
@@ -10,58 +7,91 @@
; EG: ADD *
; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; EG-NOT: SIN
+
; SI: v_mul_f32
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-
define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
%sin = call float @llvm.sin.f32(float %x)
store float %sin, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}sin_3x_f32:
-; SI-UNSAFE-NOT: v_add_f32
-; SI-UNSAFE: 0x3ef47644
-; SI-UNSAFE: v_mul_f32
-; SI-SAFE: v_mul_f32
-; SI-SAFE: v_mul_f32
+; FUNC-LABEL: {{^}}safe_sin_3x_f32:
+; SI: v_mul_f32
+; SI: v_mul_f32
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
%y = fmul float 3.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}sin_2x_f32:
-; SI-UNSAFE-NOT: v_add_f32
-; SI-UNSAFE: 0x3ea2f983
-; SI-UNSAFE: v_mul_f32
-; SI-SAFE: v_add_f32
-; SI-SAFE: v_mul_f32
+; FUNC-LABEL: {{^}}unsafe_sin_3x_f32:
+; SI-NOT: v_add_f32
+; SI: 0x3ef47644
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
+ %y = fmul float 3.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_sin_2x_f32:
+; SI: v_add_f32
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+ %y = fmul float 2.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_sin_2x_f32:
+; SI-NOT: v_add_f32
+; SI: 0x3ea2f983
+; SI: v_mul_f32
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: {{^}}test_2sin_f32:
-; SI-UNSAFE: 0x3ea2f983
-; SI-UNSAFE: v_mul_f32
-; SI-SAFE: v_add_f32
-; SI-SAFE: v_mul_f32
+; FUNC-LABEL: {{^}}test_safe_2sin_f32:
+; SI: v_add_f32
+; SI: v_mul_f32
; SI: v_fract_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+ %y = fmul float 2.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_unsafe_2sin_f32:
+; SI: 0x3ea2f983
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, float addrspace(1)* %out
@@ -74,19 +104,21 @@ define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 {
; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; EG-NOT: SIN
+
; SI: v_sin_f32
; SI: v_sin_f32
; SI: v_sin_f32
; SI: v_sin_f32
; SI-NOT: v_sin_f32
-
define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
%sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
store <4 x float> %sin, <4 x float> addrspace(1)* %out
ret void
}
-declare float @llvm.sin.f32(float) readnone
-declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone
+declare float @llvm.sin.f32(float) #0
+declare <4 x float> @llvm.sin.v4f32(<4 x float>) #0
-attributes #0 = { "ShaderType"="0" }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="false" }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.ll b/test/CodeGen/AMDGPU/llvm.sqrt.ll
deleted file mode 100644
index c6da047f5392..000000000000
--- a/test/CodeGen/AMDGPU/llvm.sqrt.ll
+++ /dev/null
@@ -1,105 +0,0 @@
-; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI
-
-; R600-LABEL: {{^}}sqrt_f32:
-; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
-; SI-LABEL: {{^}}sqrt_f32:
-; SI: v_sqrt_f32_e32
-define void @sqrt_f32(float addrspace(1)* %out, float %in) {
-entry:
- %0 = call float @llvm.sqrt.f32(float %in)
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-; R600-LABEL: {{^}}sqrt_v2f32:
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
-; SI-LABEL: {{^}}sqrt_v2f32:
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
- %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- store <2 x float> %0, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; R600-LABEL: {{^}}sqrt_v4f32:
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
-; SI-LABEL: {{^}}sqrt_v4f32:
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
- %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
- store <4 x float> %0, <4 x float> addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}elim_redun_check:
-; SI: v_sqrt_f32_e32
-; SI-NOT: v_cndmask
-define void @elim_redun_check(float addrspace(1)* %out, float %in) {
-entry:
- %sqrt = call float @llvm.sqrt.f32(float %in)
- %cmp = fcmp olt float %in, -0.000000e+00
- %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
- store float %res, float addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}elim_redun_check_ult:
-; SI: v_sqrt_f32_e32
-; SI-NOT: v_cndmask
-define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) {
-entry:
- %sqrt = call float @llvm.sqrt.f32(float %in)
- %cmp = fcmp ult float %in, -0.000000e+00
- %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
- store float %res, float addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}elim_redun_check_v2:
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-; SI-NOT: v_cndmask
-define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
- %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
- %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
- store <2 x float> %res, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}elim_redun_check_v2_ult
-; SI: v_sqrt_f32_e32
-; SI: v_sqrt_f32_e32
-; SI-NOT: v_cndmask
-define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
- %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
- %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
- store <2 x float> %res, <2 x float> addrspace(1)* %out
- ret void
-}
-
-declare float @llvm.sqrt.f32(float %in)
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/AMDGPU/load-constant-f64.ll b/test/CodeGen/AMDGPU/load-constant-f64.ll
new file mode 100644
index 000000000000..f94a3785a685
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}constant_load_f64:
+; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
+; GCN-NOHSA: buffer_store_dwordx2
+; GCN-HSA: flat_store_dwordx2
+define void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+ %ld = load double, double addrspace(2)* %in
+ store double %ld, double addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i1.ll b/test/CodeGen/AMDGPU/load-constant-i1.ll
new file mode 100644
index 000000000000..f15e4f484ffa
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -0,0 +1,371 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}constant_load_i1:
+; GCN: buffer_load_ubyte
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
+; GCN: buffer_store_byte
+
+; EG: VTX_READ_8
+; EG: AND_INT
+define void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %load = load i1, i1 addrspace(2)* %in
+ store i1 %load, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i1:
+define void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ store <2 x i1> %load, <2 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i1:
+define void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ store <3 x i1> %load, <3 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i1:
+define void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ store <4 x i1> %load, <4 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i1:
+define void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ store <8 x i1> %load, <8 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i1:
+define void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ store <16 x i1> %load, <16 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v32i1:
+define void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ store <32 x i1> %load, <32 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v64i1:
+define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ store <64 x i1> %load, <64 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_dword
+define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %a = load i1, i1 addrspace(2)* %in
+ %ext = zext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i1_to_i32:
+; GCN: buffer_load_ubyte
+; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
+; GCN: buffer_store_dword
+
+; EG: VTX_READ_8
+; EG: BFE_INT
+define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %a = load i1, i1 addrspace(2)* %in
+ %ext = sext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32:
+define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+ %ext = zext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32:
+define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+ %ext = sext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32:
+define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ %ext = zext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32:
+define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ %ext = sext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32:
+define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ %ext = zext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32:
+define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ %ext = sext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32:
+define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ %ext = zext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32:
+define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ %ext = sext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32:
+define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ %ext = zext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32:
+define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ %ext = sext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32:
+define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ %ext = zext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32:
+define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ %ext = sext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32:
+define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ %ext = zext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32:
+define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ %ext = sext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32:
+define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ %ext = zext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32:
+define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ %ext = sext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i1_to_i64:
+; GCN-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]
+; GCN: buffer_store_dwordx2
+define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %a = load i1, i1 addrspace(2)* %in
+ %ext = zext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i1_to_i64:
+; GCN: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
+; GCN: buffer_store_dwordx2
+define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+ %a = load i1, i1 addrspace(2)* %in
+ %ext = sext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64:
+define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+ %ext = zext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64:
+define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
+ %ext = sext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64:
+define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ %ext = zext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64:
+define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
+ %ext = sext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64:
+define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ %ext = zext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64:
+define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
+ %ext = sext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64:
+define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ %ext = zext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64:
+define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
+ %ext = sext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64:
+define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ %ext = zext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64:
+define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
+ %ext = sext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64:
+define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ %ext = zext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64:
+define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
+ %ext = sext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64:
+define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ %ext = zext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64:
+define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
+ %ext = sext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64:
+define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ %ext = zext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64:
+define void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
+ %ext = sext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll
new file mode 100644
index 000000000000..ef9791d8f7a1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -0,0 +1,441 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}constant_load_i16:
+; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
+; GCN-HSA: flat_load_ushort
+
+; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+entry:
+ %ld = load i16, i16 addrspace(2)* %in
+ store i16 %ld, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i16:
+; GCN: s_load_dword s
+
+; EG: VTX_READ_32
+define void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i16:
+; GCN: s_load_dwordx2 s
+
+; EG-DAG: VTX_READ_32
+; EG-DAG: VTX_READ_16
+define void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+ store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i16:
+; GCN: s_load_dwordx2
+
+; EG: VTX_READ_64
+define void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i16:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i16:
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-NOHSA: buffer_store_dword
+
+; GCN-HSA: flat_load_ushort
+; GCN-HSA: flat_store_dword
+
+; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+ %a = load i16, i16 addrspace(2)* %in
+ %ext = zext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:
+; GCN-NOHSA: buffer_load_sshort
+; GCN-NOHSA: buffer_store_dword
+
+; GCN-HSA: flat_load_sshort
+; GCN-HSA: flat_store_dword
+
+; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EG: 16
+define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+ %a = load i16, i16 addrspace(2)* %in
+ %ext = sext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+ %ext = zext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:
+; GCN-NOHSA: buffer_load_sshort
+; GCN-HSA: flat_load_sshort
+define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+ %ext = sext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i32:
+; GCN: s_load_dword s
+; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff{{$}}
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ %ext = zext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i32:
+; GCN: s_load_dword s
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+
+; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: 16
+; EG-DAG: 16
+define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ %ext = sext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_constant_zextload_v3i16_to_v3i32:
+; GCN: s_load_dwordx2
+define void @constant_constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+ %ext = zext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_constant_sextload_v3i16_to_v3i32:
+; GCN: s_load_dwordx2
+define void @constant_constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
+ %ext = sext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_constant_zextload_v4i16_to_v4i32:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ %ext = zext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i32:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+
+; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; EG-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; EG-DAG: 16
+; EG-DAG: 16
+; EG-DAG: 16
+; EG-DAG: 16
+define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ %ext = sext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i32:
+; GCN: s_load_dwordx4
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ %ext = zext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i32:
+; GCN: s_load_dwordx4
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ %ext = sext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i32:
+; GCN: s_load_dwordx8
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ %ext = zext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i32:
+; GCN: s_load_dwordx8
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ %ext = sext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
+; GCN-DAG: s_load_dwordx16
+; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+ %ext = zext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32:
+; GCN: s_load_dwordx16
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i16
+define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+ %ext = sext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
+; GCN: s_load_dwordx16
+; GCN: s_load_dwordx16
+define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i32:
+define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+ %ext = sext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:
+; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+ %a = load i16, i16 addrspace(2)* %in
+ %ext = zext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
+; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
+; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+ %a = load i16, i16 addrspace(2)* %in
+ %ext = sext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i64:
+define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+ %ext = zext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i64:
+define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
+ %ext = sext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:
+define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ %ext = zext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:
+define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
+ %ext = sext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:
+define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ %ext = zext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:
+define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
+ %ext = sext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:
+define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ %ext = zext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:
+define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
+ %ext = sext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i64:
+define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ %ext = zext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i64:
+define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
+ %ext = sext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i64:
+define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+ %ext = zext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i64:
+define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
+ %ext = sext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
+; define void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+; %ext = zext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
+; define void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
+; %ext = sext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i32.ll b/test/CodeGen/AMDGPU/load-constant-i32.ll
new file mode 100644
index 000000000000..40c29be60548
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -0,0 +1,380 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}constant_load_i32:
+; GCN: s_load_dword s{{[0-9]+}}
+
+; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+entry:
+ %ld = load i32, i32 addrspace(2)* %in
+ store i32 %ld, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i32:
+; GCN: s_load_dwordx2
+
+; EG: VTX_READ_64
+define void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+ store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i32:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in
+ store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i32:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+ store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i32:
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+ store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i32:
+; GCN: s_load_dwordx16
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+ store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i32_to_i64:
+; GCN-DAG: s_load_dword s[[SLO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[SHI:[0-9]+]], 0{{$}}
+; GCN: store_dwordx2
+
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG: CF_END
+; EG: VTX_READ_32
+define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+ %ld = load i32, i32 addrspace(2)* %in
+ %ext = zext i32 %ld to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i32_to_i64:
+; GCN: s_load_dword s[[SLO:[0-9]+]]
+; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[SLO]], 31
+; GCN: store_dwordx2
+
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG: CF_END
+; EG: VTX_READ_32
+; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
+; EG: 31
+define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+ %ld = load i32, i32 addrspace(2)* %in
+ %ext = sext i32 %ld to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
+; GCN: s_load_dword
+; GCN: store_dwordx2
+define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
+ %ext = zext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i32_to_v1i64:
+; GCN: s_load_dword s[[LO:[0-9]+]]
+; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
+; GCN: store_dwordx2
+define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
+ %ext = sext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
+; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN: store_dwordx4
+define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+ %ext = zext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i32_to_v2i64:
+; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_ashr_i32
+
+; GCN: store_dwordx4
+define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
+ %ext = sext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i32_to_v4i64:
+; GCN: s_load_dwordx4
+
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+ %ext = zext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i32_to_v4i64:
+; GCN: s_load_dwordx4
+
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
+ %ext = sext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i32_to_v8i64:
+; GCN: s_load_dwordx8
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+ %ext = zext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i32_to_v8i64:
+; GCN: s_load_dwordx8
+
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+; GCN: s_ashr_i32
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
+ %ext = sext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i32_to_v16i64:
+; GCN: s_load_dwordx16
+
+
+; GCN-DAG: s_ashr_i32
+
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+; GCN: store_dwordx4
+define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+ %ext = sext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i32_to_v16i64:
+; GCN: s_load_dwordx16
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
+ %ext = zext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i32_to_v32i64:
+
+; GCN: s_load_dwordx16
+; GCN: s_load_dwordx16
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
+ %ext = sext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i32_to_v32i64:
+; GCN: s_load_dwordx16
+; GCN: s_load_dwordx16
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
+ %ext = zext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i64.ll b/test/CodeGen/AMDGPU/load-constant-i64.ll
new file mode 100644
index 000000000000..e4656a2b2ac6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}constant_load_i64:
+; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; EG: VTX_READ_64
+define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
+ %ld = load i64, i64 addrspace(2)* %in
+ store i64 %ld, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i64:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in
+ store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i64:
+; GCN: s_load_dwordx8 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+
+; EG-DAG: VTX_READ_128
+; EG-DAG: VTX_READ_128
+define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in
+ store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i64:
+; GCN: s_load_dwordx8
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in
+ store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i64:
+; GCN: s_load_dwordx16
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in
+ store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i64:
+; GCN: s_load_dwordx16
+; GCN: s_load_dwordx16
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in
+ store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll
new file mode 100644
index 000000000000..87828982a987
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -0,0 +1,567 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}constant_load_i8:
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}
+; GCN-HSA: flat_load_ubyte
+
+; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+entry:
+ %ld = load i8, i8 addrspace(2)* %in
+ store i8 %ld, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v2i8:
+; GCN-NOHSA: buffer_load_ushort v
+; GCN-HSA: flat_load_ushort v
+
+; EG: VTX_READ_16
+define void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v3i8:
+; GCN: s_load_dword s
+
+; EG-DAG: VTX_READ_32
+define void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+ store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v4i8:
+; GCN: s_load_dword s
+
+; EG: VTX_READ_32
+define void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v8i8:
+; GCN: s_load_dwordx2
+
+; EG: VTX_READ_64
+define void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_load_v16i8:
+; GCN: s_load_dwordx4
+
+; EG: VTX_READ_128
+define void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i8_to_i32:
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; GCN-HSA: flat_load_ubyte
+
+; EG: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = zext i8 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i8_to_i32:
+; GCN-NOHSA: buffer_load_sbyte
+; GCN-HSA: flat_load_sbyte
+
+; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EG: 8
+define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %ld = load i8, i8 addrspace(2)* %in
+ %ext = sext i8 %ld to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32:
+define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = zext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i8_to_v1i32:
+define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = sext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = zext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i32:
+; GCN-NOHSA: buffer_load_ushort
+
+; GCN-HSA: flat_load_ushort
+
+; GCN: v_bfe_i32
+; GCN: v_bfe_i32
+
+; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: 8
+; EG-DAG: 8
+define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = sext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v3i8_to_v3i32:
+; GCN: s_load_dword s
+
+; GCN-DAG: s_bfe_u32
+; GCN-DAG: s_bfe_u32
+; GCN-DAG: s_and_b32
+define void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+ %ext = zext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v3i8_to_v3i32:
+; GCN: s_load_dword s
+
+; GCN-DAG: s_bfe_i32
+; GCN-DAG: s_bfe_i32
+; GCN-DAG: s_bfe_i32
+define void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+ %ext = sext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i32:
+; GCN: s_load_dword s
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = zext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i32:
+; GCN: s_load_dword s
+; GCN-DAG: s_sext_i32_i8
+; GCN-DAG: s_ashr_i32
+
+; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; EG-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; EG-DAG: 8
+; EG-DAG: 8
+; EG-DAG: 8
+; EG-DAG: 8
+define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = sext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i32:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_and_b32
+; GCN-DAG: s_lshr_b32
+define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = zext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i32:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_ashr_i32
+; GCN-DAG: s_sext_i32_i8
+define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = sext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i32:
+define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = zext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i32:
+define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = sext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i8_to_v32i32:
+define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = zext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i8_to_v32i32:
+define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = sext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i32:
+define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+ %ext = zext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i32:
+define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+ %ext = sext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_i8_to_i64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA-DAG: buffer_load_ubyte v[[LO:[0-9]+]],
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+
+; GCN-HSA-DAG: flat_load_ubyte v[[LO:[0-9]+]],
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
+define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = zext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i8_to_i64:
+; GCN-NOHSA: buffer_load_sbyte v[[LO:[0-9]+]],
+; GCN-HSA: flat_load_sbyte v[[LO:[0-9]+]],
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = sext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i64:
+define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = zext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i8_to_v1i64:
+define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = sext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64:
+define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = zext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64:
+define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = sext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64:
+define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = zext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64:
+define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = sext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64:
+define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = zext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64:
+define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = sext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64:
+define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = zext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64:
+define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = sext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i8_to_v32i64:
+define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = zext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i8_to_v32i64:
+define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = sext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64:
+; define void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; %ext = zext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64:
+; define void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; %ext = sext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}constant_zextload_i8_to_i16:
+; GCN-NOHSA: buffer_load_ubyte v[[VAL:[0-9]+]],
+; GCN-NOHSA: buffer_store_short v[[VAL]]
+
+; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
+; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
+define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = zext i8 %a to i16
+ store i16 %ext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_i8_to_i16:
+; GCN-NOHSA: buffer_load_sbyte v[[VAL:[0-9]+]],
+; GCN-HSA: flat_load_sbyte v[[VAL:[0-9]+]],
+
+; GCN-NOHSA: buffer_store_short v[[VAL]]
+; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
+define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+ %a = load i8, i8 addrspace(2)* %in
+ %ext = sext i8 %a to i16
+ store i16 %ext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16:
+define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = zext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v1i8_to_v1i16:
+define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
+ %ext = sext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16:
+define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = zext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i16:
+define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
+ %ext = sext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16:
+define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = zext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i16:
+define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
+ %ext = sext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16:
+define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = zext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i16:
+define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
+ %ext = sext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16:
+define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = zext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i16:
+define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
+ %ext = sext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_zextload_v32i8_to_v32i16:
+define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = zext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_sextload_v32i8_to_v32i16:
+define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
+ %ext = sext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16:
+; define void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; %ext = zext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16:
+; define void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
+; %ext = sext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll
new file mode 100644
index 000000000000..23f4a6079e81
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_load_f32:
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
+; GCN-HSA: flat_load_dword
+
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load float, float addrspace(1)* %in
+ store float %tmp0, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2f32:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; R600: VTX_READ_64
+define void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
+ store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+define void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
+ store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+define void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
+ store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
+ store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16f32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+define void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+entry:
+ %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
+ store <16 x float> %tmp0, <16 x float> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll
new file mode 100644
index 000000000000..a86cc5a6d3d4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -0,0 +1,94 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_load_f64:
+; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-NOHSA: buffer_store_dwordx2 [[VAL]]
+
+; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
+define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %ld = load double, double addrspace(1)* %in
+ store double %ld, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <2 x double>, <2 x double> addrspace(1)* %in
+ store <2 x double> %ld, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x double>, <3 x double> addrspace(1)* %in
+ store <3 x double> %ld, <3 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <4 x double>, <4 x double> addrspace(1)* %in
+ store <4 x double> %ld, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <8 x double>, <8 x double> addrspace(1)* %in
+ store <8 x double> %ld, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16f64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <16 x double>, <16 x double> addrspace(1)* %in
+ store <16 x double> %ld, <16 x double> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i1.ll b/test/CodeGen/AMDGPU/load-global-i1.ll
new file mode 100644
index 000000000000..ebfec781087e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -0,0 +1,371 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_load_i1:
+; GCN: buffer_load_ubyte
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
+; GCN: buffer_store_byte
+
+; EG: VTX_READ_8
+; EG: AND_INT
+define void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %load = load i1, i1 addrspace(1)* %in
+ store i1 %load, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i1:
+define void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ store <2 x i1> %load, <2 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i1:
+define void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ store <3 x i1> %load, <3 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i1:
+define void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ store <4 x i1> %load, <4 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i1:
+define void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ store <8 x i1> %load, <8 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i1:
+define void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ store <16 x i1> %load, <16 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v32i1:
+define void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ store <32 x i1> %load, <32 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v64i1:
+define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ store <64 x i1> %load, <64 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_dword
+define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i1_to_i32:
+; GCN: buffer_load_ubyte
+; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
+; GCN: buffer_store_dword
+
+; EG: VTX_READ_8
+; EG: BFE_INT
+define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32:
+define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = zext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32:
+define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = sext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32:
+define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = zext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32:
+define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = sext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32:
+define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ %ext = zext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32:
+define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ %ext = sext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32:
+define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = zext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32:
+define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = sext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32:
+define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = zext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32:
+define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = sext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32:
+define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = zext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32:
+define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = sext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32:
+define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ %ext = zext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32:
+define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ %ext = sext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32:
+define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ %ext = zext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32:
+define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ %ext = sext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i1_to_i64:
+; GCN-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}}
+; GCN: buffer_store_dwordx2
+define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i1_to_i64:
+; GCN: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
+; GCN: buffer_store_dwordx2
+define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64:
+define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = zext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64:
+define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = sext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64:
+define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = zext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64:
+define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = sext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64:
+define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ %ext = zext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64:
+define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
+ %ext = sext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64:
+define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = zext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64:
+define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = sext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64:
+define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = zext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64:
+define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = sext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64:
+define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = zext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64:
+define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = sext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64:
+define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ %ext = zext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64:
+define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+ %ext = sext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64:
+define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ %ext = zext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64:
+define void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+ %ext = sext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
new file mode 100644
index 000000000000..11e6b10c38ff
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -0,0 +1,476 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FIXME: r600 is broken because the larger testcases spill, and spilling is not implemented there.
+
+; FUNC-LABEL: {{^}}global_load_i16:
+; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
+; GCN-HSA: flat_load_ushort
+
+; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+entry:
+ %ld = load i16, i16 addrspace(1)* %in
+ store i16 %ld, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i16:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; EG: VTX_READ_32
+define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i16:
+; GCN-NOHSA: buffer_load_dwordx2 v
+; GCN-HSA: flat_load_dwordx2 v
+
+; EG-DAG: VTX_READ_32
+; EG-DAG: VTX_READ_16
+define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
+ store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i16:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; EG: VTX_READ_64
+define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i16:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i16:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i16_to_i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-NOHSA: buffer_store_dword
+
+; GCN-HSA: flat_load_ushort
+; GCN-HSA: flat_store_dword
+
+; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = zext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i16_to_i32:
+; GCN-NOHSA: buffer_load_sshort
+; GCN-NOHSA: buffer_store_dword
+
+; GCN-HSA: flat_load_sshort
+; GCN-HSA: flat_store_dword
+
+; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EG: 16
+define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = sext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = zext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i32:
+; GCN-NOHSA: buffer_load_sshort
+; GCN-HSA: flat_load_sshort
+define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = sext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i32:
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: flat_load_dword
+define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = zext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i32:
+; GCN-NOHSA: buffer_load_dword
+
+; GCN-HSA: flat_load_dword
+
+; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: 16
+; EG-DAG: 16
+define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = sext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v3i16_to_v3i32:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
+ %ext = zext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v3i16_to_v3i32:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
+ %ext = sext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i32:
+; GCN-NOHSA: buffer_load_dwordx2
+
+; GCN-HSA: flat_load_dwordx2
+
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = zext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i32:
+; GCN-NOHSA: buffer_load_dwordx2
+
+; GCN-HSA: flat_load_dwordx2
+
+; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; EG-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; EG-DAG: 16
+; EG-DAG: 16
+; EG-DAG: 16
+; EG-DAG: 16
+define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = sext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = zext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = sext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = zext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i32:
+define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = sext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = zext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = sext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i32:
+define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+ %ext = sext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i16_to_i64:
+; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = zext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i16_to_i64:
+; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
+; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = sext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i64:
+define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = zext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i64:
+define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = sext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64:
+define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = zext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64:
+define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = sext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64:
+define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = zext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64:
+define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = sext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64:
+define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = zext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64:
+define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = sext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i64:
+define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = zext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i64:
+define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = sext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i64:
+define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = zext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i64:
+define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = sext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64:
+; define void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+; %ext = zext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64:
+; define void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+; %ext = sext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll
new file mode 100644
index 000000000000..5e1171a69be5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -0,0 +1,521 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}global_load_i32:
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
+; GCN-HSA: flat_load_dword
+
+; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+entry:
+ %ld = load i32, i32 addrspace(1)* %in
+ store i32 %ld, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i32:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; EG: VTX_READ_64
+define void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+ store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i32:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i32_to_i64:
+; GCN-NOHSA-DAG: buffer_load_dword v[[LO:[0-9]+]],
+; GCN-HSA-DAG: flat_load_dword v[[LO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %ld = load i32, i32 addrspace(1)* %in
+ %ext = zext i32 %ld to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i32_to_i64:
+; GCN-NOHSA: buffer_load_dword v[[LO:[0-9]+]]
+; GCN-HSA: flat_load_dword v[[LO:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+
+
+; EG: MEM_RAT
+; EG: VTX_READ_32
+; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.
+; EG: 31
+define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %ld = load i32, i32 addrspace(1)* %in
+ %ext = sext i32 %ld to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i32_to_v1i64:
+; GCN-NOHSA: buffer_load_dword
+; GCN-NOHSA: buffer_store_dwordx2
+
+; GCN-HSA: flat_load_dword
+; GCN-HSA: flat_store_dwordx2
+define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
+ %ext = zext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i32_to_v1i64:
+; GCN-NOHSA: buffer_load_dword v[[LO:[0-9]+]]
+; GCN-HSA: flat_load_dword v[[LO:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
+ %ext = sext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i32_to_v2i64:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_load_dwordx2
+; GCN-HSA: flat_store_dwordx4
+define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %ext = zext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i32_to_v2i64:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %ext = sext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i32_to_v4i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %ext = zext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i32_to_v4i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %ext = sext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i32_to_v8i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ %ext = zext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i32_to_v8i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ %ext = sext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i32_to_v16i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ %ext = sext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i32_to_v16i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ %ext = zext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i32_to_v32i64:
+
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+; GCN-DAG: v_ashrrev_i32
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+
+define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
+ %ext = sext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i32_to_v32i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+; GCN-NOHSA-DAG: buffer_store_dwordx4
+
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+; GCN-HSA-DAG: flat_store_dwordx4
+define void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
+ %ext = zext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll
new file mode 100644
index 000000000000..305b954c78f9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -0,0 +1,122 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_load_i64:
+; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-NOHSA: buffer_store_dwordx2 [[VAL]]
+
+; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
+
+; EG: VTX_READ_64
+define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %ld = load i64, i64 addrspace(1)* %in
+ store i64 %ld, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in
+ store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in
+ store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <4 x i64>, <4 x i64> addrspace(1)* %in
+ store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <8 x i64>, <8 x i64> addrspace(1)* %in
+ store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i64:
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <16 x i64>, <16 x i64> addrspace(1)* %in
+ store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
new file mode 100644
index 000000000000..b697967f1a23
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -0,0 +1,564 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}global_load_i8:
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}
+; GCN-HSA: flat_load_ubyte
+
+; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+define void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+entry:
+ %ld = load i8, i8 addrspace(1)* %in
+ store i8 %ld, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v2i8:
+; GCN-NOHSA: buffer_load_ushort v
+; GCN-HSA: flat_load_ushort v
+
+; EG: VTX_READ_16
+define void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v3i8:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; EG-DAG: VTX_READ_32
+define void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v4i8:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; EG: VTX_READ_32
+define void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v8i8:
+; GCN-NOHSA: buffer_load_dwordx2
+; GCN-HSA: flat_load_dwordx2
+
+; EG: VTX_READ_64
+define void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_load_v16i8:
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+define void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i8_to_i32:
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; GCN-HSA: flat_load_ubyte
+
+; EG: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i8_to_i32:
+; GCN-NOHSA: buffer_load_sbyte
+; GCN-HSA: flat_load_sbyte
+
+; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EG: 8
+define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %ld = load i8, i8 addrspace(1)* %in
+ %ext = sext i8 %ld to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32:
+define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = zext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i8_to_v1i32:
+define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = sext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = zext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i32:
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+
+; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: 8
+; EG-DAG: 8
+define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = sext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v3i8_to_v3i32:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
+; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
+define void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ %ext = zext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v3i8_to_v3i32:
+; GCN-NOHSA: buffer_load_dword v
+; GCN-HSA: flat_load_dword v
+
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
+define void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ %ext = sext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i32:
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: flat_load_dword
+
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = zext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i32:
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: flat_load_dword
+
+; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; EG-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; EG-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; EG-DAG: 8
+; EG-DAG: 8
+; EG-DAG: 8
+; EG-DAG: 8
+define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = sext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i32:
+define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = zext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i32:
+define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = sext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i32:
+define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = zext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i32:
+define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = sext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i8_to_v32i32:
+define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = zext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i8_to_v32i32:
+define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = sext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i32:
+define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+ %ext = zext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i32:
+define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+ %ext = sext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i8_to_i64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN-NOHSA-DAG: buffer_load_ubyte v[[LO:[0-9]+]],
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+
+; GCN-HSA-DAG: flat_load_ubyte v[[LO:[0-9]+]],
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
+define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i8_to_i64:
+; GCN-NOHSA: buffer_load_sbyte v[[LO:[0-9]+]],
+; GCN-HSA: flat_load_sbyte v[[LO:[0-9]+]],
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = sext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i64:
+define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = zext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i8_to_v1i64:
+define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = sext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64:
+define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = zext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64:
+define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = sext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64:
+define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = zext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64:
+define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = sext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64:
+define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = zext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64:
+define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = sext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64:
+define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = zext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64:
+define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = sext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i8_to_v32i64:
+define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = zext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i8_to_v32i64:
+define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = sext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64:
+; define void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = zext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64:
+; define void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = sext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}global_zextload_i8_to_i16:
+; GCN-NOHSA: buffer_load_ubyte v[[VAL:[0-9]+]],
+; GCN-NOHSA: buffer_store_short v[[VAL]]
+
+; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
+; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
+define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %a to i16
+ store i16 %ext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i8_to_i16:
+; GCN-NOHSA: buffer_load_sbyte v[[VAL:[0-9]+]],
+; GCN-HSA: flat_load_sbyte v[[VAL:[0-9]+]],
+
+; GCN-NOHSA: buffer_store_short v[[VAL]]
+; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
+define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = sext i8 %a to i16
+ store i16 %ext, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16:
+define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = zext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v1i8_to_v1i16:
+define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = sext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16:
+define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = zext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i16:
+define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = sext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16:
+define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = zext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i16:
+define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = sext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16:
+define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = zext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i16:
+define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = sext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16:
+define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = zext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i16:
+define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = sext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_v32i8_to_v32i16:
+define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = zext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_v32i8_to_v32i16:
+define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+ %ext = sext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16:
+; define void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = zext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16:
+; define void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = sext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-i1.ll b/test/CodeGen/AMDGPU/load-i1.ll
deleted file mode 100644
index 0ca49fde3e7b..000000000000
--- a/test/CodeGen/AMDGPU/load-i1.ll
+++ /dev/null
@@ -1,149 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}global_copy_i1_to_i1:
-; SI: buffer_load_ubyte
-; SI: v_and_b32_e32 v{{[0-9]+}}, 1
-; SI: buffer_store_byte
-; SI: s_endpgm
-
-; EG: VTX_READ_8
-; EG: AND_INT
-define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- store i1 %load, i1 addrspace(1)* %out, align 1
- ret void
-}
-
-; FUNC-LABEL: {{^}}local_copy_i1_to_i1:
-; SI: ds_read_u8
-; SI: v_and_b32_e32 v{{[0-9]+}}, 1
-; SI: ds_write_b8
-; SI: s_endpgm
-
-; EG: LDS_UBYTE_READ_RET
-; EG: AND_INT
-; EG: LDS_BYTE_WRITE
-define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind {
- %load = load i1, i1 addrspace(3)* %in
- store i1 %load, i1 addrspace(3)* %out, align 1
- ret void
-}
-
-; FUNC-LABEL: {{^}}constant_copy_i1_to_i1:
-; SI: buffer_load_ubyte
-; SI: v_and_b32_e32 v{{[0-9]+}}, 1
-; SI: buffer_store_byte
-; SI: s_endpgm
-
-; EG: VTX_READ_8
-; EG: AND_INT
-define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind {
- %load = load i1, i1 addrspace(2)* %in
- store i1 %load, i1 addrspace(1)* %out, align 1
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_sextload_i1_to_i32:
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32
-; SI: buffer_store_dword
-; SI: s_endpgm
-
-; EG: VTX_READ_8
-; EG: BFE_INT
-define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-
-define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %load to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_sextload_i1_to_i64:
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i64
- store i64 %ext, i64 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}global_zextload_i1_to_i64:
-; SI: buffer_load_ubyte
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %load to i64
- store i64 %ext, i64 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg:
-; SI: buffer_load_ubyte
-; SI: v_and_b32_e32
-; SI: buffer_store_byte
-; SI: s_endpgm
-define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
- store i1 %x, i1 addrspace(1)* %out, align 1
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg_zext_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
- %ext = zext i1 %x to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg_zext_i64:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
- %ext = zext i1 %x to i64
- store i64 %ext, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg_sext_i32:
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
-define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
- %ext = sext i1 %x to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}i1_arg_sext_i64:
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32
-; SI: v_ashrrev_i32
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
- %ext = sext i1 %x to i64
- store i64 %ext, i64 addrspace(1)* %out, align 8
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/load-input-fold.ll b/test/CodeGen/AMDGPU/load-input-fold.ll
index 1daf0e6527b9..b1899a45bf56 100644
--- a/test/CodeGen/AMDGPU/load-input-fold.ll
+++ b/test/CodeGen/AMDGPU/load-input-fold.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=cayman
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -88,14 +88,14 @@ main_body:
%83 = insertelement <4 x float> %82, float %75, i32 1
%84 = insertelement <4 x float> %83, float %77, i32 2
%85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
- %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85)
+ %86 = call float @llvm.r600.dot4(<4 x float> %81, <4 x float> %85)
%87 = insertelement <4 x float> undef, float %86, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %87, i32 2, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
; Function Attrs: readonly
declare float @fabs(float) #2
@@ -104,14 +104,13 @@ declare float @fabs(float) #2
declare float @llvm.AMDGPU.rsq(float) #1
; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #1
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
; Function Attrs: nounwind readonly
declare float @llvm.pow.f32(float, float) #3
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { readnone }
attributes #2 = { readonly }
attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/load-local-f32.ll b/test/CodeGen/AMDGPU/load-local-f32.ll
new file mode 100644
index 000000000000..77b5e3cf3aed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}load_f32_local:
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load float, float addrspace(3)* %in
+ store float %tmp0, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2f32_local:
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in
+ store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: should this do a read2_b64?
+; FUNC-LABEL: {{^}}local_load_v3f32:
+; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
+; GCN: s_waitcnt
+; GCN-DAG: ds_write_b64
+; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8{{$}}
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in
+ store <3 x float> %tmp0, <3 x float> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4f32:
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in
+ store <4 x float> %tmp0, <4 x float> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8f32:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in
+ store <8 x float> %tmp0, <8 x float> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16f32:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
+entry:
+ %tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in
+ store <16 x float> %tmp0, <16 x float> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-f64.ll b/test/CodeGen/AMDGPU/load-local-f64.ll
new file mode 100644
index 000000000000..27d39b7e9d7d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-f64.ll
@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}local_load_f64:
+; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
+; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
+ %ld = load double, double addrspace(3)* %in
+ store double %ld, double addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2f64:
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <2 x double>, <2 x double> addrspace(3)* %in
+ store <2 x double> %ld, <2 x double> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3f64:
+; GCN-DAG: ds_read2_b64
+; GCN-DAG: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x double>, <3 x double> addrspace(3)* %in
+ store <3 x double> %ld, <3 x double> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4f64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <4 x double>, <4 x double> addrspace(3)* %in
+ store <4 x double> %ld, <4 x double> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8f64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <8 x double>, <8 x double> addrspace(3)* %in
+ store <8 x double> %ld, <8 x double> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16f64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <16 x double>, <16 x double> addrspace(3)* %in
+ store <16 x double> %ld, <16 x double> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i1.ll b/test/CodeGen/AMDGPU/load-local-i1.ll
new file mode 100644
index 000000000000..2eed9917b5e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i1.ll
@@ -0,0 +1,371 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}local_load_i1:
+; GCN: ds_read_u8
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 1
+; GCN: ds_write_b8
+
+; EG: LDS_UBYTE_READ_RET
+; EG: AND_INT
+; EG: LDS_BYTE_WRITE
+define void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %load = load i1, i1 addrspace(3)* %in
+ store i1 %load, i1 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i1:
+define void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ store <2 x i1> %load, <2 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i1:
+define void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ store <3 x i1> %load, <3 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i1:
+define void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ store <4 x i1> %load, <4 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i1:
+define void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ store <8 x i1> %load, <8 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i1:
+define void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ store <16 x i1> %load, <16 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v32i1:
+define void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ store <32 x i1> %load, <32 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v64i1:
+define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ store <64 x i1> %load, <64 x i1> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i1_to_i32:
+; GCN: ds_read_u8
+; GCN: ds_write_b32
+define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %a = load i1, i1 addrspace(3)* %in
+ %ext = zext i1 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i1_to_i32:
+; GCN: ds_read_u8
+; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
+; GCN: ds_write_b32
+
+; EG: LDS_UBYTE_READ_RET
+; EG: BFE_INT
+define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %a = load i1, i1 addrspace(3)* %in
+ %ext = sext i1 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
+define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+ %ext = zext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
+define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+ %ext = sext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
+define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ %ext = zext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
+define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ %ext = sext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
+define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ %ext = zext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
+define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ %ext = sext <3 x i1> %load to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
+define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ %ext = zext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
+define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ %ext = sext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
+define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ %ext = zext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
+define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ %ext = sext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
+define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ %ext = zext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
+define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ %ext = sext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
+define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ %ext = zext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
+define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ %ext = sext <32 x i1> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
+define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ %ext = zext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
+define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ %ext = sext <64 x i1> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i1_to_i64:
+; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; GCN: ds_write_b64
+define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %a = load i1, i1 addrspace(3)* %in
+ %ext = zext i1 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i1_to_i64:
+; GCN: ds_read_u8 [[LOAD:v[0-9]+]],
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
+; GCN: ds_write_b64
+define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+ %a = load i1, i1 addrspace(3)* %in
+ %ext = sext i1 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
+define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+ %ext = zext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
+define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+ %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
+ %ext = sext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
+define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ %ext = zext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
+define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+ %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
+ %ext = sext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
+define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ %ext = zext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
+define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+ %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
+ %ext = sext <3 x i1> %load to <3 x i64>
+ store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
+define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ %ext = zext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
+define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+ %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
+ %ext = sext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
+define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ %ext = zext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
+define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+ %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
+ %ext = sext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
+define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ %ext = zext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
+define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+ %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
+ %ext = sext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
+define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ %ext = zext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
+define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+ %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
+ %ext = sext <32 x i1> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
+define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ %ext = zext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
+define void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+ %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
+ %ext = sext <64 x i1> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll
new file mode 100644
index 000000000000..d3c0af469dd2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -0,0 +1,454 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}local_load_i16:
+; GCN: ds_read_u16 v{{[0-9]+}}
+
+; EG: LDS_USHORT_READ_RET
+define void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
+entry:
+ %ld = load i16, i16 addrspace(3)* %in
+ store i16 %ld, i16 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i16:
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ store <2 x i16> %ld, <2 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i16:
+; GCN: ds_read_b64
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_write_b16
+
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_READ_RET
+define void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
+ store <3 x i16> %ld, <3 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i16:
+; GCN: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ store <4 x i16> %ld, <4 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i16:
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}}
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ store <8 x i16> %ld, <8 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i16:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}}
+
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ store <16 x i16> %ld, <16 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
+; GCN: ds_read_u16
+; GCN: ds_write_b32
+
+; EG: LDS_USHORT_READ_RET
+define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+ %a = load i16, i16 addrspace(3)* %in
+ %ext = zext i16 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_i16
+
+; EG: LDS_USHORT_READ_RET
+; EG: BFE_INT
+define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+ %a = load i16, i16 addrspace(3)* %in
+ %ext = sext i16 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
+; GCN: ds_read_u16
+define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+ %ext = zext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
+; GCN: ds_read_i16
+define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+ %ext = sext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %ext = zext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %ext = sext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v3i16_to_v3i32:
+; GCN: ds_read_b64
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_write_b64
+define void @local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
+ %ext = zext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v3i16_to_v3i32:
+; GCN: ds_read_b64
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_write_b64
+define void @local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+entry:
+ %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
+ %ext = sext <3 x i16> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b64
+
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+define void @local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %ext = zext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b64
+
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %ext = sext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ %ext = zext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ %ext = sext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FIXME: Should have 2 ds_read_b64
+; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
+
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
+define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ %ext = zext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:1{{$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
+define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ %ext = sext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+ %ext = zext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+ %ext = sext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FIXME: Missed read2
+; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
+define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
+define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+ %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+ %ext = sext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
+; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+
+; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
+define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+ %a = load i16, i16 addrspace(3)* %in
+ %ext = zext i16 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
+; GCN: ds_read_i16 v[[LO:[0-9]+]],
+; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
+define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+ %a = load i16, i16 addrspace(3)* %in
+ %ext = sext i16 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
+define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+ %ext = zext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
+define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+ %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
+ %ext = sext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
+define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %ext = zext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
+define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+ %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %ext = sext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
+define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %ext = zext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
+define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+ %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %ext = sext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
+define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ %ext = zext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
+define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+ %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
+ %ext = sext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
+define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ %ext = zext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
+define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+ %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
+ %ext = sext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
+define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+ %ext = zext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
+define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+ %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
+ %ext = sext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
+; define void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+; %ext = zext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+; ret void
+; }
+
+; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
+; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
+; %ext = sext <64 x i16> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i32.ll b/test/CodeGen/AMDGPU/load-local-i32.ll
new file mode 100644
index 000000000000..d68a8518e2ed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -0,0 +1,182 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}local_load_i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0, -1
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+entry:
+ %ld = load i32, i32 addrspace(3)* %in
+ store i32 %ld, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i32:
+; GCN: ds_read_b64
+define void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
+ store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i32:
+; GCN-DAG: ds_read_b64
+; GCN-DAG: ds_read_b32
+define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
+ store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i32:
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}}
+
+define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
+ store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1{{$}}
+define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
+ store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i32:
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
+ store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
+define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+ %ld = load i32, i32 addrspace(3)* %in
+ %ext = zext i32 %ld to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
+define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+ %ld = load i32, i32 addrspace(3)* %in
+ %ext = sext i32 %ld to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
+define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
+ %ext = zext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
+define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+ %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
+ %ext = sext <1 x i32> %ld to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
+define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
+ %ext = zext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
+define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+ %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
+ %ext = sext <2 x i32> %ld to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
+define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
+ %ext = zext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
+define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+ %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
+ %ext = sext <4 x i32> %ld to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
+define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
+ %ext = zext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
+define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+ %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
+ %ext = sext <8 x i32> %ld to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
+define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
+ %ext = sext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64:
+define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+ %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
+ %ext = zext <16 x i32> %ld to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
+define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
+ %ext = sext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
+define void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+ %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
+ %ext = zext <32 x i32> %ld to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i64.ll b/test/CodeGen/AMDGPU/load-local-i64.ll
new file mode 100644
index 000000000000..180807df7b9a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i64.ll
@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}local_load_i64:
+; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}}
+; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]]
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
+ %ld = load i64, i64 addrspace(3)* %in
+ store i64 %ld, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i64:
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in
+ store <2 x i64> %ld, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i64:
+; GCN-DAG: ds_read2_b64
+; GCN-DAG: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in
+ store <3 x i64> %ld, <3 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <4 x i64>, <4 x i64> addrspace(3)* %in
+ store <4 x i64> %ld, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <8 x i64>, <8 x i64> addrspace(3)* %in
+ store <8 x i64> %ld, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i64:
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+; GCN: ds_read2_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <16 x i64>, <16 x i64> addrspace(3)* %in
+ store <16 x i64> %ld, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll
new file mode 100644
index 000000000000..be865b078d74
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -0,0 +1,556 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}local_load_i8:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_u8
+
+; EG: LDS_UBYTE_READ_RET
+define void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+entry:
+ %ld = load i8, i8 addrspace(3)* %in
+ store i8 %ld, i8 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v2i8:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_u16
+
+; EG: LDS_USHORT_READ_RET
+define void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v3i8:
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
+ store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v4i8:
+; GCN: ds_read_b32
+
+; EG: LDS_READ_RET
+define void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v8i8:
+; GCN: ds_read_b64
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_load_v16i8:
+; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset0:1{{$}}
+
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+; EG: LDS_READ_RET
+define void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_u8
+
+; EG: LDS_UBYTE_READ_RET
+define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %a to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_i8
+
+; EG: LDS_UBYTE_READ_RET
+; EG: BFE_INT
+define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %ld = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %ld to i32
+ store i32 %ext, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
+define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = zext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32:
+define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = sext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
+; GCN: ds_read_u16
+
+; EG: LDS_UBYTE_READ_RET
+; EG: LDS_UBYTE_READ_RET
+define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = zext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_u16
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = sext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
+; GCN: ds_read_b32
+
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
+; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
+define void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
+ %ext = zext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; GCN-DAG: v_bfe_i32
+; GCN-DAG: v_bfe_i32
+; GCN-DAG: v_bfe_i32
+; GCN-DAG: v_bfe_i32
+
+; GCN-DAG: ds_write_b64
+; GCN-DAG: ds_write_b32
+
+define void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+entry:
+ %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
+ %ext = sext <3 x i8> %ld to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG: LDS_UBYTE_READ_RET
+; EG: LDS_UBYTE_READ_RET
+; EG: LDS_UBYTE_READ_RET
+; EG: LDS_UBYTE_READ_RET
+define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = zext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
+; GCN-NOT: s_wqm_b64
+; GCN: s_mov_b32 m0
+; GCN: ds_read_b32
+
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: LDS_UBYTE_READ_RET
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = sext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32:
+define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = zext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32:
+define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = sext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32:
+define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = zext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32:
+define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = sext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32:
+define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = zext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32:
+define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = sext <32 x i8> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32:
+define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+ %ext = zext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32:
+define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+ %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+ %ext = sext <64 x i8> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_i8_to_i64:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]],
+; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
+define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i8_to_i64:
+; GCN: ds_read_i8 v[[LO:[0-9]+]],
+; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+
+; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %a to i64
+ store i64 %ext, i64 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64:
+define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = zext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64:
+define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = sext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
+define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = zext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64:
+define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = sext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
+define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = zext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
+define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = sext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64:
+define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = zext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64:
+define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = sext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64:
+define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = zext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64:
+define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = sext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64:
+define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = zext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64:
+define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = sext <32 x i8> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
+; define void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; %ext = zext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
+; define void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; %ext = sext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}local_zextload_i8_to_i16:
+; GCN: ds_read_u8 v[[VAL:[0-9]+]],
+; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
+define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %a to i16
+ store i16 %ext, i16 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_i8_to_i16:
+; GCN: ds_read_i8 v[[VAL:[0-9]+]],
+; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
+define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+ %a = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %a to i16
+ store i16 %ext, i16 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16:
+define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = zext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16:
+define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+ %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
+ %ext = sext <1 x i8> %load to <1 x i16>
+ store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16:
+define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = zext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16:
+define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+ %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %ext = sext <2 x i8> %load to <2 x i16>
+ store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16:
+define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = zext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
+define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+ %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %ext = sext <4 x i8> %load to <4 x i16>
+ store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16:
+define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = zext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16:
+define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+ %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
+ %ext = sext <8 x i8> %load to <8 x i16>
+ store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16:
+define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = zext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16:
+define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+ %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
+ %ext = sext <16 x i8> %load to <16 x i16>
+ store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16:
+define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = zext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16:
+define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+ %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
+ %ext = sext <32 x i8> %load to <32 x i16>
+ store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
+; define void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; %ext = zext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
+; define void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
+; %ext = sext <64 x i8> %load to <64 x i16>
+; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
+; ret void
+; }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll
new file mode 100644
index 000000000000..b9f7018b8107
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}load_i24:
+; SI: {{flat|buffer}}_load_ubyte
+; SI: {{flat|buffer}}_load_ushort
+; SI: {{flat|buffer}}_store_dword
+define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
+ %1 = load i24, i24 addrspace(1)* %in
+ %2 = zext i24 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i25:
+; SI-NOHSA: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOHSA: buffer_store_dword [[VAL]]
+
+; CI-HSA: flat_load_dword [[VAL:v[0-9]+]]
+; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]]
+define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
+ %1 = load i25, i25 addrspace(1)* %in
+ %2 = zext i25 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
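; Illustrative sketch (not part of the commit above): how the i24 zext-load in
; load_i24 is expected to legalize, assuming the usual split into a 16-bit load
; of the low half plus an 8-bit load of the high byte, which are widened,
; shifted and OR'd back together; that is the shape the ubyte/ushort/store_dword
; checks look for. The function name and the explicit bitcast/GEP sequence are
; hypothetical, written out only to make the expansion visible.
define void @load_i24_expanded_sketch(i32 addrspace(1)* %out, i24 addrspace(1)* %in) nounwind {
  %base = bitcast i24 addrspace(1)* %in to i8 addrspace(1)*
  %lo.ptr = bitcast i8 addrspace(1)* %base to i16 addrspace(1)*
  %hi.ptr = getelementptr i8, i8 addrspace(1)* %base, i32 2
  %lo = load i16, i16 addrspace(1)* %lo.ptr        ; {{flat|buffer}}_load_ushort
  %hi = load i8, i8 addrspace(1)* %hi.ptr          ; {{flat|buffer}}_load_ubyte
  %lo32 = zext i16 %lo to i32
  %hi32 = zext i8 %hi to i32
  %hi.shift = shl i32 %hi32, 16
  %val = or i32 %lo32, %hi.shift                   ; reassemble the 24-bit value
  store i32 %val, i32 addrspace(1)* %out           ; {{flat|buffer}}_store_dword
  ret void
}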
diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll
deleted file mode 100644
index 6486c6ab2ffc..000000000000
--- a/test/CodeGen/AMDGPU/load.ll
+++ /dev/null
@@ -1,737 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck --check-prefix=FUNC --check-prefix=CI-HSA --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
-
-;===------------------------------------------------------------------------===;
-; GLOBAL ADDRESS SPACE
-;===------------------------------------------------------------------------===;
-
-; Load an i8 value from the global address space.
-; FUNC-LABEL: {{^}}load_i8:
-; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-
-; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
-; CI-HSA: flat_load_ubyte
-define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
- %1 = load i8, i8 addrspace(1)* %in
- %2 = zext i8 %1 to i32
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i8_sext:
-; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; R600: 8
-; SI-NOHSA: buffer_load_sbyte
-; CI-HSA: flat_load_sbyte
-define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
-entry:
- %0 = load i8, i8 addrspace(1)* %in
- %1 = sext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i8:
-; R600: VTX_READ_8
-; R600: VTX_READ_8
-; SI-NOHSA: buffer_load_ubyte
-; SI-NOHSA: buffer_load_ubyte
-; CI-HSA: flat_load_ubyte
-; CI-HSA: flat_load_ubyte
-define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
-entry:
- %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %1 = zext <2 x i8> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i8_sext:
-; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
-; R600-DAG: 8
-; R600-DAG: 8
-
-; SI-NOHSA: buffer_load_sbyte
-; SI-NOHSA: buffer_load_sbyte
-; CI-HSA: flat_load_sbyte
-; CI-HSA: flat_load_sbyte
-define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
-entry:
- %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
- %1 = sext <2 x i8> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i8:
-; R600: VTX_READ_8
-; R600: VTX_READ_8
-; R600: VTX_READ_8
-; R600: VTX_READ_8
-; SI-NOHSA: buffer_load_ubyte
-; SI-NOHSA: buffer_load_ubyte
-; SI-NOHSA: buffer_load_ubyte
-; SI-NOHSA: buffer_load_ubyte
-; CI-HSA: flat_load_ubyte
-; CI-HSA: flat_load_ubyte
-; CI-HSA: flat_load_ubyte
-; CI-HSA: flat_load_ubyte
-define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
-entry:
- %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %1 = zext <4 x i8> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i8_sext:
-; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
-; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
-; R600-DAG: 8
-; R600-DAG: 8
-; R600-DAG: 8
-; R600-DAG: 8
-; SI-NOHSA: buffer_load_sbyte
-; SI-NOHSA: buffer_load_sbyte
-; SI-NOHSA: buffer_load_sbyte
-; SI-NOHSA: buffer_load_sbyte
-; CI-HSA: flat_load_sbyte
-; CI-HSA: flat_load_sbyte
-; CI-HSA: flat_load_sbyte
-; CI-HSA: flat_load_sbyte
-define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
-entry:
- %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
- %1 = sext <4 x i8> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; Load an i16 value from the global address space.
-; FUNC-LABEL: {{^}}load_i16:
-; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
-entry:
- %0 = load i16 , i16 addrspace(1)* %in
- %1 = zext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i16_sext:
-; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; R600: 16
-; SI-NOHSA: buffer_load_sshort
-; CI-HSA: flat_load_sshort
-define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
-entry:
- %0 = load i16, i16 addrspace(1)* %in
- %1 = sext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i16:
-; R600: VTX_READ_16
-; R600: VTX_READ_16
-; SI-NOHSA: buffer_load_ushort
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
-entry:
- %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %1 = zext <2 x i16> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i16_sext:
-; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
-; R600-DAG: 16
-; R600-DAG: 16
-; SI-NOHSA: buffer_load_sshort
-; SI-NOHSA: buffer_load_sshort
-; CI-HSA: flat_load_sshort
-; CI-HSA: flat_load_sshort
-define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
-entry:
- %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
- %1 = sext <2 x i16> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i16:
-; R600: VTX_READ_16
-; R600: VTX_READ_16
-; R600: VTX_READ_16
-; R600: VTX_READ_16
-; SI-NOHSA: buffer_load_ushort
-; SI-NOHSA: buffer_load_ushort
-; SI-NOHSA: buffer_load_ushort
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-; CI-HSA: flat_load_ushort
-; CI-HSA: flat_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
-entry:
- %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %1 = zext <4 x i16> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i16_sext:
-; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
-; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
-; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
-; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
-; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
-; R600-DAG: 16
-; R600-DAG: 16
-; R600-DAG: 16
-; R600-DAG: 16
-; SI-NOHSA: buffer_load_sshort
-; SI-NOHSA: buffer_load_sshort
-; SI-NOHSA: buffer_load_sshort
-; SI-NOHSA: buffer_load_sshort
-; CI-HSA: flat_load_sshort
-; CI-HSA: flat_load_sshort
-; CI-HSA: flat_load_sshort
-; CI-HSA: flat_load_sshort
-define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
-entry:
- %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
- %1 = sext <4 x i16> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; load an i32 value from the global address space.
-; FUNC-LABEL: {{^}}load_i32:
-; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-
-; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
-; CI-HSA: flat_load_dword
-define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
- %0 = load i32, i32 addrspace(1)* %in
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; load a f32 value from the global address space.
-; FUNC-LABEL: {{^}}load_f32:
-; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-
-; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
-; CI-HSA: flat_load_dword
-define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-entry:
- %0 = load float, float addrspace(1)* %in
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-; load a v2f32 value from the global address space
-; FUNC-LABEL: {{^}}load_v2f32:
-; R600: MEM_RAT
-; R600: VTX_READ_64
-; SI-NOHSA: buffer_load_dwordx2
-; CI-HSA: flat_load_dwordx2
-define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
-entry:
- %0 = load <2 x float>, <2 x float> addrspace(1)* %in
- store <2 x float> %0, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i64:
-; R600: VTX_READ_64
-; SI-NOHSA: buffer_load_dwordx2
-; CI-HSA: flat_load_dwordx2
-define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-entry:
- %0 = load i64, i64 addrspace(1)* %in
- store i64 %0, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i64_sext:
-; R600: MEM_RAT
-; R600: MEM_RAT
-; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
-; R600: 31
-; SI-NOHSA: buffer_load_dword
-; CI-HSA: flat_load_dword
-
-define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
- %0 = load i32, i32 addrspace(1)* %in
- %1 = sext i32 %0 to i64
- store i64 %1, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i64_zext:
-; R600: MEM_RAT
-; R600: MEM_RAT
-define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
- %0 = load i32, i32 addrspace(1)* %in
- %1 = zext i32 %0 to i64
- store i64 %1, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v8i32:
-; R600: VTX_READ_128
-; R600: VTX_READ_128
-
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
-entry:
- %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
- store <8 x i32> %0, <8 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v16i32:
-; R600: VTX_READ_128
-; R600: VTX_READ_128
-; R600: VTX_READ_128
-; R600: VTX_READ_128
-
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; SI-NOHSA: buffer_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-; CI-HSA: flat_load_dwordx4
-define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
-entry:
- %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in
- store <16 x i32> %0, <16 x i32> addrspace(1)* %out
- ret void
-}
-
-;===------------------------------------------------------------------------===;
-; CONSTANT ADDRESS SPACE
-;===------------------------------------------------------------------------===;
-
-; Load a sign-extended i8 value
-; FUNC-LABEL: {{^}}load_const_i8_sext:
-; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; R600: 8
-; SI-NOHSA: buffer_load_sbyte v{{[0-9]+}},
-; CI-HSA: flat_load_sbyte v{{[0-9]+}},
-define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
-entry:
- %0 = load i8, i8 addrspace(2)* %in
- %1 = sext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an aligned i8 value
-; FUNC-LABEL: {{^}}load_const_i8_aligned:
-; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
-; CI-HSA: flat_load_ubyte v{{[0-9]+}},
-define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
-entry:
- %0 = load i8, i8 addrspace(2)* %in
- %1 = zext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an un-aligned i8 value
-; FUNC-LABEL: {{^}}load_const_i8_unaligned:
-; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
-; CI-HSA: flat_load_ubyte v{{[0-9]+}},
-define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
-entry:
- %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1
- %1 = load i8, i8 addrspace(2)* %0
- %2 = zext i8 %1 to i32
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; Load a sign-extended i16 value
-; FUNC-LABEL: {{^}}load_const_i16_sext:
-; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
-; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; R600: 16
-; SI-NOHSA: buffer_load_sshort
-; CI-HSA: flat_load_sshort
-define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
-entry:
- %0 = load i16, i16 addrspace(2)* %in
- %1 = sext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an aligned i16 value
-; FUNC-LABEL: {{^}}load_const_i16_aligned:
-; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
-entry:
- %0 = load i16, i16 addrspace(2)* %in
- %1 = zext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an un-aligned i16 value
-; FUNC-LABEL: {{^}}load_const_i16_unaligned:
-; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-NOHSA: buffer_load_ushort
-; CI-HSA: flat_load_ushort
-define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
-entry:
- %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1
- %1 = load i16, i16 addrspace(2)* %0
- %2 = zext i16 %1 to i32
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; Load an i32 value from the constant address space.
-; FUNC-LABEL: {{^}}load_const_addrspace_i32:
-; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-
-; SI: s_load_dword s{{[0-9]+}}
-define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
-entry:
- %0 = load i32, i32 addrspace(2)* %in
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; Load a f32 value from the constant address space.
-; FUNC-LABEL: {{^}}load_const_addrspace_f32:
-; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-
-; SI: s_load_dword s{{[0-9]+}}
-define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
- %1 = load float, float addrspace(2)* %in
- store float %1, float addrspace(1)* %out
- ret void
-}
-
-;===------------------------------------------------------------------------===;
-; LOCAL ADDRESS SPACE
-;===------------------------------------------------------------------------===;
-
-; Load an i8 value from the local address space.
-; FUNC-LABEL: {{^}}load_i8_local:
-; R600: LDS_UBYTE_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u8
-define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
- %1 = load i8, i8 addrspace(3)* %in
- %2 = zext i8 %1 to i32
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i8_sext_local:
-; R600: LDS_UBYTE_READ_RET
-; R600: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i8
-define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
-entry:
- %0 = load i8, i8 addrspace(3)* %in
- %1 = sext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i8_local:
-; R600: LDS_UBYTE_READ_RET
-; R600: LDS_UBYTE_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u8
-; SI: ds_read_u8
-define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
-entry:
- %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in
- %1 = zext <2 x i8> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i8_sext_local:
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i8
-; SI: ds_read_i8
-define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
-entry:
- %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in
- %1 = sext <2 x i8> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i8_local:
-; R600: LDS_UBYTE_READ_RET
-; R600: LDS_UBYTE_READ_RET
-; R600: LDS_UBYTE_READ_RET
-; R600: LDS_UBYTE_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
-entry:
- %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in
- %1 = zext <4 x i8> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i8_sext_local:
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: LDS_UBYTE_READ_RET
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i8
-; SI: ds_read_i8
-; SI: ds_read_i8
-; SI: ds_read_i8
-define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
-entry:
- %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in
- %1 = sext <4 x i8> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; Load an i16 value from the local address space.
-; FUNC-LABEL: {{^}}load_i16_local:
-; R600: LDS_USHORT_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u16
-define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
-entry:
- %0 = load i16 , i16 addrspace(3)* %in
- %1 = zext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_i16_sext_local:
-; R600: LDS_USHORT_READ_RET
-; R600: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i16
-define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
-entry:
- %0 = load i16, i16 addrspace(3)* %in
- %1 = sext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i16_local:
-; R600: LDS_USHORT_READ_RET
-; R600: LDS_USHORT_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u16
-; SI: ds_read_u16
-define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
-entry:
- %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in
- %1 = zext <2 x i16> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v2i16_sext_local:
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i16
-; SI: ds_read_i16
-define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
-entry:
- %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in
- %1 = sext <2 x i16> %0 to <2 x i32>
- store <2 x i32> %1, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i16_local:
-; R600: LDS_USHORT_READ_RET
-; R600: LDS_USHORT_READ_RET
-; R600: LDS_USHORT_READ_RET
-; R600: LDS_USHORT_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-; SI: ds_read_u16
-define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
-entry:
- %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in
- %1 = zext <4 x i16> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}load_v4i16_sext_local:
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: LDS_USHORT_READ_RET
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; R600-DAG: BFE_INT
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_i16
-; SI: ds_read_i16
-; SI: ds_read_i16
-; SI: ds_read_i16
-define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
-entry:
- %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in
- %1 = sext <4 x i16> %0 to <4 x i32>
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; load an i32 value from the local address space.
-; FUNC-LABEL: {{^}}load_i32_local:
-; R600: LDS_READ_RET
-; SI-NOT: s_wqm_b64
-; SI: s_mov_b32 m0
-; SI: ds_read_b32
-define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
-entry:
- %0 = load i32, i32 addrspace(3)* %in
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; load a f32 value from the local address space.
-; FUNC-LABEL: {{^}}load_f32_local:
-; R600: LDS_READ_RET
-; SI: s_mov_b32 m0
-; SI: ds_read_b32
-define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
-entry:
- %0 = load float, float addrspace(3)* %in
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-; load a v2f32 value from the local address space
-; FUNC-LABEL: {{^}}load_v2f32_local:
-; R600: LDS_READ_RET
-; R600: LDS_READ_RET
-; SI: s_mov_b32 m0
-; SI: ds_read_b64
-define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
-entry:
- %0 = load <2 x float>, <2 x float> addrspace(3)* %in
- store <2 x float> %0, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; Test loading a i32 and v2i32 value from the same base pointer.
-; FUNC-LABEL: {{^}}load_i32_v2i32_local:
-; R600: LDS_READ_RET
-; R600: LDS_READ_RET
-; R600: LDS_READ_RET
-; SI-DAG: ds_read_b32
-; SI-DAG: ds_read2_b32
-define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
- %scalar = load i32, i32 addrspace(3)* %in
- %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
- %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
- %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4
- %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
- %vec = add <2 x i32> %vec0, %vec1
- store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
-; On SI we need to make sure that the base offset is a register and not
-; an immediate.
-; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
-; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
-; SI: ds_read_b32 v0, v[[ZERO]] offset:4
-; R600: LDS_READ_RET
-define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
-entry:
- %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
- %tmp1 = load i32, i32 addrspace(3)* %tmp0
- %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
- store i32 %tmp1, i32 addrspace(1)* %tmp2
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/load.vec.ll b/test/CodeGen/AMDGPU/load.vec.ll
deleted file mode 100644
index 02f883cd8e9c..000000000000
--- a/test/CodeGen/AMDGPU/load.vec.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
-
-; load a v2i32 value from the global address space.
-; EG: {{^}}load_v2i32:
-; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
-; SI: {{^}}load_v2i32:
-; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
-define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
- %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
- store <2 x i32> %a, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; load a v4i32 value from the global address space.
-; EG: {{^}}load_v4i32:
-; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
-; SI: {{^}}load_v4i32:
-; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}]
-define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
- %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
- store <4 x i32> %a, <4 x i32> addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/load64.ll b/test/CodeGen/AMDGPU/load64.ll
deleted file mode 100644
index 74beabdc0076..000000000000
--- a/test/CodeGen/AMDGPU/load64.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-; load a f64 value from the global address space.
-; CHECK-LABEL: {{^}}load_f64:
-; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
-; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
-define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
- %1 = load double, double addrspace(1)* %in
- store double %1, double addrspace(1)* %out
- ret void
-}
-
-; CHECK-LABEL: {{^}}load_i64:
-; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
-; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
-define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
- %tmp = load i64, i64 addrspace(1)* %in
- store i64 %tmp, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; Load a f64 value from the constant address space.
-; CHECK-LABEL: {{^}}load_const_addrspace_f64:
-; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
-; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
-define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
- %1 = load double, double addrspace(2)* %in
- store double %1, double addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll
index 33f3159d13eb..f63d6e08ef73 100644
--- a/test/CodeGen/AMDGPU/local-64.ll
+++ b/test/CodeGen/AMDGPU/local-64.ll
@@ -122,8 +122,7 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_v2i64_store:
; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
+; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:15 offset1:14
; BOTH: s_endpgm
define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
@@ -133,8 +132,7 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1
; BOTH: s_endpgm
define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
@@ -143,10 +141,8 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_v4i64_store:
; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:31 offset1:30
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:29 offset1:28
; BOTH: s_endpgm
define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
%gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
@@ -156,10 +152,8 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:3 offset1:2
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1
; BOTH: s_endpgm
define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
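; Worked note on the ds_write2_b64 checks above (illustrative, not part of the
; commit): assuming the usual DS offset encoding, offset0/offset1 are expressed
; in units of the element size, i.e. byte_offset = offsetN * 8 for the *_b64
; forms. The former byte offsets 112 and 120 therefore become offset1:14
; (112 / 8) and offset0:15 (120 / 8), byte offsets 224/232/240/248 become
; 28/29/30/31, and a zero offset field (byte offset 0) is simply not printed,
; which is why the 0-offset variants check only "offset0:1".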
diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll
index 2aaf977ab903..ce82ff5475bc 100644
--- a/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
; EG: LDS_WRXCHG_RET *
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
@@ -31,8 +31,8 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; XXX - Is it really necessary to load 4 into VGPR?
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
@@ -68,35 +68,35 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32:
; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_offset:
; EG: LDS_ADD_RET *
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset:
+; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_bad_si_offset:
; EG: LDS_ADD_RET *
-; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -126,23 +126,23 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32:
; EG: LDS_SUB_RET *
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32_offset:
; EG: LDS_SUB_RET *
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
store i32 %result, i32 addrspace(1)* %out, align 4
@@ -324,7 +324,6 @@ define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
ret void
}
-; XXX - Is it really necessary to load 4 into VGPR?
; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
; GCN: s_load_dword [[SPTR:s[0-9]+]],
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
@@ -357,30 +356,30 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]]
+; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32:
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset:
-; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_bad_si_offset:
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -405,20 +404,20 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]]
+; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32:
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]]
; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
+; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16
; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
ret void
diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll
index 0ffa5e751b7d..34be6511a602 100644
--- a/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
-; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
; GCN: ds_wrxchg_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -10,7 +10,7 @@ define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -20,7 +20,7 @@ define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
; GCN: ds_add_rtn_u64
; GCN: s_endpgm
define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -29,11 +29,11 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; GCN-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
; GCN: buffer_store_dwordx2 [[RESULT]],
@@ -45,29 +45,29 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
-; GCN: ds_inc_rtn_u64 {{.*}} offset:32
+; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset:
+; GCN: ds_add_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64:
; GCN: ds_sub_rtn_u64
; GCN: s_endpgm
define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -76,7 +76,7 @@ define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
; GCN: ds_sub_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -86,29 +86,29 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
-define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
-; GCN: ds_dec_rtn_u64 {{.*}} offset:32
+; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset:
+; GCN: ds_sub_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_and_ret_i64:
; GCN: ds_and_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -117,7 +117,7 @@ define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
; GCN: ds_and_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -127,7 +127,7 @@ define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_or_ret_i64:
; GCN: ds_or_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -136,7 +136,7 @@ define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %pt
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
; GCN: ds_or_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -146,7 +146,7 @@ define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64:
; GCN: ds_xor_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -155,7 +155,7 @@ define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
; GCN: ds_xor_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -166,14 +166,14 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
}
; FIXME: There is no atomic nand instr
-; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this.
+; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this.
; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
; store i64 %result, i64 addrspace(1)* %out, align 8
; ret void
; }
-; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_min_ret_i64:
; GCN: ds_min_rtn_i64
; GCN: s_endpgm
define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -182,7 +182,7 @@ define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
; GCN: ds_min_rtn_i64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -192,7 +192,7 @@ define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_max_ret_i64:
; GCN: ds_max_rtn_i64
; GCN: s_endpgm
define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -201,7 +201,7 @@ define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
; GCN: ds_max_rtn_i64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -211,7 +211,7 @@ define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64:
; GCN: ds_min_rtn_u64
; GCN: s_endpgm
define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -220,7 +220,7 @@ define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
; GCN: ds_min_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -230,7 +230,7 @@ define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
+; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64:
; GCN: ds_max_rtn_u64
; GCN: s_endpgm
define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -239,7 +239,7 @@ define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
; GCN: ds_max_rtn_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
@@ -249,7 +249,7 @@ define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64:
; GCN: ds_wrxchg_rtn_b64
; GCN: s_endpgm
define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -257,7 +257,7 @@ define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -266,7 +266,7 @@ define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_add_noret_i64:
; GCN: ds_add_u64
; GCN: s_endpgm
define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -274,7 +274,7 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
@@ -288,26 +288,26 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
-; GCN: ds_inc_u64 {{.*}} offset:32
+; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset:
+; GCN: ds_add_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64:
; GCN: ds_sub_u64
; GCN: s_endpgm
define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -315,7 +315,7 @@ define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
; GCN: ds_sub_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -324,26 +324,26 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
-; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
-; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
-; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
%result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
-; GCN: ds_dec_u64 {{.*}} offset:32
+; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset:
+; GCN: ds_sub_u64 {{.*}} offset:32
; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_and_noret_i64:
; GCN: ds_and_b64
; GCN: s_endpgm
define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -351,7 +351,7 @@ define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
; GCN: ds_and_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -360,7 +360,7 @@ define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_or_noret_i64:
; GCN: ds_or_b64
; GCN: s_endpgm
define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -368,7 +368,7 @@ define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
; GCN: ds_or_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -377,7 +377,7 @@ define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64:
; GCN: ds_xor_b64
; GCN: s_endpgm
define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -385,7 +385,7 @@ define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
; GCN: ds_xor_b64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -395,13 +395,13 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
}
; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
-; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:
+; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:
; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
; ret void
; }
-; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_min_noret_i64:
; GCN: ds_min_i64
; GCN: s_endpgm
define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -409,7 +409,7 @@ define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
; GCN: ds_min_i64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -418,7 +418,7 @@ define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_max_noret_i64:
; GCN: ds_max_i64
; GCN: s_endpgm
define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -426,7 +426,7 @@ define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
; GCN: ds_max_i64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -435,7 +435,7 @@ define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64:
; GCN: ds_min_u64
; GCN: s_endpgm
define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -443,7 +443,7 @@ define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
; GCN: ds_min_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
@@ -452,7 +452,7 @@ define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64:
; GCN: ds_max_u64
; GCN: s_endpgm
define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
@@ -460,7 +460,7 @@ define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
ret void
}
-; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
; GCN: ds_max_u64 {{.*}} offset:32
; GCN: s_endpgm
define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
deleted file mode 100644
index 6b52b80ba082..000000000000
--- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s
-
-@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
-@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
-
-
-; Check that the LDS size emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 8
-; GCN: .long 47180
-; GCN-NEXT: .long 32900
-
-; EG: {{^}}local_memory_two_objects:
-
-; We would like to check the lds writes are using different
-; addresses, but due to variations in the scheduler, we can't do
-; this consistently on evergreen GPUs.
-; EG: LDS_WRITE
-; EG: LDS_WRITE
-; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
-; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
-
-; GROUP_BARRIER must be the last instruction in a clause
-; EG: GROUP_BARRIER
-; EG-NEXT: ALU clause
-
-; Make sure the lds reads are using different addresses, at different
-; constant offsets.
-; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
-; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
-; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
-
-define void @local_memory_two_objects(i32 addrspace(1)* %out) {
-entry:
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
- store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
- %mul = shl nsw i32 %x.i, 1
- %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
- store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
- %sub = sub nsw i32 3, %x.i
- call void @llvm.AMDGPU.barrier.local()
- %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
- %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
- store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
- %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
- %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
- %add = add nsw i32 %x.i, 4
- %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
- store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
- ret void
-}
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local()
-
-attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
new file mode 100644
index 000000000000..f6c0e3c62390
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+
+; Check that the LDS size is emitted correctly
+; SI: .long 47180
+; SI-NEXT: .long 65668
+; CI: .long 47180
+; CI-NEXT: .long 32900
+
+; GCN-LABEL: {{^}}local_memory:
+
+; GCN-NOT: s_wqm_b64
+; GCN: ds_write_b32
+
+; GCN: s_barrier
+
+; GCN: ds_read_b32 {{v[0-9]+}},
+define void @local_memory(i32 addrspace(1)* %out) #0 {
+entry:
+ %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+ store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+ %add = add nsw i32 %y.i, 1
+ %cmp = icmp eq i32 %add, 16
+ %.add = select i1 %cmp, i32 0, i32 %add
+ call void @llvm.amdgcn.s.barrier()
+ %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+ %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
+ store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 32900
+
+; GCN-LABEL: {{^}}local_memory_two_objects:
+; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
+; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
+; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
+
+; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
+
+; SI-DAG: ds_write_b32 [[ADDRW]],
+; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
+
+; GCN: s_barrier
+
+; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
+
+; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
+define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+entry:
+ %x.i = call i32 @llvm.amdgcn.workitem.id.x()
+ %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+ store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+ %mul = shl nsw i32 %x.i, 1
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+ store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+ %sub = sub nsw i32 3, %x.i
+ call void @llvm.amdgcn.s.barrier()
+ %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+ %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+ store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+ %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+ %add = add nsw i32 %x.i, 4
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+ store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind }
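
The CI checks above rely on the DS load/store combiner: ds_read2_b32 encodes two 8-bit offsets counted in dwords, so the byte offsets 12 and 28 from the common base appear as offset0:3 offset1:7. A reduced sketch of the same combining on a plain pointer argument; the function name is illustrative, and whether the combiner actually merges the loads depends on them staying adjacent, so treat the CHECK line as an expectation rather than a guarantee:

; GCN-LABEL: {{^}}example_read2:
; CI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:7
define void @example_read2(i32 addrspace(1)* %out, i32 addrspace(3)* %base) {
  %p0 = getelementptr i32, i32 addrspace(3)* %base, i32 3 ; byte offset 12 -> offset0:3
  %p1 = getelementptr i32, i32 addrspace(3)* %base, i32 7 ; byte offset 28 -> offset1:7
  %a = load i32, i32 addrspace(3)* %p0, align 4
  %b = load i32, i32 addrspace(3)* %p1, align 4
  %sum = add i32 %a, %b
  store i32 %sum, i32 addrspace(1)* %out
  ret void
}
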
diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll
index 9ffb59e70920..1a11332f865d 100644
--- a/test/CodeGen/AMDGPU/local-memory.ll
+++ b/test/CodeGen/AMDGPU/local-memory.ll
@@ -1,49 +1,44 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+@lds = addrspace(3) global [512 x i32] undef, align 4
-; Check that the LDS size emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 128
-; SI: .long 47180
-; SI-NEXT: .long 65668
-; CI: .long 47180
-; CI-NEXT: .long 32900
+; On SI we need to make sure that the base offset is a register and
+; not an immediate.
-; FUNC-LABEL: {{^}}local_memory:
+; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
+; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
-; EG: LDS_WRITE
-; SI-NOT: s_wqm_b64
-; SI: ds_write_b32
-
-; GROUP_BARRIER must be the last instruction in a clause
-; EG: GROUP_BARRIER
-; EG-NEXT: ALU clause
-; SI: s_barrier
-
-; EG: LDS_READ_RET
-; SI: ds_read_b32 {{v[0-9]+}},
-
-define void @local_memory(i32 addrspace(1)* %out) {
+; R600: LDS_READ_RET
+define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
entry:
- %y.i = call i32 @llvm.r600.read.tidig.x() #0
- %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
- store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
- %add = add nsw i32 %y.i, 1
- %cmp = icmp eq i32 %add, 16
- %.add = select i1 %cmp, i32 0, i32 %add
- call void @llvm.AMDGPU.barrier.local()
- %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
- %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
- store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+ %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
+ %tmp1 = load i32, i32 addrspace(3)* %tmp0
+ %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp1, i32 addrspace(1)* %tmp2
ret void
}
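
The check above encodes how DS addressing works for a fully-constant pointer: the base must live in a VGPR (here materialized as zero) while the remaining byte offset, 4 for element 1 of the i32 array, is folded into the instruction's unsigned 16-bit offset field. An illustrative variant for a different element of the same array; the function name and exact check pattern are assumptions, not part of the patch:

; GCN: v_mov_b32_e32 [[ZEROBASE:v[0-9]+]], 0
; GCN: ds_read_b32 v{{[0-9]+}}, [[ZEROBASE]] offset:400
define void @example_const_lds_offset(i32 addrspace(1)* %out) #0 {
  %ptr = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 100 ; byte offset 100 * 4 = 400
  %val = load i32, i32 addrspace(3)* %ptr
  store i32 %val, i32 addrspace(1)* %out
  ret void
}
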
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local()
+; Test loading an i32 and v2i32 value from the same base pointer.
+; FUNC-LABEL: {{^}}load_i32_v2i32_local:
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_read2_b32
+define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+ %scalar = load i32, i32 addrspace(3)* %in
+ %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
+ %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
+ %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4
+ %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
+ %vec = add <2 x i32> %vec0, %vec1
+ store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
+ ret void
+}
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/local-memory.r600.ll b/test/CodeGen/AMDGPU/local-memory.r600.ll
new file mode 100644
index 000000000000..9841b8882b39
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory.r600.ll
@@ -0,0 +1,87 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 128
+
+; FUNC-LABEL: {{^}}local_memory:
+
+; EG: LDS_WRITE
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; EG: LDS_READ_RET
+define void @local_memory(i32 addrspace(1)* %out) #0 {
+entry:
+ %y.i = call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+ store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+ %add = add nsw i32 %y.i, 1
+ %cmp = icmp eq i32 %add, 16
+ %.add = select i1 %cmp, i32 0, i32 %add
+ call void @llvm.r600.group.barrier()
+ %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+ %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
+ store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 32900
+
+; FUNC-LABEL: {{^}}local_memory_two_objects:
+
+; We would like to check the lds writes are using different
+; addresses, but due to variations in the scheduler, we can't do
+; this consistently on evergreen GPUs.
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; Make sure the lds reads are using different addresses, at different
+; constant offsets.
+; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+
+define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+entry:
+ %x.i = call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+ store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+ %mul = shl nsw i32 %x.i, 1
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+ store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+ %sub = sub nsw i32 3, %x.i
+ call void @llvm.r600.group.barrier()
+ %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+ %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+ store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+ %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+ %add = add nsw i32 %x.i, 4
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+ store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare void @llvm.r600.group.barrier() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
new file mode 100644
index 000000000000..6e6f289f5d6d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+
+; This used to fail due to a v_add_i32 instruction with an illegal immediate
+; operand that was created during Local Stack Slot Allocation. Test case derived
+; from https://bugs.freedesktop.org/show_bug.cgi?id=96602
+;
+; CHECK-LABEL: {{^}}main:
+; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
+; CHECK: v_mov_b32_e32 [[HI_CONST:v[0-9]+]], 0x200
+; CHECK: v_mov_b32_e32 [[LO_CONST:v[0-9]+]], 0
+; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, [[BYTES]], [[HI_CONST]]
+; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, [[BYTES]], [[LO_CONST]]
+; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+define amdgpu_ps float @main(i32 %idx) {
+main_body:
+ %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
+ %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
+ %r = fadd float %v1, %v2
+ ret float %r
+}
diff --git a/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
new file mode 100644
index 000000000000..e1fad13e0b51
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-unknown < %s | FileCheck %s
+
+; The mask below is implied by the range metadata on the call and can be eliminated
+; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range:
+; CHECK-NOT: v0
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
+define void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+ %and = and i32 %id, 1023
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
+; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
+; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+define void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+ %and = and i32 %id, 511
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1:
+; CHECK-NOT: v0
+; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
+; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+define void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+ %and = and i32 %id, 255
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { norecurse nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{i32 0, i32 1024}
+!1 = !{i32 0, i32 1023}
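
The three tests above boil down to a known-bits argument: !range gives a half-open interval, so an 'and' with 2^k - 1 is redundant exactly when the upper bound fits in k bits. With !0 = [0, 1024) the value is at most 1023, so the 1023 mask folds away but the 511 mask does not; with !1 = [0, 1023) the maximum 1022 still needs 10 bits, so even the 255 mask survives. A minimal sketch of the folding case, reusing the declaration and attribute groups already in this file; the !2 node, its range, and the function name are assumptions for illustration:

; With a range of [0, 256) the id is at most 255, so the mask below is a
; no-op and no v_and_b32 should be emitted for it.
; CHECK-LABEL: {{^}}example_range_fold:
; CHECK-NOT: v_and_b32
define void @example_range_fold(i32 addrspace(1)* %out) #0 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
  %masked = and i32 %id, 255
  store i32 %masked, i32 addrspace(1)* %out, align 4
  ret void
}

!2 = !{i32 0, i32 256}
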
diff --git a/test/CodeGen/AMDGPU/m0-spill.ll b/test/CodeGen/AMDGPU/m0-spill.ll
index 1dddc85f775d..2427c8de34f8 100644
--- a/test/CodeGen/AMDGPU/m0-spill.ll
+++ b/test/CodeGen/AMDGPU/m0-spill.ll
@@ -5,7 +5,7 @@
; CHECK-LABEL: {{^}}main:
; CHECK-NOT: v_readlane_b32 m0
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
main_body:
%4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
%cmp = fcmp ueq float 0.0, %4
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index c98f851f2b93..0e6281940c24 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -8,7 +8,7 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
@@ -32,15 +32,15 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%fma = fadd float %mul, %c
@@ -71,7 +71,7 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -79,17 +79,17 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%fma0 = fadd float %mul, %c
%fma1 = fadd float %mul, %d
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
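
A recurring change in this file (and in mad-sub.ll further down) is marking the test loads and stores volatile. Volatile accesses cannot be merged, reordered, or CSE'd, so each IR load keeps its own buffer_load and the captured registers ([[A]], [[B]], [[C]], [[D]]) come back in a predictable order, which is presumably why these tests add it. A reduced sketch of the idiom (illustrative name, no FileCheck lines):

; Without 'volatile' the two adjacent loads below could be widened into a
; single buffer_load_dwordx2 or scheduled in either order; with it, each
; load and the store stay as distinct memory operations.
define void @example_volatile_loads(float addrspace(1)* %out, float addrspace(1)* %in) {
  %gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %a = load volatile float, float addrspace(1)* %in
  %b = load volatile float, float addrspace(1)* %gep.1
  %mul = fmul float %a, %b
  store volatile float %mul, float addrspace(1)* %out
  ret void
}
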
@@ -108,15 +108,15 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%fma = fadd float %c, %mul
@@ -138,15 +138,15 @@ define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrsp
; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%fma = fsub float %mul, %c
@@ -175,7 +175,7 @@ define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -183,16 +183,16 @@ define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, fl
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%fma0 = fsub float %mul, %c
%fma1 = fsub float %mul, %d
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -210,15 +210,15 @@ define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, fl
; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%fma = fsub float %c, %mul
@@ -246,7 +246,7 @@ define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -254,16 +254,16 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%fma0 = fsub float %c, %mul
%fma1 = fsub float %d, %mul
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -282,15 +282,15 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl
; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%mul = fmul float %a, %b
%mul.neg = fsub float -0.0, %mul
@@ -320,7 +320,7 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -328,18 +328,18 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%mul.neg = fsub float -0.0, %mul
%fma0 = fsub float %mul.neg, %c
%fma1 = fsub float %mul.neg, %d
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -363,7 +363,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -371,18 +371,18 @@ define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %ou
%gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
- %d = load float, float addrspace(1)* %gep.3
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
+ %d = load volatile float, float addrspace(1)* %gep.3
%mul = fmul float %a, %b
%mul.neg = fsub float -0.0, %mul
%fma0 = fsub float %mul.neg, %c
%fma1 = fsub float %mul, %d
- store float %fma0, float addrspace(1)* %gep.out.0
- store float %fma1, float addrspace(1)* %gep.out.1
+ store volatile float %fma0, float addrspace(1)* %gep.out.0
+ store volatile float %fma1, float addrspace(1)* %gep.out.1
ret void
}
@@ -408,7 +408,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %ou
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -416,11 +416,11 @@ define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %o
%gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
- %z = load float, float addrspace(1)* %gep.2
- %u = load float, float addrspace(1)* %gep.3
- %v = load float, float addrspace(1)* %gep.4
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
+ %z = load volatile float, float addrspace(1)* %gep.2
+ %u = load volatile float, float addrspace(1)* %gep.3
+ %v = load volatile float, float addrspace(1)* %gep.4
%tmp0 = fmul float %u, %v
%tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
@@ -454,7 +454,7 @@ define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %o
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -462,11 +462,11 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
%gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
- %z = load float, float addrspace(1)* %gep.2
- %u = load float, float addrspace(1)* %gep.3
- %v = load float, float addrspace(1)* %gep.4
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
+ %z = load volatile float, float addrspace(1)* %gep.2
+ %u = load volatile float, float addrspace(1)* %gep.3
+ %v = load volatile float, float addrspace(1)* %gep.4
%tmp0 = fmul float %u, %v
%tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
@@ -491,8 +491,8 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
@@ -500,7 +500,7 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -508,11 +508,11 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o
%gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
- %z = load float, float addrspace(1)* %gep.2
- %u = load float, float addrspace(1)* %gep.3
- %v = load float, float addrspace(1)* %gep.4
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
+ %z = load volatile float, float addrspace(1)* %gep.2
+ %u = load volatile float, float addrspace(1)* %gep.3
+ %v = load volatile float, float addrspace(1)* %gep.4
%tmp0 = fmul float %u, %v
%tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
@@ -538,15 +538,15 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o
; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
@@ -554,11 +554,11 @@ define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %o
%gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %x = load float, float addrspace(1)* %gep.0
- %y = load float, float addrspace(1)* %gep.1
- %z = load float, float addrspace(1)* %gep.2
- %u = load float, float addrspace(1)* %gep.3
- %v = load float, float addrspace(1)* %gep.4
+ %x = load volatile float, float addrspace(1)* %gep.0
+ %y = load volatile float, float addrspace(1)* %gep.1
+ %z = load volatile float, float addrspace(1)* %gep.2
+ %u = load volatile float, float addrspace(1)* %gep.3
+ %v = load volatile float, float addrspace(1)* %gep.4
%tmp0 = fmul float %u, %v
%tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll
index 24ff23a4cfc1..7fcfe7f53f06 100644
--- a/test/CodeGen/AMDGPU/mad-sub.ll
+++ b/test/CodeGen/AMDGPU/mad-sub.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
; FUNC-LABEL: {{^}}mad_sub_f32:
@@ -10,7 +10,7 @@ declare float @llvm.fabs.f32(float) #0
; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
; SI: buffer_store_dword [[RESULT]]
define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -18,9 +18,9 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%mul = fmul float %a, %b
%sub = fsub float %mul, %c
store float %sub, float addrspace(1)* %outgep, align 4
@@ -34,7 +34,7 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs
; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; SI: buffer_store_dword [[RESULT]]
define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -42,9 +42,9 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%mul = fmul float %a, %b
%sub = fsub float %c, %mul
store float %sub, float addrspace(1)* %outgep, align 4
@@ -55,7 +55,7 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a
; SI: v_mul_f64
; SI: v_add_f64
define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -63,9 +63,9 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext
- %a = load double, double addrspace(1)* %gep0, align 8
- %b = load double, double addrspace(1)* %gep1, align 8
- %c = load double, double addrspace(1)* %gep2, align 8
+ %a = load volatile double, double addrspace(1)* %gep0, align 8
+ %b = load volatile double, double addrspace(1)* %gep1, align 8
+ %c = load volatile double, double addrspace(1)* %gep2, align 8
%mul = fmul double %a, %b
%sub = fsub double %mul, %c
store double %sub, double addrspace(1)* %outgep, align 8
@@ -79,7 +79,7 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add
; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
; SI: buffer_store_dword [[RESULT]]
define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -87,9 +87,9 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%c.abs = call float @llvm.fabs.f32(float %c) #0
%mul = fmul float %a, %b
%sub = fsub float %mul, %c.abs
@@ -104,7 +104,7 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float
; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
; SI: buffer_store_dword [[RESULT]]
define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -112,9 +112,9 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%c.abs = call float @llvm.fabs.f32(float %c) #0
%mul = fmul float %a, %b
%sub = fsub float %c.abs, %mul
@@ -125,7 +125,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
; FUNC-LABEL: {{^}}neg_neg_mad_f32:
; SI: v_mac_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -133,9 +133,9 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%nega = fsub float -0.000000e+00, %a
%negb = fsub float -0.000000e+00, %b
%mul = fmul float %nega, %negb
@@ -151,7 +151,7 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a
; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; SI: buffer_store_dword [[RESULT]]
define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
%add1 = add i64 %tid.ext, 1
@@ -159,9 +159,9 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
%add2 = add i64 %tid.ext, 2
%gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
%outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
- %a = load float, float addrspace(1)* %gep0, align 4
- %b = load float, float addrspace(1)* %gep1, align 4
- %c = load float, float addrspace(1)* %gep2, align 4
+ %a = load volatile float, float addrspace(1)* %gep0, align 4
+ %b = load volatile float, float addrspace(1)* %gep1, align 4
+ %c = load volatile float, float addrspace(1)* %gep2, align 4
%b.abs = call float @llvm.fabs.f32(float %b) #0
%mul = fmul float %a, %b.abs
%sub = fsub float %mul, %c
@@ -175,13 +175,13 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
; SI: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
; SI: buffer_store_dword [[R2]]
define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%add = fadd float %r1, %r1
%r3 = fsub float %r2, %add
@@ -196,13 +196,13 @@ define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in)
; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
; SI: buffer_store_dword [[RESULT]]
define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
+ %r1 = load volatile float, float addrspace(1)* %gep.0
+ %r2 = load volatile float, float addrspace(1)* %gep.1
%add = fadd float %r1, %r1
%r3 = fsub float %add, %r2
diff --git a/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
new file mode 100644
index 000000000000..9183ae0972dc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; If the workgroup id range is restricted, we should be able to use
+; mad24 for the usual indexing pattern.
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+; GCN-LABEL: {{^}}get_global_id_0:
+; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
+; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
+; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
+define void @get_global_id_0(i32 addrspace(1)* %out) #1 {
+ %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+ %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+ %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1
+ %workgroup.size.xy = load i32, i32 addrspace(2)* %gep, align 4, !invariant.load !0
+ %workgroup.size.x = and i32 %workgroup.size.xy, 65535
+
+ %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+ %workgroup.id.x = call i32 @llvm.amdgcn.workgroup.id.x(), !range !2
+
+ %mul = mul i32 %workgroup.id.x, %workgroup.size.x
+ %add = add i32 %mul, %workitem.id.x
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!0 = !{}
+!1 = !{i32 0, i32 1024}
+!2 = !{i32 0, i32 16777216}
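
Note (not part of the patch): the new mad24-get-global-id.ll test leans on the !range metadata. The workitem id is known to be below 1024 (2^10), the workgroup id below 16777216 (2^24), and the workgroup size read from the dispatch packet is masked to 16 bits, so both multiply operands provably fit in 24 bits and the backend is free to select v_mad_u32_u24. A minimal standalone sketch of the same shape follows; the function name @global_id_sketch is hypothetical and only illustrates the pattern, it is not part of the committed test.

; Sketch, assuming only that the range metadata is preserved:
define void @global_id_sketch(i32 addrspace(1)* %out, i32 %size.xy) {
  %wg.id = call i32 @llvm.amdgcn.workgroup.id.x(), !range !10
  %wi.id = call i32 @llvm.amdgcn.workitem.id.x(), !range !11
  %size.x = and i32 %size.xy, 65535            ; workgroup size is a 16-bit field
  %mul = mul i32 %wg.id, %size.x               ; both factors fit in 24 bits
  %gid = add i32 %mul, %wi.id                  ; flat global id
  store i32 %gid, i32 addrspace(1)* %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.workgroup.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

!10 = !{i32 0, i32 16777216}                   ; workgroup id < 2^24
!11 = !{i32 0, i32 1024}                       ; workitem id < 2^10

Compiled with llc -mtriple=amdgcn--amdhsa this should exercise the same multiply-add pattern as the test above, though the exact instruction selected is not asserted here.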
diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll
index 86d75a63ca40..f177608a62fc 100644
--- a/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/test/CodeGen/AMDGPU/mad_int24.ll
@@ -1,9 +1,7 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-
-declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
; FUNC-LABEL: {{^}}i32_mad24:
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
@@ -24,12 +22,3 @@ entry:
store i32 %3, i32 addrspace(1)* %out
ret void
}
-
-; FUNC-LABEL: @test_imul24
-; SI: v_mad_i32_i24
-define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
- %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone
- %add = add i32 %mul, %src2
- store i32 %add, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll
index 95fe34119596..72c6b2b26173 100644
--- a/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -1,7 +1,7 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; FUNC-LABEL: {{^}}u32_mad24:
; EG: MULADD_UINT24
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 2e90cf10a3b5..6ea1202ac500 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -3,7 +3,7 @@
; FIXME: Enable VI
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; GCN-LABEL: {{^}}madak_f32:
@@ -11,7 +11,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; GCN: buffer_load_dword [[VB:v[0-9]+]]
; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -38,7 +38,7 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
; GCN: s_endpgm
define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -47,17 +47,17 @@ define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
%out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %a = load float, float addrspace(1)* %in.gep.0, align 4
- %b = load float, float addrspace(1)* %in.gep.1, align 4
- %c = load float, float addrspace(1)* %in.gep.2, align 4
+ %a = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %in.gep.1, align 4
+ %c = load volatile float, float addrspace(1)* %in.gep.2, align 4
%mul0 = fmul float %a, %b
%mul1 = fmul float %a, %c
%madak0 = fadd float %mul0, 10.0
%madak1 = fadd float %mul1, 10.0
- store float %madak0, float addrspace(1)* %out.gep.0, align 4
- store float %madak1, float addrspace(1)* %out.gep.1, align 4
+ store volatile float %madak0, float addrspace(1)* %out.gep.0, align 4
+ store volatile float %madak1, float addrspace(1)* %out.gep.1, align 4
ret void
}
@@ -65,7 +65,7 @@ define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
; GCN: buffer_load_dword [[VA:v[0-9]+]]
; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -85,7 +85,7 @@ define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addr
; GCN: buffer_load_dword [[VB:v[0-9]+]]
; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -101,13 +101,13 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; We can't use an SGPR when forming madak
; GCN-LABEL: {{^}}s_v_madak_f32:
-; GCN: s_load_dword [[SB:s[0-9]+]]
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -126,7 +126,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN-NOT: v_madak_f32
; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -154,7 +154,7 @@ define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwin
; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
; GCN: s_endpgm
define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -176,7 +176,7 @@ define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
; GCN: s_endpgm
define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -191,3 +191,32 @@ define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float
store float %madak, float addrspace(1)* %out.gep, align 4
ret void
}
+
+; SIFoldOperands should not fold the SGPR copy into the instruction
+; because the implicit immediate already uses the constant bus.
+; GCN-LABEL: {{^}}madak_constant_bus_violation:
+; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
+; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
+; GCN: buffer_load_dword [[VGPR:v[0-9]+]]
+; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
+; GCN: buffer_store_dword [[MUL]]
+define void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
+bb:
+ %tmp = icmp eq i32 %arg1, 0
+ br i1 %tmp, label %bb3, label %bb4
+
+bb3:
+ store volatile float 0.0, float addrspace(1)* undef
+ br label %bb4
+
+bb4:
+ %vgpr = load volatile float, float addrspace(1)* undef
+ %tmp0 = fmul float %sgpr0, 0.5
+ %tmp1 = fadd float %tmp0, 42.0
+ %tmp2 = fmul float %tmp1, %vgpr
+ store volatile float %tmp2, float addrspace(1)* undef, align 4
+ ret void
+}
+
+attributes #0 = { nounwind}
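
Note (not part of the patch): madak_constant_bus_violation guards a constant-bus rule. On SI a VALU instruction may read at most one SGPR or literal through the constant bus; the 0x42280000 literal of v_madak_f32 already takes that slot, so SIFoldOperands must not also fold the scalar input in directly, and the value is first copied to a VGPR (the v_mov_b32 checked above). A stripped-down sketch of the offending shape; @constant_bus_sketch is a hypothetical name used only for illustration:

define void @constant_bus_sketch(float addrspace(1)* %out, float %s) {
  ; %s is expected to arrive in an SGPR. 0.5 is an inline constant and is free,
  ; but 42.0 becomes a 32-bit literal, so reading %s straight from the SGPR in
  ; the same v_madak would be a second constant-bus use.
  %mul = fmul float %s, 0.5
  %mad = fadd float %mul, 42.0
  store float %mad, float addrspace(1)* %out, align 4
  ret void
}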
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index f8e14e34af67..1adf82402b72 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -1,21 +1,25 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+ ; FIXME: None of these trigger madmk emission anymore. It is still
+ ; possible, but requires the correct registers to be used which is
+ ; hard to trigger.
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; GCN-LABEL: {{^}}madmk_f32:
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%mul = fmul float %a, 10.0
%madmk = fadd float %mul, %b
@@ -32,7 +36,7 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
; GCN: s_endpgm
define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -41,9 +45,9 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
%out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
- %a = load float, float addrspace(1)* %in.gep.0, align 4
- %b = load float, float addrspace(1)* %in.gep.1, align 4
- %c = load float, float addrspace(1)* %in.gep.2, align 4
+ %a = load volatile float, float addrspace(1)* %in.gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %in.gep.1, align 4
+ %c = load volatile float, float addrspace(1)* %in.gep.2, align 4
%mul0 = fmul float %a, 10.0
%mul1 = fmul float %a, 10.0
@@ -61,13 +65,13 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%mul = fmul float %a, 4.0
%madmk = fadd float %mul, %b
@@ -80,7 +84,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
; GCN: v_mac_f32_e32
; GCN: s_endpgm
define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%mul = fmul float %a, 10.0
@@ -94,7 +98,7 @@ define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b)
; GCN: v_mad_f32
; GCN: s_endpgm
define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%a = load float, float addrspace(1)* %gep.0, align 4
@@ -110,7 +114,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)*
; GCN: v_mac_f32_e32
; GCN: s_endpgm
define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%b = load float, float addrspace(1)* %gep.0, align 4
@@ -126,13 +130,13 @@ define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
@@ -147,13 +151,13 @@ define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
%b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
@@ -168,7 +172,7 @@ define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float
; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -182,7 +186,7 @@ define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float ad
; SI-LABEL: {{^}}kill_madmk_verifier_error:
; SI: s_xor_b64
-; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c
+; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}}
; SI: s_or_b64
define void @kill_madmk_verifier_error() nounwind {
bb:
@@ -193,7 +197,9 @@ bb1: ; preds = %bb2
bb2: ; preds = %bb6, %bb
%tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ]
- %tmp3 = fsub float undef, %tmp
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
+ %f_tid = bitcast i32 %tid to float
+ %tmp3 = fsub float %f_tid, %tmp
%tmp5 = fcmp oeq float %tmp3, 1.000000e+04
br i1 %tmp5, label %bb1, label %bb6
@@ -203,3 +209,7 @@ bb6: ; preds = %bb2
%tmp8 = fadd float %tmp7, undef
br label %bb2
}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+attributes #1 = { nounwind readnone }
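
Note (not part of the patch): kill_madmk_verifier_error now feeds the fsub from llvm.amdgcn.mbcnt.lo(-1, 0), which counts the lanes below the current one and therefore produces a per-lane (VGPR) value, keeping the VALU literal path exercised without relying on an undef operand. A minimal sketch of that idiom; @divergent_value_sketch is a hypothetical name:

define void @divergent_value_sketch(float addrspace(1)* %out) {
  ; mbcnt.lo with an all-ones mask yields a different value in each lane,
  ; i.e. a value that must live in a VGPR.
  %lane = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %f = bitcast i32 %lane to float
  store volatile float %f, float addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) nounwind readnone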
diff --git a/test/CodeGen/AMDGPU/max-literals.ll b/test/CodeGen/AMDGPU/max-literals.ll
index c357524b140f..3f80d5e41a3f 100644
--- a/test/CodeGen/AMDGPU/max-literals.ll
+++ b/test/CodeGen/AMDGPU/max-literals.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: {{^}}main:
; CHECK: ADD *
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -23,16 +23,16 @@ main_body:
%15 = insertelement <4 x float> %14, float %8, i32 3
%16 = insertelement <4 x float> %15, float %11, i32 3
- %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+ %17 = call float @llvm.r600.dot4(<4 x float> %15,<4 x float> %16)
%18 = insertelement <4 x float> undef, float %17, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %18, i32 0, i32 2)
ret void
}
; CHECK-LABEL: {{^}}main2:
; CHECK-NOT: ADD *
-define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_vs void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -52,16 +52,15 @@ main_body:
%15 = insertelement <4 x float> %14, float %8, i32 3
%16 = insertelement <4 x float> %15, float %11, i32 3
- %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+ %17 = call float @llvm.r600.dot4(<4 x float> %15,<4 x float> %16)
%18 = insertelement <4 x float> undef, float %17, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %18, i32 0, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll
index eeb915c10a96..5fa307be0fd5 100644
--- a/test/CodeGen/AMDGPU/max.ll
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -1,19 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
; SI: v_max_i32_e32
+
+; EG: MAX_INT
define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp sge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
@@ -22,21 +20,25 @@ define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: v_max_i32_e32
; SI: v_max_i32_e32
; SI: v_max_i32_e32
+
+; These could be merged into one
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %out, i32 %tid
- %a = load <4 x i32>, <4 x i32> addrspace(1)* %gep0, align 4
- %b = load <4 x i32>, <4 x i32> addrspace(1)* %gep1, align 4
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4
+ %b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4
%cmp = icmp sge <4 x i32> %a, %b
%val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
- store <4 x i32> %val, <4 x i32> addrspace(1)* %outgep, align 4
+ store <4 x i32> %val, <4 x i32> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: @s_test_imax_sge_i32
; SI: s_max_i32
+
+; EG: MAX_INT
define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp sge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -46,6 +48,8 @@ define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32:
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+
+; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp sge i32 %a, 9
%val = select i1 %cmp, i32 %a, i32 9
@@ -57,21 +61,21 @@ define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI: buffer_load_sbyte
; SI: buffer_load_sbyte
; SI: v_max_i32_e32
+
+; EG: MAX_INT
define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
- %a = load i8, i8 addrspace(1)* %gep0, align 1
- %b = load i8, i8 addrspace(1)* %gep1, align 1
+ %a = load i8, i8 addrspace(1)* %aptr, align 1
+ %b = load i8, i8 addrspace(1)* %bptr, align 1
%cmp = icmp sge i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
- store i8 %val, i8 addrspace(1)* %outgep, align 1
+ store i8 %val, i8 addrspace(1)* %out, align 1
ret void
}
; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32:
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+
+; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp sgt i32 %a, 9
%val = select i1 %cmp, i32 %a, i32 9
@@ -82,29 +86,33 @@ define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_v2i32:
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+
+; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
+; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
%cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9>
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9>
store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
ret void
}
+
; FUNC-LABEL: @v_test_imax_sgt_i32
; SI: v_max_i32_e32
+
+; EG: MAX_INT
define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp sgt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: @s_test_imax_sgt_i32
; SI: s_max_i32
+
+; EG: MAX_INT
define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp sgt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -114,21 +122,21 @@ define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: @v_test_umax_uge_i32
; SI: v_max_u32_e32
+
+; EG: MAX_UINT
define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp uge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: @s_test_umax_uge_i32
; SI: s_max_u32
+
+; EG: MAX_UINT
define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp uge i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -142,6 +150,11 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; SI: s_max_u32
; SI-NOT: s_max_u32
; SI: s_endpgm
+
+; EG: MAX_UINT
+; EG: MAX_UINT
+; EG: MAX_UINT
+; EG-NOT: MAX_UINT
define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
%cmp = icmp uge <3 x i32> %a, %b
%val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
@@ -153,36 +166,34 @@ define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: v_max_u32_e32
+
+; EG: MAX_UINT
define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
- %a = load i8, i8 addrspace(1)* %gep0, align 1
- %b = load i8, i8 addrspace(1)* %gep1, align 1
+ %a = load i8, i8 addrspace(1)* %aptr, align 1
+ %b = load i8, i8 addrspace(1)* %bptr, align 1
%cmp = icmp uge i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
- store i8 %val, i8 addrspace(1)* %outgep, align 1
+ store i8 %val, i8 addrspace(1)* %out, align 1
ret void
}
; FUNC-LABEL: @v_test_umax_ugt_i32
; SI: v_max_u32_e32
+
+; EG: MAX_UINT
define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ugt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}s_test_umax_ugt_i32:
; SI: s_max_u32
+
+; EG: MAX_UINT
define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp ugt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -191,8 +202,11 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
}
; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i32:
-; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
-; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
+; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
+; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
+
+; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
+; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
%cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
@@ -205,8 +219,10 @@ define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
-; SI-NEXT: buffer_store_dword [[VMAX]]
+; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI: buffer_store_dword [[VMAX]]
+
+; EG: MAX_UINT
define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
@@ -223,8 +239,10 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
-; SI-NEXT: buffer_store_dword [[VMAX]]
+; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
+; SI: buffer_store_dword [[VMAX]]
+
+; EG: MAX_INT
define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
@@ -242,9 +260,60 @@ define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16
; SI: s_sext_i32_i16
; SI: s_sext_i32_i16
; SI: s_max_i32
+
+; EG: MAX_INT
define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
%cmp = icmp sge i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, i16 addrspace(1)* %out
ret void
}
+
+; 64 bit
+; FUNC-LABEL: {{^}}test_umax_ugt_i64
+; SI: s_endpgm
+
+; EG: MAX_UINT
+; EG: MAX_UINT
+define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp ugt i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_umax_uge_i64
+; SI: s_endpgm
+
+; EG: MAX_UINT
+; EG: MAX_UINT
+define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp uge i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_imax_sgt_i64
+; SI: s_endpgm
+
+; EG-DAG: MAX_UINT
+; EG-DAG: MAX_INT
+define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp sgt i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_imax_sge_i64
+; SI: s_endpgm
+
+; EG-DAG: MAX_UINT
+; EG-DAG: MAX_INT
+define void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp sge i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll
index cfb94b272e51..a12dba2eb6e9 100644
--- a/test/CodeGen/AMDGPU/max3.ll
+++ b/test/CodeGen/AMDGPU/max3.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: @v_test_imax3_sgt_i32
; SI: v_max3_i32
define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
@@ -24,7 +24,7 @@ define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; FUNC-LABEL: @v_test_umax3_ugt_i32
; SI: v_max3_u32
define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
index 65b454b5d8cb..17b4af818f8f 100644
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -231,8 +231,8 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32
}
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
-; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
@@ -334,8 +334,8 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f
}
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
-; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
@@ -376,7 +376,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %
%w = load i32, i32 addrspace(1)* %in.gep.3
; Make sure the barrier doesn't stop this
- tail call void @llvm.AMDGPU.barrier.local() #1
+ tail call void @llvm.amdgcn.s.barrier() #1
store i32 %w, i32 addrspace(1)* %out.gep.3
store i32 %z, i32 addrspace(1)* %out.gep.2
@@ -413,7 +413,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %
%w = load i32, i32 addrspace(1)* %in.gep.3
; Make sure the barrier doesn't stop this
- tail call void @llvm.AMDGPU.barrier.local() #1
+ tail call void @llvm.amdgcn.s.barrier() #1
store i32 %w, i32 addrspace(1)* %out
store i32 %z, i32 addrspace(1)* %out.gep.1
@@ -640,13 +640,13 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
@@ -657,13 +657,13 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a
; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
@@ -673,13 +673,13 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
@@ -690,13 +690,13 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa
; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
@@ -705,7 +705,7 @@ define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x dou
ret void
}
-declare void @llvm.AMDGPU.barrier.local() #1
+declare void @llvm.amdgcn.s.barrier() #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
index 215dbeb4b2fd..5d64a152af3c 100644
--- a/test/CodeGen/AMDGPU/min.ll
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -1,24 +1,25 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
; SI: v_min_i32_e32
+
+; EG: MIN_INT
define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}s_test_imin_sle_i32:
; SI: s_min_i32
+
+; EG: MIN_INT
define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -28,6 +29,8 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32:
; SI: s_min_i32
+
+; EG: MIN_INT
define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
%cmp = icmp sle <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
@@ -40,6 +43,11 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; SI: s_min_i32
; SI: s_min_i32
; SI: s_min_i32
+
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
%cmp = icmp sle <4 x i32> %a, %b
%val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
@@ -79,6 +87,11 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
; SI: v_min_i32
; SI: s_endpgm
+
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind {
%cmp = icmp sle <4 x i8> %a, %b
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
@@ -91,6 +104,11 @@ define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x
; SI: v_min_i32
; SI: v_min_i32
; SI: v_min_i32
+
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
+; EG: MIN_INT
define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
@@ -100,21 +118,21 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <
; FUNC-LABEL: @v_test_imin_slt_i32
; SI: v_min_i32_e32
+
+; EG: MIN_INT
define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: @s_test_imin_slt_i32
; SI: s_min_i32
+
+; EG: MIN_INT
define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp slt i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -125,6 +143,9 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32:
; SI: s_min_i32
; SI: s_min_i32
+
+; EG: MIN_INT
+; EG: MIN_INT
define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%cmp = icmp slt <2 x i32> %a, %b
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
@@ -134,6 +155,8 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <
; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32:
; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+
+; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp slt i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
@@ -143,6 +166,8 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32:
; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+
+; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
%cmp = icmp sle i32 %a, 8
%val = select i1 %cmp, i32 %a, i32 8
@@ -152,16 +177,14 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; FUNC-LABEL: @v_test_umin_ule_i32
; SI: v_min_u32_e32
+
+; EG: MIN_UINT
define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
@@ -171,20 +194,22 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: v_min_u32_e32
; SI-NOT: v_min_u32_e32
; SI: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
- %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep0
- %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep1
+ %a = load <3 x i32>, <3 x i32> addrspace(1)* %aptr
+ %b = load <3 x i32>, <3 x i32> addrspace(1)* %bptr
%cmp = icmp ule <3 x i32> %a, %b
%val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
- store <3 x i32> %val, <3 x i32> addrspace(1)* %outgep
+ store <3 x i32> %val, <3 x i32> addrspace(1)* %out
ret void
}
; FUNC-LABEL: @s_test_umin_ule_i32
; SI: s_min_u32
+
+; EG: MIN_UINT
define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp ule i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -194,16 +219,14 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: @v_test_umin_ult_i32
; SI: v_min_u32_e32
+
+; EG: MIN_UINT
define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
@@ -211,21 +234,21 @@ define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: v_min_u32_e32
+
+; EG: MIN_UINT
define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i8, i8 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i8, i8 addrspace(1)* %bptr, i32 %tid
- %outgep = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
- %a = load i8, i8 addrspace(1)* %gep0, align 1
- %b = load i8, i8 addrspace(1)* %gep1, align 1
+ %a = load i8, i8 addrspace(1)* %aptr, align 1
+ %b = load i8, i8 addrspace(1)* %bptr, align 1
%cmp = icmp ult i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
- store i8 %val, i8 addrspace(1)* %outgep, align 1
+ store i8 %val, i8 addrspace(1)* %out, align 1
ret void
}
; FUNC-LABEL: @s_test_umin_ult_i32
; SI: s_min_u32
+
+; EG: MIN_UINT
define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
@@ -239,24 +262,23 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; SI-NEXT: v_cndmask_b32
; SI-NOT: v_min
; SI: s_endpgm
+
+; EG-NOT: MIN_UINT
define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
- %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
- %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid
- %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
%cmp = icmp ult i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
- store i32 %val, i32 addrspace(1)* %outgep0, align 4
- store i1 %cmp, i1 addrspace(1)* %outgep1
+ store i32 %val, i32 addrspace(1)* %out0, align 4
+ store i1 %cmp, i1 addrspace(1)* %out1
ret void
}
; FUNC-LABEL: @s_test_umin_ult_v1i32
; SI: s_min_u32
+
+; EG: MIN_UINT
define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
%cmp = icmp ult <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
@@ -273,6 +295,15 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; SI: s_min_u32
; SI: s_min_u32
; SI: s_min_u32
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
%cmp = icmp ult <8 x i32> %a, %b
%val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
@@ -289,6 +320,15 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
; SI: v_min_u32
; SI: v_min_u32
; SI: v_min_u32
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
%cmp = icmp ult <8 x i16> %a, %b
%val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
@@ -301,8 +341,10 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI-NEXT: buffer_store_dword [[VMIN]]
+; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI: buffer_store_dword [[VMIN]]
+
+; EG: MIN_UINT
define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
@@ -319,8 +361,10 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI-NEXT: buffer_store_dword [[VMIN]]
+; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI: buffer_store_dword [[VMIN]]
+
+; EG: MIN_INT
define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
@@ -334,9 +378,60 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16
; FUNC-LABEL: {{^}}s_test_imin_sle_i16:
; SI: s_min_i32
+
+; EG: MIN_INT
define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
%cmp = icmp sle i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, i16 addrspace(1)* %out
ret void
}
+
+; 64 bit
+; FUNC-LABEL: {{^}}test_umin_ult_i64
+; SI: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp ult i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_umin_ule_i64
+; SI: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp ule i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_imin_slt_i64
+; SI: s_endpgm
+
+; EG-DAG: MIN_UINT
+; EG-DAG: MIN_INT
+define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp slt i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_imin_sle_i64
+; SI: s_endpgm
+
+; EG-DAG: MIN_UINT
+; EG-DAG: MIN_INT
+define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %tmp = icmp sle i64 %a, %b
+ %val = select i1 %tmp, i64 %a, i64 %b
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll
index 38ef46d1bdd6..728479ad9f62 100644
--- a/test/CodeGen/AMDGPU/min3.ll
+++ b/test/CodeGen/AMDGPU/min3.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; FUNC-LABEL: @v_test_imin3_slt_i32
; SI: v_min3_i32
define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
@@ -24,7 +24,7 @@ define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; FUNC-LABEL: @v_test_umin3_ult_i32
; SI: v_min3_u32
define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
@@ -44,7 +44,7 @@ define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
; SI: v_min_i32
; SI: v_min3_i32
define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -78,7 +78,7 @@ define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %ap
; FUNC-LABEL: @v_test_umin3_2_uses
; SI-NOT: v_min3
define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll
index 4af9cdf1b960..3d6d7fae0fd6 100644
--- a/test/CodeGen/AMDGPU/missing-store.ll
+++ b/test/CodeGen/AMDGPU/missing-store.ll
@@ -7,8 +7,12 @@
; FUNC-LABEL: {{^}}missing_store_reduced:
; SI: ds_read_b64
-; SI: buffer_store_dword
-; SI: buffer_load_dword
+; SI-DAG: buffer_store_dword
+; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword
+; SI: s_nop 2
+; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; SI: buffer_store_dword
; SI: s_endpgm
define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index e9f641b736d5..36f12573c173 100644
--- a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -6,12 +6,13 @@
; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
-; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
-; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
-; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
; GCN-NOT: v_mov_b32
; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
diff --git a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index 8bca0575ecd2..1a0a39027853 100644
--- a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -7,12 +7,12 @@
; Check that moving the pointer out of the resource descriptor to
; vaddr works for atomics.
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN-LABEL: {{^}}atomic_max_i32:
; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}}
define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
%ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
%xor = xor i32 %tid, 1
@@ -32,7 +32,7 @@ exit:
; GCN-LABEL: {{^}}atomic_max_i32_noret:
; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}}
define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
%ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
%xor = xor i32 %tid, 1
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index b19163f294e0..a574365da986 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
-declare i32 @llvm.r600.read.tidig.x() readnone
+declare i32 @llvm.amdgcn.workitem.id.x() readnone
;;;==========================================================================;;;
;;; MUBUF LOAD TESTS
@@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone
; MUBUF load with an immediate byte offset that fits into 12-bits
; CHECK-LABEL: {{^}}mubuf_load0:
-; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
+; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
@@ -19,7 +19,7 @@ entry:
; MUBUF load with the largest possible immediate offset
; CHECK-LABEL: {{^}}mubuf_load1:
-; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
+; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
@@ -31,7 +31,7 @@ entry:
; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
; CHECK-LABEL: {{^}}mubuf_load2:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
-; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
+; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024
@@ -55,15 +55,14 @@ entry:
; CHECK-LABEL: {{^}}soffset_max_imm:
; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
-define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+define amdgpu_gs void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
main_body:
%tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
%tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
%tmp2 = shl i32 %6, 2
%tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
%tmp4 = add i32 %6, 16
- %tmp5 = bitcast float 0.0 to i32
- call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+ call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
ret void
}
@@ -74,15 +73,14 @@ main_body:
; CHECK-LABEL: {{^}}soffset_no_fold:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41
; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
-define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+define amdgpu_gs void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
main_body:
%tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
%tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
%tmp2 = shl i32 %6, 2
%tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
%tmp4 = add i32 %6, 16
- %tmp5 = bitcast float 0.0 to i32
- call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+ call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
ret void
}
@@ -92,7 +90,7 @@ main_body:
; MUBUF store with an immediate byte offset that fits into 12-bits
; CHECK-LABEL: {{^}}mubuf_store0:
-; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
+; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
define void @mubuf_store0(i32 addrspace(1)* %out) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %out, i64 1
@@ -102,7 +100,7 @@ entry:
; MUBUF store with the largest possible immediate offset
; CHECK-LABEL: {{^}}mubuf_store1:
-; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
+; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
define void @mubuf_store1(i8 addrspace(1)* %out) {
entry:
@@ -114,7 +112,7 @@ entry:
; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
; CHECK-LABEL: {{^}}mubuf_store2:
; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
-; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
+; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
define void @mubuf_store2(i32 addrspace(1)* %out) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024
@@ -135,14 +133,14 @@ entry:
}
; CHECK-LABEL: {{^}}store_sgpr_ptr:
-; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
store i32 99, i32 addrspace(1)* %out, align 4
ret void
}
; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
-; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
+; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
store i32 99, i32 addrspace(1)* %out.gep, align 4
@@ -151,7 +149,7 @@ define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
-; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
store i32 99, i32 addrspace(1)* %out.gep, align 4
@@ -160,7 +158,7 @@ define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
-; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
+; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
@@ -170,14 +168,13 @@ define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_vgpr_ptr:
; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
store i32 99, i32 addrspace(1)* %out.gep, align 4
ret void
}
-declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" }
-attributes #3 = { nounwind readonly }
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index 94e0f96b323e..5ceef7fda826 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
; mul24 and mad24 are affected
@@ -96,8 +96,8 @@ define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
}
; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
-; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
-; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
; SI: s_endpgm
define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
%val = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll
index 7609dcc87afa..1a323fbaa1a3 100644
--- a/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/test/CodeGen/AMDGPU/mul_int24.ll
@@ -1,7 +1,7 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; FUNC-LABEL: {{^}}i32_mul24:
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
diff --git a/test/CodeGen/AMDGPU/mul_uint24.ll b/test/CodeGen/AMDGPU/mul_uint24.ll
index 8a0e71d739be..fdd348403edf 100644
--- a/test/CodeGen/AMDGPU/mul_uint24.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24.ll
@@ -1,7 +1,7 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
; FUNC-LABEL: {{^}}u32_mul24:
; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
new file mode 100644
index 000000000000..e1130c9125e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}define amdgpu_vs void @main
+; CHECK: main_body:
+; CHECK: LOOP.outer:
+; CHECK: LOOP:
+; CHECK: [[if:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(
+; CHECK: [[if_exec:%[0-9]+]] = extractvalue { i1, i64 } [[if]], 1
+;
+; CHECK: Flow:
+;
+; Ensure two else.break calls, for both the inner and outer loops
+;
+; CHECK: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
+; CHECK-NEXT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
+; CHECK-NEXT: call void @llvm.amdgcn.end.cf
+;
+; CHECK: Flow1:
+define amdgpu_vs void @main(<4 x float> %vec, i32 %ub, i32 %cont) {
+main_body:
+ br label %LOOP.outer
+
+LOOP.outer: ; preds = %ENDIF, %main_body
+ %tmp43 = phi i32 [ 0, %main_body ], [ %tmp47, %ENDIF ]
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF, %LOOP.outer
+ %tmp45 = phi i32 [ %tmp43, %LOOP.outer ], [ %tmp47, %ENDIF ]
+ %tmp47 = add i32 %tmp45, 1
+ %tmp48 = icmp slt i32 %tmp45, %ub
+ br i1 %tmp48, label %ENDIF, label %IF
+
+IF: ; preds = %LOOP
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %tmp51 = icmp eq i32 %tmp47, %cont
+ br i1 %tmp51, label %LOOP, label %LOOP.outer
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll b/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll
index 73a146710a9f..d1fe794b93fb 100644
--- a/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll
+++ b/test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll
@@ -1,18 +1,16 @@
; RUN: not llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s 2>&1 | FileCheck %s
-; CHECK: error: unsupported non-compute shaders with HSA in pixel_shader
-define void @pixel_shader() #0 {
+; CHECK: in function pixel_s{{.*}}: unsupported non-compute shaders with HSA
+define amdgpu_ps void @pixel_shader() #0 {
ret void
}
-define void @vertex_shader() #1 {
+; CHECK: in function vertex_s{{.*}}: unsupported non-compute shaders with HSA
+define amdgpu_vs void @vertex_shader() #0 {
ret void
}
-define void @geometry_shader() #2 {
+; CHECK: in function geometry_s{{.*}}: unsupported non-compute shaders with HSA
+define amdgpu_gs void @geometry_shader() #0 {
ret void
}
-
-attributes #0 = { nounwind "ShaderType"="0" }
-attributes #1 = { nounwind "ShaderType"="1" }
-attributes #2 = { nounwind "ShaderType"="2" }
diff --git a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
index 9a814b579deb..9dd99efd997c 100644
--- a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
@@ -1,19 +1,24 @@
-; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s
-; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s
-; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s
+; RUN: llc -march=amdgcn -mcpu=SI -filetype=obj < %s | llvm-readobj -relocations -symbols | FileCheck %s -check-prefix=GCN
+; RUN: llc -march=amdgcn -mcpu=tonga -filetype=obj < %s | llvm-readobj -relocations -symbols | FileCheck %s -check-prefix=GCN
+; RUN: llc -march=r600 -mcpu=cypress -filetype=obj < %s | llvm-readobj -relocations -symbols | FileCheck %s -check-prefix=EG
+; GCN: R_AMDGPU_REL32 extern_const_addrspace
+; EG: R_AMDGPU_ABS32 extern_const_addrspace
+
+; CHECK-DAG: Name: extern_const_addrspace
@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
-; FUNC-LABEL: {{^}}load_extern_const_init:
+; CHECK-DAG: Name: load_extern_const_init
define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
%val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
store i32 %val, i32 addrspace(1)* %out, align 4
ret void
}
+; CHECK-DAG: Name: undef_const_addrspace
@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
-; FUNC-LABEL: {{^}}load_undef_const_init:
+; CHECK-DAG: Name: undef_const_addrspace
define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
%val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
store i32 %val, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index f81911aafe22..fd66b0b5d1f6 100644
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Make sure we don't turn the 32-bit argument load into a 16-bit
; load. There aren't extending scalar lods, so that would require
@@ -22,7 +22,7 @@ define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounw
; SI: buffer_load_dword v
; SI: buffer_store_short v
define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
%load = load i32, i32 addrspace(1)* %gep.in
@@ -44,7 +44,7 @@ define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwin
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
%load = load i32, i32 addrspace(1)* %gep.in
@@ -66,7 +66,7 @@ define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwin
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
%load = load i32, i32 addrspace(1)* %gep.in
@@ -88,7 +88,7 @@ define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounw
; SI: buffer_load_dword v
; SI: buffer_store_dword v
define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%load = load i64, i64 addrspace(1)* %gep.in
@@ -111,7 +111,7 @@ define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
; SI: buffer_load_dword v
; SI: buffer_store_dword v
define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%load = load i64, i64 addrspace(1)* %gep.in
@@ -135,7 +135,7 @@ define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwin
; SI: buffer_load_ubyte v
; SI: buffer_store_byte v
define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
%load = load i16, i16 addrspace(1)* %gep.in
@@ -158,7 +158,7 @@ define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
%load = load i64, i64 addrspace(1)* %gep.in
@@ -181,7 +181,7 @@ define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwin
; SI: buffer_load_dword v
; SI: buffer_store_byte v
define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
%load = load i64, i64 addrspace(1)* %gep.in
@@ -201,3 +201,15 @@ entry:
store i32 %mask, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %bc = bitcast <2 x i32> %ld to i64
+ %hi = lshr i64 %bc, 32
+ %trunc = trunc i64 %hi to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/test/CodeGen/AMDGPU/opencl-image-metadata.ll
index bc467e47dc31..0242f6d6145a 100644
--- a/test/CodeGen/AMDGPU/opencl-image-metadata.ll
+++ b/test/CodeGen/AMDGPU/opencl-image-metadata.ll
@@ -1,5 +1,5 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; Make sure the OpenCL Image lowering pass doesn't crash when argument metadata
; is not in expected order.
diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll
index 9e514ef9970a..d6fc65fa7e83 100644
--- a/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/test/CodeGen/AMDGPU/operand-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}fold_sgpr:
; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
@@ -8,7 +8,7 @@ entry:
br i1 %tmp0, label %if, label %endif
if:
- %id = call i32 @llvm.r600.read.tidig.x()
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
%offset = add i32 %fold, %id
%tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
store i32 0, i32 addrspace(1)* %tmp1
@@ -27,7 +27,7 @@ entry:
br i1 %tmp0, label %if, label %endif
if:
- %id = call i32 @llvm.r600.read.tidig.x()
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
%val = or i32 %id, %fold
store i32 %val, i32 addrspace(1)* %out
br label %endif
@@ -63,7 +63,7 @@ entry:
define void @vector_inline(<4 x i32> addrspace(1)* %out) {
entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp0, 1
%tmp2 = add i32 %tmp0, 2
%tmp3 = add i32 %tmp0, 3
@@ -82,7 +82,7 @@ entry:
define void @imm_one_use(i32 addrspace(1)* %out) {
entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = xor i32 %tmp0, 100
store i32 %tmp1, i32 addrspace(1)* %out
ret void
@@ -96,7 +96,7 @@ entry:
define void @vector_imm(<4 x i32> addrspace(1)* %out) {
entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp0, 1
%tmp2 = add i32 %tmp0, 2
%tmp3 = add i32 %tmp0, 3
@@ -109,5 +109,6 @@ entry:
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #0
-attributes #0 = { readnone }
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index e40f18f040b7..9b90ff798ca7 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -96,7 +96,7 @@ define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; SI: v_or_b32_e32 v{{[0-9]}}
define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
%loada = load i64, i64 addrspace(1)* %a, align 8
- %loadb = load i64, i64 addrspace(1)* %a, align 8
+ %loadb = load i64, i64 addrspace(1)* %b, align 8
%or = or i64 %loada, %loadb
store i64 %or, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/over-max-lds-size.ll b/test/CodeGen/AMDGPU/over-max-lds-size.ll
new file mode 100644
index 000000000000..32ad9aba04ed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/over-max-lds-size.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: local memory limit exceeded (400000) in use_huge_lds
+
+@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
+
+define void @use_huge_lds() {
+entry:
+ %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
+ store i32 0, i32 addrspace(3)* %v0
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index f32b044198ab..ea943a533c81 100644
--- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -1,5 +1,4 @@
-; Function Attrs: nounwind
-; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck %s
;
; CFG flattening should use parallel-and mode to generate branch conditions and
; then merge if-regions with the same bodies.
diff --git a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
index 51985af42a29..3e0d36978ad4 100644
--- a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
+++ b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
@@ -8,10 +8,10 @@
; During live interval construction, the first sub register def is
; incorrectly marked as dead.
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %in.gep
diff --git a/test/CodeGen/AMDGPU/predicate-dp4.ll b/test/CodeGen/AMDGPU/predicate-dp4.ll
index 6bc187594359..7ac2bb7b0c7f 100644
--- a/test/CodeGen/AMDGPU/predicate-dp4.ll
+++ b/test/CodeGen/AMDGPU/predicate-dp4.ll
@@ -3,7 +3,7 @@
; CHECK-LABEL: {{^}}main:
; CHECK: PRED_SETE_INT * Pred,
; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one
-define void @main(<4 x float> inreg) #0 {
+define amdgpu_ps void @main(<4 x float> inreg) {
main_body:
%1 = extractelement <4 x float> %0, i32 0
%2 = bitcast float %1 to i32
@@ -11,17 +11,16 @@ main_body:
br i1 %3, label %IF, label %ENDIF
IF: ; preds = %main_body
- %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0)
+ %4 = call float @llvm.r600.dot4(<4 x float> %0, <4 x float> %0)
br label %ENDIF
ENDIF: ; preds = %IF, %main_body
%5 = phi float [%4, %IF], [0.000000e+00, %main_body]
%6 = insertelement <4 x float> undef, float %5, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %6, i32 0, i32 0)
ret void
}
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #1 = { readnone }
-attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/predicates.ll b/test/CodeGen/AMDGPU/predicates.ll
index 0ce74d97ba8e..c1af815c7b1e 100644
--- a/test/CodeGen/AMDGPU/predicates.ll
+++ b/test/CodeGen/AMDGPU/predicates.ll
@@ -1,27 +1,27 @@
-; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s
+; RUN: llc -spec-exec-max-speculation-cost=0 -march=r600 -r600-ir-structurize=0 -mcpu=redwood < %s | FileCheck %s
; These tests make sure the compiler is optimizing branches using predicates
; when it is legal to do so.
-; CHECK: {{^}}simple_if:
+; CHECK-LABEL: {{^}}simple_if:
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
entry:
- %0 = icmp sgt i32 %in, 0
- br i1 %0, label %IF, label %ENDIF
+ %cmp0 = icmp sgt i32 %in, 0
+ br i1 %cmp0, label %IF, label %ENDIF
IF:
- %1 = shl i32 %in, 1
+ %tmp1 = shl i32 %in, 1
br label %ENDIF
ENDIF:
- %2 = phi i32 [ %in, %entry ], [ %1, %IF ]
- store i32 %2, i32 addrspace(1)* %out
+ %tmp2 = phi i32 [ %in, %entry ], [ %tmp1, %IF ]
+ store i32 %tmp2, i32 addrspace(1)* %out
ret void
}
-; CHECK: {{^}}simple_if_else:
+; CHECK-LABEL: {{^}}simple_if_else:
; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
@@ -44,7 +44,7 @@ ENDIF:
ret void
}
-; CHECK: {{^}}nested_if:
+; CHECK-LABEL: {{^}}nested_if:
; CHECK: ALU_PUSH_BEFORE
; CHECK: JUMP
; CHECK: POP
@@ -71,7 +71,7 @@ ENDIF:
ret void
}
-; CHECK: {{^}}nested_if_else:
+; CHECK-LABEL: {{^}}nested_if_else:
; CHECK: ALU_PUSH_BEFORE
; CHECK: JUMP
; CHECK: POP
diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll
new file mode 100644
index 000000000000..cd8fb22e620a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/private-element-size.ll
@@ -0,0 +1,252 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL -check-prefix=HSA_ELTGE8 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL -check-prefix=HSA-ELTGE8 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT4 -check-prefix=HSA -check-prefix=HSA-ELT4 -check-prefix=ALL %s
+
+
+; ALL-LABEL: {{^}}private_elt_size_v4i32:
+
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x <4 x i32>], align 16
+ %gep0 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 1
+ store <4 x i32> zeroinitializer, <4 x i32>* %gep0
+ store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %gep1
+ %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 %index
+ %load = load <4 x i32>, <4 x i32>* %gep2
+ store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}private_elt_size_v8i32:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48
+
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:44{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:52{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}
+
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x <8 x i32>], align 16
+ %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 1
+ store <8 x i32> zeroinitializer, <8 x i32>* %gep0
+ store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %gep1
+ %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 %index
+ %load = load <8 x i32>, <8 x i32>* %gep2
+ store <8 x i32> %load, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+
+; ALL-LABEL: {{^}}private_elt_size_i64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x i64], align 16
+ %gep0 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 1
+ store i64 0, i64* %gep0
+ store i64 34359738602, i64* %gep1
+ %gep2 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 %index
+ %load = load i64, i64* %gep2
+ store i64 %load, i64 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}private_elt_size_f64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x double], align 16
+ %gep0 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 1
+ store double 0.0, double* %gep0
+ store double 4.0, double* %gep1
+ %gep2 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 %index
+ %load = load double, double* %gep2
+ store double %load, double addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}private_elt_size_v2i64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+ %index.load = load i32, i32 addrspace(1)* %gep.index
+ %index = and i32 %index.load, 2
+ %alloca = alloca [2 x <2 x i64>], align 16
+ %gep0 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 1
+ store <2 x i64> zeroinitializer, <2 x i64>* %gep0
+ store <2 x i64> <i64 1, i64 2>, <2 x i64>* %gep1
+ %gep2 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 %index
+ %load = load <2 x i64>, <2 x i64>* %gep2
+ store <2 x i64> %load, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/private-memory-atomics.ll b/test/CodeGen/AMDGPU/private-memory-atomics.ll
index a008ac98a43b..eea10c862238 100644
--- a/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -7,11 +7,11 @@
define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
- %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
- %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
store i32 0, i32* %tmp1
store i32 1, i32* %tmp2
- %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+ %tmp3 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
%tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel
store i32 %tmp4, i32 addrspace(1)* %out
ret void
@@ -20,11 +20,11 @@ entry:
define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
- %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
- %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
store i32 0, i32* %tmp1
store i32 1, i32* %tmp2
- %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+ %tmp3 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
%tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic
%val = extractvalue { i32, i1 } %tmp4, 0
store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/private-memory-broken.ll b/test/CodeGen/AMDGPU/private-memory-broken.ll
index 6b18a19f1956..8ba0b70dbdbb 100644
--- a/test/CodeGen/AMDGPU/private-memory-broken.ll
+++ b/test/CodeGen/AMDGPU/private-memory-broken.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -march=amdgcn %s -o /dev/null 2>&1 | FileCheck %s
; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s
; Make sure promote alloca pass doesn't crash
diff --git a/test/CodeGen/AMDGPU/private-memory-r600.ll b/test/CodeGen/AMDGPU/private-memory-r600.ll
new file mode 100644
index 000000000000..883bdc1ce265
--- /dev/null
+++ b/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -0,0 +1,300 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}mova_same_clause:
+
+; R600: LDS_WRITE
+; R600: LDS_WRITE
+; R600: LDS_READ
+; R600: LDS_READ
+
+; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
+; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
+
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; This test checks that the stack offset is calculated correctly for structs.
+; All register loads/stores should be optimized away, so there shouldn't be
+; any MOVA instructions.
+;
+; XXX: This generated code has unnecessary MOVs; we should be able to
+; optimize this.
+
+; FUNC-LABEL: {{^}}multiple_structs:
+; R600-NOT: MOVA_INT
+%struct.point = type { i32, i32 }
+
+define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+entry:
+ %a = alloca %struct.point
+ %b = alloca %struct.point
+ %a.x.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
+ %a.y.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 1
+ %b.x.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
+ %b.y.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 1
+ store i32 0, i32* %a.x.ptr
+ store i32 1, i32* %a.y.ptr
+ store i32 2, i32* %b.x.ptr
+ store i32 3, i32* %b.y.ptr
+ %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
+ %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
+ %a.indirect = load i32, i32* %a.indirect.ptr
+ %b.indirect = load i32, i32* %b.indirect.ptr
+ %0 = add i32 %a.indirect, %b.indirect
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test direct access of a private array inside a loop. The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; FUNC-LABEL: {{^}}direct_loop:
+; R600-NOT: MOVA_INT
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+entry:
+ %prv_array_const = alloca [2 x i32]
+ %prv_array = alloca [2 x i32]
+ %a = load i32, i32 addrspace(1)* %in
+ %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %b = load i32, i32 addrspace(1)* %b_src_ptr
+ %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32* %a_dst_ptr
+ %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32* %b_dst_ptr
+ br label %for.body
+
+for.body:
+ %inc = phi i32 [0, %entry], [%count, %for.body]
+ %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ %x = load i32, i32* %x_ptr
+ %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %y = load i32, i32* %y_ptr
+ %xy = add i32 %x, %y
+ store i32 %xy, i32* %y_ptr
+ %count = add i32 %inc, 1
+ %done = icmp eq i32 %count, 4095
+ br i1 %done, label %for.end, label %for.body
+
+for.end:
+ %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %value = load i32, i32* %value_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}short_array:
+
+; R600: MOVA_INT
+define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16, i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}char_array:
+
+; R600: MOVA_INT
+define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8, i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+
+}
+
+; Make sure we don't overwrite workitem information with private memory
+
+; FUNC-LABEL: {{^}}work_item_info:
+; R600-NOT: MOV T0.X
+; Additional check in case the move ends up in the last slot
+; R600-NOT: MOV * T0.X
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32, i32* %3
+ %5 = call i32 @llvm.r600.read.tidig.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test that two stack objects are not stored in the same register
+; The second stack object should be in T3.X
+; FUNC-LABEL: {{^}}no_overlap:
+; R600_CHECK: MOV
+; R600_CHECK: [[CHAN:[XYZW]]]+
+; R600-NOT: [[CHAN]]+
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+ %0 = alloca [3 x i8], align 1
+ %1 = alloca [2 x i8], align 1
+ %2 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 0
+ %3 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 1
+ %4 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 2
+ %5 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 0
+ %6 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 1
+ store i8 0, i8* %2
+ store i8 1, i8* %3
+ store i8 2, i8* %4
+ store i8 1, i8* %5
+ store i8 0, i8* %6
+ %7 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 %in
+ %8 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 %in
+ %9 = load i8, i8* %7
+ %10 = load i8, i8* %8
+ %11 = add i8 %9, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i8]]
+ %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
+ store i8 0, i8* %gep0
+ store i8 1, i8* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i8, i8* %gep2
+ %sext = sext i8 %load to i32
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i64]]
+ %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
+ store i64 0, i64* %gep0
+ store i64 1, i64* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i64, i64* %gep2
+ store i64 %load, i64 addrspace(1)* %out
+ ret void
+}
+
+%struct.pair32 = type { i32, i32 }
+
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x %struct.pair32]]
+ %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
+ %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x %struct.pair32]
+ %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
+ %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %cmp = icmp eq i32 %in, 0
+ %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
+ %load = load i32, i32* %sel
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
+; finds one, it should stop trying to promote.
+
+; FUNC-LABEL: ptrtoint:
+; SI-NOT: ds_write
+; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32]
+ %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ store i32 5, i32* %tmp0
+ %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+ %tmp2 = add i32 %tmp1, 5
+ %tmp3 = inttoptr i32 %tmp2 to i32*
+ %tmp4 = getelementptr inbounds i32, i32* %tmp3, i32 %b
+ %tmp5 = load i32, i32* %tmp4
+ store i32 %tmp5, i32 addrspace(1)* %out
+ ret void
+}
+
+; OPT: !0 = !{i32 0, i32 2048}
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }
diff --git a/test/CodeGen/AMDGPU/private-memory.ll b/test/CodeGen/AMDGPU/private-memory.ll
deleted file mode 100644
index 79778eebd802..000000000000
--- a/test/CodeGen/AMDGPU/private-memory.ll
+++ /dev/null
@@ -1,325 +0,0 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-
-; FUNC-LABEL: {{^}}mova_same_clause:
-
-; R600: LDS_WRITE
-; R600: LDS_WRITE
-; R600: LDS_READ
-; R600: LDS_READ
-
-; HSA-PROMOTE: .amd_kernel_code_t
-; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
-; HSA-PROMOTE: .end_amd_kernel_code_t
-
-; SI-PROMOTE: ds_write_b32
-; SI-PROMOTE: ds_write_b32
-; SI-PROMOTE: ds_read_b32
-; SI-PROMOTE: ds_read_b32
-
-; HSA-ALLOCA: .amd_kernel_code_t
-; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
-; by 4 bytes.
-; HSA-ALLOCA: workitem_private_segment_byte_size = 24
-; HSA-ALLOCA: .end_amd_kernel_code_t
-
-; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
-; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
-entry:
- %stack = alloca [5 x i32], align 4
- %0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
- store i32 4, i32* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
- store i32 5, i32* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
- %2 = load i32, i32* %arrayidx10, align 4
- store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
- %3 = load i32, i32* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %3, i32 addrspace(1)* %arrayidx13
- ret void
-}
-
-; This test checks that the stack offset is calculated correctly for structs.
-; All register loads/stores should be optimized away, so there shouldn't be
-; any MOVA instructions.
-;
-; XXX: This generated code has unnecessary MOVs, we should be able to optimize
-; this.
-
-; FUNC-LABEL: {{^}}multiple_structs:
-; R600-NOT: MOVA_INT
-; SI-NOT: v_movrel
-; SI-NOT: v_movrel
-%struct.point = type { i32, i32 }
-
-define void @multiple_structs(i32 addrspace(1)* %out) {
-entry:
- %a = alloca %struct.point
- %b = alloca %struct.point
- %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
- %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1
- %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
- %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1
- store i32 0, i32* %a.x.ptr
- store i32 1, i32* %a.y.ptr
- store i32 2, i32* %b.x.ptr
- store i32 3, i32* %b.y.ptr
- %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
- %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
- %a.indirect = load i32, i32* %a.indirect.ptr
- %b.indirect = load i32, i32* %b.indirect.ptr
- %0 = add i32 %a.indirect, %b.indirect
- store i32 %0, i32 addrspace(1)* %out
- ret void
-}
-
-; Test direct access of a private array inside a loop. The private array
-; loads and stores should be lowered to copies, so there shouldn't be any
-; MOVA instructions.
-
-; FUNC-LABEL: {{^}}direct_loop:
-; R600-NOT: MOVA_INT
-; SI-NOT: v_movrel
-
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
- %prv_array_const = alloca [2 x i32]
- %prv_array = alloca [2 x i32]
- %a = load i32, i32 addrspace(1)* %in
- %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
- %b = load i32, i32 addrspace(1)* %b_src_ptr
- %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
- store i32 %a, i32* %a_dst_ptr
- %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
- store i32 %b, i32* %b_dst_ptr
- br label %for.body
-
-for.body:
- %inc = phi i32 [0, %entry], [%count, %for.body]
- %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
- %x = load i32, i32* %x_ptr
- %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
- %y = load i32, i32* %y_ptr
- %xy = add i32 %x, %y
- store i32 %xy, i32* %y_ptr
- %count = add i32 %inc, 1
- %done = icmp eq i32 %count, 4095
- br i1 %done, label %for.end, label %for.body
-
-for.end:
- %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
- %value = load i32, i32* %value_ptr
- store i32 %value, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}short_array:
-
-; R600: MOVA_INT
-
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
-; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
-define void @short_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %0 = alloca [2 x i16]
- %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0
- %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1
- store i16 0, i16* %1
- store i16 1, i16* %2
- %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index
- %4 = load i16, i16* %3
- %5 = sext i16 %4 to i32
- store i32 %5, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}char_array:
-
-; R600: MOVA_INT
-
-; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
-; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
-define void @char_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %0 = alloca [2 x i8]
- %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0
- %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1
- store i8 0, i8* %1
- store i8 1, i8* %2
- %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index
- %4 = load i8, i8* %3
- %5 = sext i8 %4 to i32
- store i32 %5, i32 addrspace(1)* %out
- ret void
-
-}
-
-; Make sure we don't overwrite workitem information with private memory
-
-; FUNC-LABEL: {{^}}work_item_info:
-; R600-NOT: MOV T0.X
-; Additional check in case the move ends up in the last slot
-; R600-NOT: MOV * TO.X
-
-; SI-NOT: v_mov_b32_e{{(32|64)}} v0
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
-entry:
- %0 = alloca [2 x i32]
- %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
- %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
- store i32 0, i32* %1
- store i32 1, i32* %2
- %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
- %4 = load i32, i32* %3
- %5 = call i32 @llvm.r600.read.tidig.x()
- %6 = add i32 %4, %5
- store i32 %6, i32 addrspace(1)* %out
- ret void
-}
-
-; Test that two stack objects are not stored in the same register
-; The second stack object should be in T3.X
-; FUNC-LABEL: {{^}}no_overlap:
-; R600_CHECK: MOV
-; R600_CHECK: [[CHAN:[XYZW]]]+
-; R600-NOT: [[CHAN]]+
-; SI: v_mov_b32_e32 v3
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
-entry:
- %0 = alloca [3 x i8], align 1
- %1 = alloca [2 x i8], align 1
- %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0
- %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1
- %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2
- %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0
- %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1
- store i8 0, i8* %2
- store i8 1, i8* %3
- store i8 2, i8* %4
- store i8 1, i8* %5
- store i8 0, i8* %6
- %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in
- %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in
- %9 = load i8, i8* %7
- %10 = load i8, i8* %8
- %11 = add i8 %9, %10
- %12 = sext i8 %11 to i32
- store i32 %12, i32 addrspace(1)* %out
- ret void
-}
-
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x [2 x i8]]
- %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
- store i8 0, i8* %gep0
- store i8 1, i8* %gep1
- %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i8, i8* %gep2
- %sext = sext i8 %load to i32
- store i32 %sext, i32 addrspace(1)* %out
- ret void
-}
-
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x [2 x i32]]
- %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i32, i32* %gep2
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x [2 x i64]]
- %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
- store i64 0, i64* %gep0
- store i64 1, i64* %gep1
- %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
- %load = load i64, i64* %gep2
- store i64 %load, i64 addrspace(1)* %out
- ret void
-}
-
-%struct.pair32 = type { i32, i32 }
-
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x [2 x %struct.pair32]]
- %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
- %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
- %load = load i32, i32* %gep2
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %alloca = alloca [2 x %struct.pair32]
- %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
- %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
- store i32 0, i32* %gep0
- store i32 1, i32* %gep1
- %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
- %load = load i32, i32* %gep2
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
-entry:
- %tmp = alloca [2 x i32]
- %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
- %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
- store i32 0, i32* %tmp1
- store i32 1, i32* %tmp2
- %cmp = icmp eq i32 %in, 0
- %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
- %load = load i32, i32* %sel
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
-; finds one, it should stop trying to promote.
-
-; FUNC-LABEL: ptrtoint:
-; SI-NOT: ds_write
-; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
-; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
- %alloca = alloca [16 x i32]
- %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
- store i32 5, i32* %tmp0
- %tmp1 = ptrtoint [16 x i32]* %alloca to i32
- %tmp2 = add i32 %tmp1, 5
- %tmp3 = inttoptr i32 %tmp2 to i32*
- %tmp4 = getelementptr i32, i32* %tmp3, i32 %b
- %tmp5 = load i32, i32* %tmp4
- store i32 %tmp5, i32 addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
new file mode 100644
index 000000000000..3bd0aecf7aa9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+; Make sure this allocates the correct size if the alloca has a non-zero
+; number of elements.
+
+; CHECK-LABEL: @array_alloca(
+; CHECK: %stack = alloca i32, i32 5, align 4
+define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca i32, i32 5, align 4
+ %ld0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %stack, i32 %ld0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %stack, i32 %ld1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %stack, i32 0
+ %ld2 = load i32, i32* %arrayidx10, align 4
+ store i32 %ld2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %stack, i32 1
+ %ld3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %ld3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK-LABEL: @array_alloca_dynamic(
+; CHECK: %stack = alloca i32, i32 %size, align 4
+define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
+entry:
+ %stack = alloca i32, i32 %size, align 4
+ %ld0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %stack, i32 %ld0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %stack, i32 %ld1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %stack, i32 0
+ %ld2 = load i32, i32* %arrayidx10, align 4
+ store i32 %ld2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %stack, i32 1
+ %ld3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %ld3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 10739df08379..82030f377d9f 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -6,13 +6,14 @@
declare void @foo(float*) #0
declare void @foo.varargs(...) #0
-; CHECK: error: unsupported call to function foo in crash_call_constexpr_cast
+; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
define void @crash_call_constexpr_cast() #0 {
%alloca = alloca i32
call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0
ret void
}
+; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
define void @crash_call_constexpr_cast_varargs() #0 {
%alloca = alloca i32
call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0
diff --git a/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/test/CodeGen/AMDGPU/promote-alloca-globals.ll
new file mode 100644
index 000000000000..eb0d0cc62697
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
+
+
+@global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
+@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
+
+; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; IR: alloca [10 x i32]
+; ASM-LABEL: {{^}}promote_alloca_size_256:
+; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
+
+define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+entry:
+ %stack = alloca [10 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 %tmp
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx10, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [10 x i32], [10 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+ %v0 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array0, i32 0, i32 0, i32 0
+ store i32 %tmp3, i32 addrspace(3)* %v0
+ %v1 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array1, i32 0, i32 0, i32 0
+ store i32 %tmp3, i32 addrspace(3)* %v1
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
new file mode 100644
index 000000000000..6a9ec31696d2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare {}* @llvm.invariant.start(i64, i8* nocapture) #0
+declare void @llvm.invariant.end({}*, i64, i8* nocapture) #0
+declare i8* @llvm.invariant.group.barrier(i8*) #1
+
+; GCN-LABEL: {{^}}use_invariant_promotable_lds:
+; GCN: buffer_load_dword
+; GCN: ds_write_b32
+define void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
+bb:
+ %tmp = alloca i32, align 4
+ %tmp1 = bitcast i32* %tmp to i8*
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp3 = load i32, i32 addrspace(1)* %tmp2
+ store i32 %tmp3, i32* %tmp
+ %tmp4 = call {}* @llvm.invariant.start(i64 4, i8* %tmp1) #0
+ call void @llvm.invariant.end({}* %tmp4, i64 4, i8* %tmp1) #0
+ %tmp5 = call i8* @llvm.invariant.group.barrier(i8* %tmp1) #1
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
new file mode 100644
index 000000000000..eeda19fa27ac
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca %s | FileCheck -check-prefix=OPT %s
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; OPT-LABEL: @use_lifetime_promotable_lds(
+; OPT-NOT: alloca i32
+; OPT-NOT: llvm.lifetime
+; OPT: store i32 %tmp3, i32 addrspace(3)*
+define void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
+bb:
+ %tmp = alloca i32, align 4
+ %tmp1 = bitcast i32* %tmp to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %tmp1)
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp3 = load i32, i32 addrspace(1)* %tmp2
+ store i32 %tmp3, i32* %tmp
+ call void @llvm.lifetime.end(i64 4, i8* %tmp1)
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
new file mode 100644
index 000000000000..01ecb638b033
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i32, i1) #0
+declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) #0
+
+declare void @llvm.memmove.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i32, i1) #0
+declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) #0
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) #1
+
+; CHECK-LABEL: @promote_with_memcpy(
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
+; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
+define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %alloca = alloca [17 x i32], align 4
+ %alloca.bc = bitcast [17 x i32]* %alloca to i8*
+ %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
+ %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memcpy.p0i8.p1i8.i32(i8* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %out.bc, i8* %alloca.bc, i32 68, i32 4, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @promote_with_memmove(
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
+; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
+define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %alloca = alloca [17 x i32], align 4
+ %alloca.bc = bitcast [17 x i32]* %alloca to i8*
+ %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
+ %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memmove.p0i8.p1i8.i32(i8* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
+ call void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* %out.bc, i8* %alloca.bc, i32 68, i32 4, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @promote_with_memset(
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
+define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %alloca = alloca [17 x i32], align 4
+ %alloca.bc = bitcast [17 x i32]* %alloca to i8*
+ %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
+ %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memset.p0i8.i32(i8* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: @promote_with_objectsize(
+; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false)
+define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
+ %alloca = alloca [17 x i32], align 4
+ %alloca.bc = bitcast [17 x i32]* %alloca to i8*
+ %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false)
+ store i32 %size, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" "amdgpu-max-waves-per-eu"="3" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
new file mode 100644
index 000000000000..7c5a5182bc8e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -0,0 +1,38 @@
+; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=+promote-alloca < %s | FileCheck -check-prefix=NOOPTS -check-prefix=ALL %s
+; RUN: llc -O1 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=+promote-alloca < %s | FileCheck -check-prefix=OPTS -check-prefix=ALL %s
+
+; ALL-LABEL: {{^}}promote_alloca_i32_array_array:
+; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
+; NOOPTS-NOT: ds_write
+; OPTS: ds_write
+define void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array:
+; ALL: workgroup_group_segment_byte_size = 0{{$}}
+; ALL-NOT: ds_write
+define void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
+attributes #1 = { nounwind optnone noinline "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
new file mode 100644
index 000000000000..46fe307a17fe
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -0,0 +1,130 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; This shows that the estimated amount of LDS used is sensitive to the
+; order of the LDS globals.
+
+; Both of these functions use the same amount of LDS, but the total
+; changes depending on the visit order of first use.
+
+; The one with the suboptimal order, which results in extra padding,
+; exceeds the desired limit.
+
+; The padding estimate heuristic used by the promote alloca pass
+; is mostly determined by the order of the globals.
+
+; Raw usage = 1060 bytes
+; Rounded usage:
+; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
+; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
+
+; At default occupancy guess of 7, 2340 bytes available total.
+
+; 1280 bytes need to be left to promote the alloca
+; optimally packed, this requires
+
+
+@lds0 = internal unnamed_addr addrspace(3) global [32 x <4 x i32>] undef, align 16
+@lds2 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
+@lds1 = internal unnamed_addr addrspace(3) global [73 x i32] undef, align 4
+
+
+; GCN-LABEL: {{^}}promote_alloca_size_order_0:
+; GCN: workgroup_group_segment_byte_size = 2340
+define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx10, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+ %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
+ store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+
+ %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
+ store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+
+ %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}promote_alloca_size_order_1:
+; GCN: workgroup_group_segment_byte_size = 2352
+define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx10, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+ %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+
+ %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
+ store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+
+ %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
+ store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+
+ ret void
+}
+
+@lds3 = internal unnamed_addr addrspace(3) global [13 x i32] undef, align 4
+@lds4 = internal unnamed_addr addrspace(3) global [63 x <4 x i32>] undef, align 16
+
+; The guess from the alignment padding pushes this over the determined
+; size limit, so it isn't promoted
+
+; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
+; GCN: workgroup_group_segment_byte_size = 1060
+define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx10, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+ %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx
+ store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4
+
+ %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16
+
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-shaders.ll b/test/CodeGen/AMDGPU/promote-alloca-shaders.ll
new file mode 100644
index 000000000000..d40fca9f4fd5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-shaders.ll
@@ -0,0 +1,29 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
+
+; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+; IR: alloca [5 x i32]
+; ASM-LABEL: {{^}}promote_alloca_shaders:
+; ASM: ; LDSByteSize: 0 bytes/workgroup (compile time only)
+
+define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %tmp2 = load i32, i32* %arrayidx4, align 4
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %tmp3 = load i32, i32* %arrayidx5
+ %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp3, i32 addrspace(1)* %arrayidx6
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
index 2ee98cc3d2d2..307eca712cc8 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Pointer value is stored in a candidate for LDS usage.
@@ -11,6 +12,18 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
ret void
}
+; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
+; GCN: buffer_store_dword v
+define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ store float 0.0, float *%tmp0
+ store float 0.0, float *%tmp1
+ store volatile float* %tmp0, float* addrspace(1)* %ptr
+ store volatile float* %tmp1, float* addrspace(1)* %ptr
+ ret void
+}
+
; GCN-LABEL: {{^}}stored_lds_pointer_value_gep:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
@@ -36,17 +49,27 @@ bb:
define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
entry:
%tmp0 = alloca [4 x i32]
- %x = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
+ %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
+ %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
+ %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
+ %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
store i32 0, i32* %x
store i32 1, i32* %y
store i32 2, i32* %z
store i32 3, i32* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
+ %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
store i32* %tmp1, i32* addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}stored_fi_to_self:
+; GCN-NOT: ds_
+define void @stored_fi_to_self() #0 {
+ %tmp = alloca i32*
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
+ %bitcast = bitcast i32** %tmp to i32*
+ store volatile i32* %bitcast, i32** %tmp
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
new file mode 100644
index 000000000000..857e547aa03b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; This would normally be fixed by instcombine to be a compare of the GEP
+; indices.
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1
+define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+ %cmp = icmp eq i32* %ptr0, %ptr1
+ %zext = zext i1 %cmp to i32
+ store volatile i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_null_rhs(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, null
+define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %cmp = icmp eq i32* %ptr0, null
+ %zext = zext i1 %cmp to i32
+ store volatile i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_null_lhs(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %cmp = icmp eq i32 addrspace(3)* null, %ptr0
+define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %cmp = icmp eq i32* null, %ptr0
+ %zext = zext i1 %cmp to i32
+ store volatile i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
+; CHECK: %alloca = alloca [16 x i32], align 4
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+; CHECK: %ptr1 = call i32* @get_unknown_pointer()
+; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1
+define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = call i32* @get_unknown_pointer()
+ %cmp = icmp eq i32* %ptr0, %ptr1
+ %zext = zext i1 %cmp to i32
+ store volatile i32 %zext, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32* @get_unknown_pointer() #0
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
new file mode 100644
index 000000000000..a0ad564a6c8f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -0,0 +1,204 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+
+; CHECK-LABEL: @branch_ptr_var_same_alloca(
+; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
+
+; CHECK: if:
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+
+; CHECK: else:
+; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %15, i32 0, i32 %b
+
+; CHECK: endif:
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ br i1 undef, label %if, label %else
+
+if:
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %endif
+
+else:
+ %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %b
+ br label %endif
+
+endif:
+ %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @branch_ptr_phi_alloca_null_0(
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ]
+define void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ br i1 undef, label %if, label %endif
+
+if:
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %endif
+
+endif:
+ %phi.ptr = phi i32* [ %arrayidx0, %if ], [ null, %entry ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @branch_ptr_phi_alloca_null_1(
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ]
+define void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ br i1 undef, label %if, label %endif
+
+if:
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %endif
+
+endif:
+ %phi.ptr = phi i32* [ null, %entry ], [ %arrayidx0, %if ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @one_phi_value(
+; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @one_phi_value.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+
+; CHECK: br label %exit
+; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
+; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+define void @one_phi_value(i32 %a) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %exit
+
+exit:
+ %phi.ptr = phi i32* [ %arrayidx0, %entry ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: @branch_ptr_alloca_unknown_obj(
+; CHECK: %alloca = alloca [64 x i32], align 4
+
+; CHECK: if:
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+
+; CHECK: else:
+; CHECK: %arrayidx1 = call i32* @get_unknown_pointer()
+
+; CHECK: endif:
+; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, i32* %phi.ptr, align 4
+define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ br i1 undef, label %if, label %else
+
+if:
+ %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
+ br label %endif
+
+else:
+ %arrayidx1 = call i32* @get_unknown_pointer()
+ br label %endif
+
+endif:
+ %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+ store i32 0, i32* %phi.ptr, align 4
+ ret void
+}
+
+; kernel void ptr_induction_var_same_alloca(void)
+; {
+; int alloca[64];
+; int i = 0;
+
+; #pragma nounroll
+; for (int* p = &alloca[2], *e = &alloca[48]; p != e; ++p, ++i)
+; {
+; *p = i;
+; }
+; }
+
+; FIXME: This should be promotable. We need to use
+; GetUnderlyingObjects when looking at the icmp user.
+
+; CHECK-LABEL: @ptr_induction_var_same_alloca(
+; CHECK: %alloca = alloca [64 x i32], align 4
+; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+define void @ptr_induction_var_same_alloca() #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
+ %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 48
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %p.08 = phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+ store i32 %i.09, i32* %p.08, align 4
+ %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
+ %inc = add nuw nsw i32 %i.09, 1
+ %cmp = icmp eq i32* %incdec.ptr, %arrayidx1
+ br i1 %cmp, label %for.cond.cleanup, label %for.body
+}
+
+
+; extern int* get_unknown_pointer(void);
+
+; kernel void ptr_induction_var_alloca_unknown(void)
+; {
+; int alloca[64];
+; int i = 0;
+;
+; for (int* p = &alloca[2], *e = get_unknown_pointer(); p != e; ++p, ++i)
+; {
+; *p = i;
+; }
+; }
+
+; CHECK-LABEL: @ptr_induction_var_alloca_unknown(
+; CHECK: %alloca = alloca [64 x i32], align 4
+; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call
+define void @ptr_induction_var_alloca_unknown() #0 {
+entry:
+ %alloca = alloca [64 x i32], align 4
+ %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
+ %call = tail call i32* @get_unknown_pointer() #2
+ %cmp.7 = icmp eq i32* %arrayidx, %call
+ br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+ store i32 %i.09, i32* %p.08, align 4
+ %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1
+ %inc = add nuw nsw i32 %i.09, 1
+ %cmp = icmp eq i32* %incdec.ptr, %call
+ br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32* @get_unknown_pointer() #0
+
+attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
new file mode 100644
index 000000000000..bb13adb19852
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -0,0 +1,133 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
+; CHECK: %alloca = alloca i32
+; CHECK: select i1 undef, i32* undef, i32* %alloca
+define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
+ %alloca = alloca i32, align 4
+ %select = select i1 undef, i32* undef, i32* %alloca
+ store i32 0, i32* %select, align 4
+ ret void
+}
+
+; CHECK-LABEL: @lds_promote_alloca_select_two_derived_pointers(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_pointers.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
+; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+ %select = select i1 undef, i32* %ptr0, i32* %ptr1
+ store i32 0, i32* %select, align 4
+ ret void
+}
+
+; FIXME: This should be promotable but requires knowing that both will be promoted first.
+
+; CHECK-LABEL: @lds_promote_alloca_select_two_allocas(
+; CHECK: %alloca0 = alloca i32, i32 16, align 4
+; CHECK: %alloca1 = alloca i32, i32 16, align 4
+; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
+; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
+define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
+ %alloca0 = alloca i32, i32 16, align 4
+ %alloca1 = alloca i32, i32 16, align 4
+ %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
+ %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
+ %select = select i1 undef, i32* %ptr0, i32* %ptr1
+ store i32 0, i32* %select, align 4
+ ret void
+}
+
+; TODO: Maybe this should be canonicalized to select on the constant and GEP after.
+; CHECK-LABEL: @lds_promote_alloca_select_two_derived_constant_pointers(
+; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_constant_pointers.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 1
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
+; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
+define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3
+ %select = select i1 undef, i32* %ptr0, i32* %ptr1
+ store i32 0, i32* %select, align 4
+ ret void
+}
+
+; CHECK-LABEL: @lds_promoted_alloca_select_input_select(
+; CHECK: getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_select_input_select.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %b
+; CHECK: %ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %c
+; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
+; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
+; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
+define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+ %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
+ %select0 = select i1 undef, i32* %ptr0, i32* %ptr1
+ %select1 = select i1 undef, i32* %select0, i32* %ptr2
+ store i32 0, i32* %select1, align 4
+ ret void
+}
+
+define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
+entry:
+ %alloca = alloca [16 x i32], align 4
+ %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
+ store i32 0, i32* %ptr0
+ br i1 undef, label %bb1, label %bb2
+
+bb1:
+ %ptr2 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %c
+ %select0 = select i1 undef, i32* undef, i32* %ptr2
+ store i32 0, i32* %ptr1
+ br label %bb2
+
+bb2:
+ %phi.ptr = phi i32* [ %ptr0, %entry ], [ %select0, %bb1 ]
+ %select1 = select i1 undef, i32* %phi.ptr, i32* %ptr1
+ store i32 0, i32* %select1, align 4
+ ret void
+}
+
+; CHECK-LABEL: @select_null_rhs(
+; CHECK-NOT: alloca
+; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null
+define void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+bb:
+ %tmp = alloca double, align 8
+ store double 0.000000e+00, double* %tmp, align 8
+ %tmp2 = icmp eq i32 %arg1, 0
+ %tmp3 = select i1 %tmp2, double* %tmp, double* null
+ store double 1.000000e+00, double* %tmp3, align 8
+ %tmp4 = load double, double* %tmp, align 8
+ store double %tmp4, double addrspace(1)* %arg
+ ret void
+}
+
+; CHECK-LABEL: @select_null_lhs(
+; CHECK-NOT: alloca
+; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}}
+define void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+bb:
+ %tmp = alloca double, align 8
+ store double 0.000000e+00, double* %tmp, align 8
+ %tmp2 = icmp eq i32 %arg1, 0
+ %tmp3 = select i1 %tmp2, double* null, double* %tmp
+ store double 1.000000e+00, double* %tmp3, align 8
+ %tmp4 = load double, double* %tmp, align 8
+ store double %tmp4, double addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { norecurse nounwind "amdgpu-max-waves-per-eu"="1" }
+attributes #1 = { norecurse nounwind } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
new file mode 100644
index 000000000000..e331731f90f6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+; This is just an arbitrary intrinsic that shouldn't ever need to be
+; handled; this test just ensures it doesn't crash.
+
+declare void @llvm.stackrestore(i8*) #2
+
+; CHECK-LABEL: @try_promote_unhandled_intrinsic(
+; CHECK: alloca
+; CHECK: call void @llvm.stackrestore(i8* %tmp1)
+define void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
+bb:
+ %tmp = alloca i32, align 4
+ %tmp1 = bitcast i32* %tmp to i8*
+ %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %tmp3 = load i32, i32 addrspace(1)* %tmp2
+ store i32 %tmp3, i32* %tmp
+ call void @llvm.stackrestore(i8* %tmp1)
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
new file mode 100644
index 000000000000..f9de38839bc5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK-LABEL: @volatile_load(
+; CHECK: alloca [5 x i32]
+; CHECK: load volatile i32, i32*
+define void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load volatile i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @volatile_store(
+; CHECK: alloca [5 x i32]
+; CHECK: store volatile i32 %tmp, i32*
+define void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ store volatile i32 %tmp, i32* %arrayidx1
+ ret void
+}
+
+; Has an OK non-volatile user but also a volatile user
+; CHECK-LABEL: @volatile_and_non_volatile_load(
+; CHECK: alloca double
+; CHECK: load double
+; CHECK: load volatile double
+define void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
+bb:
+ %tmp = alloca double, align 8
+ store double 0.000000e+00, double* %tmp, align 8
+
+ %tmp4 = load double, double* %tmp, align 8
+ %tmp5 = load volatile double, double* %tmp, align 8
+
+ store double %tmp4, double addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/pv-packing.ll b/test/CodeGen/AMDGPU/pv-packing.ll
index abeae563ff3f..b01c00daede3 100644
--- a/test/CodeGen/AMDGPU/pv-packing.ll
+++ b/test/CodeGen/AMDGPU/pv-packing.ll
@@ -3,7 +3,7 @@
;CHECK: DOT4 T{{[0-9]\.X}}
;CHECK: MULADD_IEEE * T{{[0-9]\.W}}
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -16,7 +16,7 @@ main_body:
%8 = extractelement <4 x float> %reg3, i32 2
%9 = load <4 x float>, <4 x float> addrspace(8)* null
%10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
- %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
+ %11 = call float @llvm.r600.dot4(<4 x float> %9, <4 x float> %9)
%12 = fmul float %0, %3
%13 = fadd float %12, %6
%14 = fmul float %1, %4
@@ -29,17 +29,16 @@ main_body:
%21 = insertelement <4 x float> %20, float %15, i32 1
%22 = insertelement <4 x float> %21, float %17, i32 2
%23 = insertelement <4 x float> %22, float %19, i32 3
- %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10)
+ %24 = call float @llvm.r600.dot4(<4 x float> %23, <4 x float> %10)
%25 = insertelement <4 x float> undef, float %24, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %25, i32 0, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/pv.ll b/test/CodeGen/AMDGPU/pv.ll
index 9a57dd19765a..d5f9833d6ad0 100644
--- a/test/CodeGen/AMDGPU/pv.ll
+++ b/test/CodeGen/AMDGPU/pv.ll
@@ -3,7 +3,7 @@
; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -101,9 +101,9 @@ main_body:
%93 = insertelement <4 x float> %92, float %5, i32 1
%94 = insertelement <4 x float> %93, float %6, i32 2
%95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
- %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
- %97 = call float @fabs(float %96)
- %98 = call float @llvm.AMDGPU.rsq.f32(float %97)
+ %96 = call float @llvm.r600.dot4(<4 x float> %91, <4 x float> %95)
+ %97 = call float @llvm.fabs.f32(float %96)
+ %98 = call float @llvm.r600.recipsqrt.clamped.f32(float %97)
%99 = fmul float %4, %98
%100 = fmul float %5, %98
%101 = fmul float %6, %98
@@ -119,10 +119,10 @@ main_body:
%111 = extractelement <4 x float> %110, i32 2
%112 = fmul float %111, %10
%113 = fadd float %112, %22
- %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00)
- %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00)
- %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, float 1.000000e+00)
- %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
+ %114 = call float @llvm.AMDGPU.clamp.f32(float %105, float 0.000000e+00, float 1.000000e+00)
+ %115 = call float @llvm.AMDGPU.clamp.f32(float %109, float 0.000000e+00, float 1.000000e+00)
+ %116 = call float @llvm.AMDGPU.clamp.f32(float %113, float 0.000000e+00, float 1.000000e+00)
+ %117 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
%118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
%119 = extractelement <4 x float> %118, i32 0
%120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
@@ -137,7 +137,7 @@ main_body:
%129 = insertelement <4 x float> %128, float %121, i32 1
%130 = insertelement <4 x float> %129, float %123, i32 2
%131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
- %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131)
+ %132 = call float @llvm.r600.dot4(<4 x float> %127, <4 x float> %131)
%133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
%134 = extractelement <4 x float> %133, i32 0
%135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
@@ -152,7 +152,7 @@ main_body:
%144 = insertelement <4 x float> %143, float %136, i32 1
%145 = insertelement <4 x float> %144, float %138, i32 2
%146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
- %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146)
+ %147 = call float @llvm.r600.dot4(<4 x float> %142, <4 x float> %146)
%148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
%149 = extractelement <4 x float> %148, i32 0
%150 = fmul float %149, %8
@@ -202,40 +202,39 @@ main_body:
%194 = fadd float %193, %188
%195 = fmul float %181, %174
%196 = fadd float %195, %190
- %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00)
- %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00)
- %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00)
+ %197 = call float @llvm.AMDGPU.clamp.f32(float %192, float 0.000000e+00, float 1.000000e+00)
+ %198 = call float @llvm.AMDGPU.clamp.f32(float %194, float 0.000000e+00, float 1.000000e+00)
+ %199 = call float @llvm.AMDGPU.clamp.f32(float %196, float 0.000000e+00, float 1.000000e+00)
%200 = insertelement <4 x float> undef, float %75, i32 0
%201 = insertelement <4 x float> %200, float %79, i32 1
%202 = insertelement <4 x float> %201, float %83, i32 2
%203 = insertelement <4 x float> %202, float %87, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %203, i32 60, i32 1)
%204 = insertelement <4 x float> undef, float %197, i32 0
%205 = insertelement <4 x float> %204, float %198, i32 1
%206 = insertelement <4 x float> %205, float %199, i32 2
%207 = insertelement <4 x float> %206, float %117, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %207, i32 0, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
; Function Attrs: readonly
-declare float @fabs(float) #2
+declare float @llvm.fabs.f32(float) #1
; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq.f32(float) #1
+declare float @llvm.r600.recipsqrt.clamped.f32(float) #1
; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #1
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
; Function Attrs: nounwind readonly
-declare float @llvm.pow.f32(float, float) #3
+declare float @llvm.pow.f32(float, float) #2
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #3
-attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
-attributes #2 = { readonly }
-attributes #3 = { nounwind readonly }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/r600-encoding.ll b/test/CodeGen/AMDGPU/r600-encoding.ll
index 3a82ee30a328..e14b30680ba1 100644
--- a/test/CodeGen/AMDGPU/r600-encoding.ll
+++ b/test/CodeGen/AMDGPU/r600-encoding.ll
@@ -10,16 +10,14 @@
; R600: {{^}}test:
; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
entry:
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fmul float %r0, %r1
%vec = insertelement <4 x float> undef, float %r2, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/r600-export-fix.ll b/test/CodeGen/AMDGPU/r600-export-fix.ll
index 7cb80195b368..7d86f9e3b3f1 100644
--- a/test/CodeGen/AMDGPU/r600-export-fix.ll
+++ b/test/CodeGen/AMDGPU/r600-export-fix.ll
@@ -10,7 +10,7 @@
;CHECK: EXPORT T{{[0-9]}}.0000
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -98,45 +98,43 @@ main_body:
%83 = insertelement <4 x float> %82, float %55, i32 1
%84 = insertelement <4 x float> %83, float %59, i32 2
%85 = insertelement <4 x float> %84, float %63, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %85, i32 60, i32 1)
%86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1
%88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2
%89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %89, i32 0, i32 2)
%90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1
%92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2
%93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %93, i32 1, i32 2)
%94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%95 = insertelement <4 x float> %94, float %65, i32 1
%96 = insertelement <4 x float> %95, float %67, i32 2
%97 = insertelement <4 x float> %96, float %69, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %97, i32 2, i32 2)
%98 = insertelement <4 x float> undef, float %77, i32 0
%99 = insertelement <4 x float> %98, float %79, i32 1
%100 = insertelement <4 x float> %99, float %81, i32 2
%101 = insertelement <4 x float> %100, float %71, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %101, i32 3, i32 2)
%102 = insertelement <4 x float> undef, float %73, i32 0
%103 = insertelement <4 x float> %102, float %75, i32 1
%104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2
%105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %105, i32 4, i32 2)
%106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1
%108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2
%109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %109, i32 5, i32 2)
%110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1
%112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2
%113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %113, i32 6, i32 2)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
index f388f8ffe293..461caf5b5d20 100644
--- a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
+++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
@@ -1,58 +1,58 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman
+; RUN: llc -march=r600 -mcpu=cayman < %s
-define void @main(<4 x float> inreg, <4 x float> inreg) #0 {
+define amdgpu_ps void @main(<4 x float> inreg %arg, <4 x float> inreg %arg1) {
main_body:
- %2 = extractelement <4 x float> %0, i32 0
- %3 = extractelement <4 x float> %0, i32 1
- %4 = extractelement <4 x float> %0, i32 2
- %5 = extractelement <4 x float> %0, i32 3
- %6 = insertelement <4 x float> undef, float %2, i32 0
- %7 = insertelement <4 x float> %6, float %3, i32 1
- %8 = insertelement <4 x float> %7, float %4, i32 2
- %9 = insertelement <4 x float> %8, float %5, i32 3
- %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9)
- %11 = extractelement <4 x float> %10, i32 0
- %12 = extractelement <4 x float> %10, i32 1
- %13 = extractelement <4 x float> %10, i32 2
- %14 = extractelement <4 x float> %10, i32 3
- %15 = call float @fabs(float %13)
- %16 = fdiv float 1.000000e+00, %15
- %17 = fmul float %11, %16
- %18 = fadd float %17, 1.500000e+00
- %19 = fmul float %12, %16
- %20 = fadd float %19, 1.500000e+00
- %21 = insertelement <4 x float> undef, float %20, i32 0
- %22 = insertelement <4 x float> %21, float %18, i32 1
- %23 = insertelement <4 x float> %22, float %14, i32 2
- %24 = insertelement <4 x float> %23, float %5, i32 3
- %25 = extractelement <4 x float> %24, i32 0
- %26 = extractelement <4 x float> %24, i32 1
- %27 = extractelement <4 x float> %24, i32 2
- %28 = extractelement <4 x float> %24, i32 3
- %29 = insertelement <4 x float> undef, float %25, i32 0
- %30 = insertelement <4 x float> %29, float %26, i32 1
- %31 = insertelement <4 x float> %30, float %27, i32 2
- %32 = insertelement <4 x float> %31, float %28, i32 3
- %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13)
- %34 = extractelement <4 x float> %33, i32 0
- %35 = insertelement <4 x float> undef, float %34, i32 0
- %36 = insertelement <4 x float> %35, float %34, i32 1
- %37 = insertelement <4 x float> %36, float %34, i32 2
- %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0)
+ %tmp = extractelement <4 x float> %arg, i32 0
+ %tmp2 = extractelement <4 x float> %arg, i32 1
+ %tmp3 = extractelement <4 x float> %arg, i32 2
+ %tmp4 = extractelement <4 x float> %arg, i32 3
+ %tmp5 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 1
+ %tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 2
+ %tmp8 = insertelement <4 x float> %tmp7, float %tmp4, i32 3
+ %tmp9 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp8)
+ %tmp10 = extractelement <4 x float> %tmp9, i32 0
+ %tmp11 = extractelement <4 x float> %tmp9, i32 1
+ %tmp12 = extractelement <4 x float> %tmp9, i32 2
+ %tmp13 = extractelement <4 x float> %tmp9, i32 3
+ %tmp14 = call float @fabs(float %tmp12)
+ %tmp15 = fdiv float 1.000000e+00, %tmp14
+ %tmp16 = fmul float %tmp10, %tmp15
+ %tmp17 = fadd float %tmp16, 1.500000e+00
+ %tmp18 = fmul float %tmp11, %tmp15
+ %tmp19 = fadd float %tmp18, 1.500000e+00
+ %tmp20 = insertelement <4 x float> undef, float %tmp19, i32 0
+ %tmp21 = insertelement <4 x float> %tmp20, float %tmp17, i32 1
+ %tmp22 = insertelement <4 x float> %tmp21, float %tmp13, i32 2
+ %tmp23 = insertelement <4 x float> %tmp22, float %tmp4, i32 3
+ %tmp24 = extractelement <4 x float> %tmp23, i32 0
+ %tmp25 = extractelement <4 x float> %tmp23, i32 1
+ %tmp26 = extractelement <4 x float> %tmp23, i32 2
+ %tmp27 = extractelement <4 x float> %tmp23, i32 3
+ %tmp28 = insertelement <4 x float> undef, float %tmp24, i32 0
+ %tmp29 = insertelement <4 x float> %tmp28, float %tmp25, i32 1
+ %tmp30 = insertelement <4 x float> %tmp29, float %tmp26, i32 2
+ %tmp31 = insertelement <4 x float> %tmp30, float %tmp27, i32 3
+ %tmp32 = shufflevector <4 x float> %tmp31, <4 x float> %tmp31, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp33 = call <4 x float> @llvm.r600.texc(<4 x float> %tmp32, i32 0, i32 0, i32 0, i32 16, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp34 = extractelement <4 x float> %tmp33, i32 0
+ %tmp35 = insertelement <4 x float> undef, float %tmp34, i32 0
+ %tmp36 = insertelement <4 x float> %tmp35, float %tmp34, i32 1
+ %tmp37 = insertelement <4 x float> %tmp36, float %tmp34, i32 2
+ %tmp38 = insertelement <4 x float> %tmp37, float 1.000000e+00, i32 3
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp38, i32 0, i32 0)
ret void
}
; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
; Function Attrs: readnone
-declare float @fabs(float) #1
+declare float @fabs(float) #0
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+; Function Attrs: readnone
+declare <4 x float> @llvm.r600.texc(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/r600.private-memory.ll b/test/CodeGen/AMDGPU/r600.private-memory.ll
new file mode 100644
index 000000000000..f406c160cbbe
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.private-memory.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+
+; Make sure we don't overwrite workitem information with private memory
+
+; FUNC-LABEL: {{^}}work_item_info:
+; R600-NOT: MOV T0.X
+; Additional check in case the move ends up in the last slot
+; R600-NOT: MOV * T0.X
+
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32, i32* %3
+ %5 = call i32 @llvm.r600.read.tidig.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
new file mode 100644
index 000000000000..ff248a89cedc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}tgid_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW T1.X
+define void @tgid_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tgid_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW T1.Y
+define void @tgid_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tgid_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW T1.Z
+define void @tgid_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.X
+define void @tidig_x(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.Y
+define void @tidig_y(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.Z
+define void @tidig_z(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_implicit:
+; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56
+; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56
+define void @test_implicit(i32 addrspace(1)* %out) #1 {
+ %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
+ %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
+ %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4
+ %value = load i32, i32 addrspace(7)* %gep
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_implicit_dyn:
+; 36 prepended implicit bytes + 8(out pointer + in) = 44
+; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44
+define void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
+ %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
+ %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
+ %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in
+ %value = load i32, i32 addrspace(7)* %gep
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+
+
+; DEPRECATED but R600 only
+
+; FUNC-LABEL: {{^}}workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV {{\*? *}}[[VAL]], KC0[2].Z
+define void @workdim (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.workdim() #0
+
+declare i8 addrspace(7)* @llvm.r600.implicitarg.ptr() #0
+
+declare i32 @llvm.r600.read.tgid.x() #0
+declare i32 @llvm.r600.read.tgid.y() #0
+declare i32 @llvm.r600.read.tgid.z() #0
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.tidig.y() #0
+declare i32 @llvm.r600.read.tidig.z() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/r600cfg.ll b/test/CodeGen/AMDGPU/r600cfg.ll
index c7b9d65220f3..2996a1053da5 100644
--- a/test/CodeGen/AMDGPU/r600cfg.ll
+++ b/test/CodeGen/AMDGPU/r600cfg.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=redwood
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -32,27 +32,27 @@ IF41: ; preds = %LOOP
%17 = insertelement <4 x float> %16, float %temp8.0, i32 1
%18 = insertelement <4 x float> %17, float %temp12.0, i32 2
%19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3
- call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1)
+ call void @llvm.r600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1)
%20 = insertelement <4 x float> undef, float %0, i32 0
%21 = insertelement <4 x float> %20, float %temp8.0, i32 1
%22 = insertelement <4 x float> %21, float %temp12.0, i32 2
%23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3
- call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2)
+ call void @llvm.r600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2)
%24 = insertelement <4 x float> undef, float %0, i32 0
%25 = insertelement <4 x float> %24, float %temp8.0, i32 1
%26 = insertelement <4 x float> %25, float %temp12.0, i32 2
%27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3
- call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4)
+ call void @llvm.r600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4)
%28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
%29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1
%30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2
%31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %31, i32 60, i32 1)
%32 = insertelement <4 x float> undef, float %0, i32 0
%33 = insertelement <4 x float> %32, float %temp8.0, i32 1
%34 = insertelement <4 x float> %33, float %temp12.0, i32 2
%35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %35, i32 0, i32 2)
ret void
ENDIF40: ; preds = %LOOP
@@ -112,8 +112,6 @@ ENDIF48: ; preds = %LOOP47
br label %LOOP47
}
-declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32)
+declare void @llvm.r600.store.stream.output(<4 x float>, i32, i32, i32)
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll
new file mode 100644
index 000000000000..b1d422062543
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FIXME: Evergreen only ever does unsafe fp math.
+; FUNC-LABEL: {{^}}rcp_pat_f32:
+; EG: RECIP_IEEE
+define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
new file mode 100644
index 000000000000..a5581d73cb25
--- /dev/null
+++ b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid register "flat_scratch_lo" for subtarget.
+
+declare i32 @llvm.read_register.i32(metadata) #0
+
+define void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
+ store volatile i32 0, i32 addrspace(3)* undef
+ %m0 = call i32 @llvm.read_register.i32(metadata !0)
+ store i32 %m0, i32 addrspace(1)* %out
+ ret void
+}
+
+!0 = !{!"flat_scratch_lo"}
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
new file mode 100644
index 000000000000..2617ad7402ff
--- /dev/null
+++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid type for register "exec".
+
+declare i32 @llvm.read_register.i32(metadata) #0
+
+define void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
+ store volatile i32 0, i32 addrspace(3)* undef
+ %m0 = call i32 @llvm.read_register.i32(metadata !0)
+ store i32 %m0, i32 addrspace(1)* %out
+ ret void
+}
+
+!0 = !{!"exec"}
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
new file mode 100644
index 000000000000..dcde8a1894fc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck %s
+
+; CHECK: invalid type for register "m0".
+
+declare i64 @llvm.read_register.i64(metadata) #0
+
+define void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
+ %exec = call i64 @llvm.read_register.i64(metadata !0)
+ store i64 %exec, i64 addrspace(1)* %out
+ ret void
+}
+
+!0 = !{!"m0"}
diff --git a/test/CodeGen/AMDGPU/read_register.ll b/test/CodeGen/AMDGPU/read_register.ll
new file mode 100644
index 000000000000..58a9e34b77f2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/read_register.ll
@@ -0,0 +1,81 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.read_register.i32(metadata) #0
+declare i64 @llvm.read_register.i64(metadata) #0
+
+; CHECK-LABEL: {{^}}test_read_m0:
+; CHECK: s_mov_b32 m0, -1
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], m0
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_m0(i32 addrspace(1)* %out) #0 {
+ store volatile i32 0, i32 addrspace(3)* undef
+ %m0 = call i32 @llvm.read_register.i32(metadata !0)
+ store i32 %m0, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_exec:
+; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo
+; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_read_exec(i64 addrspace(1)* %out) #0 {
+ %exec = call i64 @llvm.read_register.i64(metadata !1)
+ store i64 %exec, i64 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_flat_scratch:
+; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo
+; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
+ %flat_scratch = call i64 @llvm.read_register.i64(metadata !2)
+ store i64 %flat_scratch, i64 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_flat_scratch_lo:
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
+ %flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3)
+ store i32 %flat_scratch_lo, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_flat_scratch_hi:
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
+ %flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4)
+ store i32 %flat_scratch_hi, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_exec_lo:
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
+ %exec_lo = call i32 @llvm.read_register.i32(metadata !5)
+ store i32 %exec_lo, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_read_exec_hi:
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi
+; CHECK: buffer_store_dword [[COPY]]
+define void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
+ %exec_hi = call i32 @llvm.read_register.i32(metadata !6)
+ store i32 %exec_hi, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{!"m0"}
+!1 = !{!"exec"}
+!2 = !{!"flat_scratch"}
+!3 = !{!"flat_scratch_lo"}
+!4 = !{!"flat_scratch_hi"}
+!5 = !{!"exec_lo"}
+!6 = !{!"exec_hi"}
diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll
new file mode 100644
index 000000000000..e6d0efd0ff94
--- /dev/null
+++ b/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+declare i64 @llvm.readcyclecounter() #0
+
+; GCN-LABEL: {{^}}test_readcyclecounter:
+; SI-DAG: s_memtime s{{\[[0-9]+:[0-9]+\]}}
+; VI-DAG: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: s_load_dwordx2
+; GCN: lgkmcnt
+; GCN: buffer_store_dwordx2
+; GCN-NOT: lgkmcnt
+; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}}
+; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2
+define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
+ %cycle0 = call i64 @llvm.readcyclecounter()
+ store volatile i64 %cycle0, i64 addrspace(1)* %out
+
+ %cycle1 = call i64 @llvm.readcyclecounter()
+ store volatile i64 %cycle1, i64 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll
index b4ac47afced7..f9292a788521 100644
--- a/test/CodeGen/AMDGPU/reciprocal.ll
+++ b/test/CodeGen/AMDGPU/reciprocal.ll
@@ -2,14 +2,12 @@
;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_ps void @test(<4 x float> inreg %reg0) {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = fdiv float 1.0, %r0
%vec = insertelement <4 x float> undef, float %r1, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
new file mode 100644
index 000000000000..c902cb9e1dfb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]]
+; GCN: buffer_store_dwordx2
+define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %a = load i64, i64 addrspace(1)* %in, align 4
+ %and = and i64 %a, 1234567
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: buffer_store_dword [[VAL]]
+define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %a = load i64, i64 addrspace(1)* %in, align 4
+ %vec = bitcast i64 %a to <2 x i32>
+ %elt0 = extractelement <2 x i32> %vec, i32 0
+ store i32 %elt0, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
+; GCN: buffer_store_dword [[VAL]]
+define void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+ %a = load i64, i64 addrspace(1)* %in, align 4
+ %vec = bitcast i64 %a to <2 x i32>
+ %elt0 = extractelement <2 x i32> %vec, i32 1
+ store i32 %elt0, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
new file mode 100644
index 000000000000..281e49f804c6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
+; GCN: s_load_dwordx2
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+ %x.bc = bitcast <2 x i32> %x to <4 x i16>
+ store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i32_as_v8i16_align_4:
+; GCN: s_load_dwordx4
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+ %x.bc = bitcast <4 x i32> %x to <8 x i16>
+ store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4:
+; GCN: s_load_dwordx2
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+ %x.bc = bitcast <2 x i32> %x to <4 x i16>
+ store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i32_as_v2i64_align_4:
+; GCN: s_load_dwordx4
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+ %x.bc = bitcast <4 x i32> %x to <2 x i64>
+ store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
+define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
+ %x.bc = bitcast <4 x i16> %x to <2 x i32>
+ store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
new file mode 100644
index 000000000000..6e95f4c7521f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -o /dev/null < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o /dev/null < %s
+
+; The register coalescer introduces a verifier error which later
+; results in a crash during scheduling.
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+define void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
+bb:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp0 = icmp eq i32 %id.x, 0
+ br i1 %cmp0, label %bb3, label %bb4
+
+bb3: ; preds = %bb
+ %tmp = ashr exact i32 undef, 8
+ br label %bb6
+
+bb4: ; preds = %bb6, %bb
+ %tmp5 = phi <2 x i32> [ zeroinitializer, %bb ], [ %tmp13, %bb6 ]
+ br i1 undef, label %bb15, label %bb16
+
+bb6: ; preds = %bb6, %bb3
+ %tmp7 = phi <2 x i32> [ zeroinitializer, %bb3 ], [ %tmp13, %bb6 ]
+ %tmp8 = add nsw i32 0, %arg1
+ %tmp9 = add nsw i32 %tmp8, 0
+ %tmp10 = sext i32 %tmp9 to i64
+ %tmp11 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %arg, i64 %tmp10
+ %tmp12 = load <2 x i32>, <2 x i32> addrspace(1)* %tmp11, align 8
+ %tmp13 = add <2 x i32> %tmp12, %tmp7
+ %tmp14 = icmp slt i32 undef, %arg2
+ br i1 %tmp14, label %bb6, label %bb4
+
+bb15: ; preds = %bb4
+ store <2 x i32> %tmp5, <2 x i32> addrspace(3)* undef, align 8
+ br label %bb16
+
+bb16: ; preds = %bb15, %bb4
+ unreachable
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll
index 4bb315049be4..bff3a9f5d2b0 100644
--- a/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -1,7 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.SI.tid() nounwind readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI-LABEL: {{^}}foo:
; SI: .section .AMDGPU.csdata
@@ -9,7 +10,8 @@ declare i32 @llvm.SI.tid() nounwind readnone
; SI: ; NumSgprs: {{[0-9]+}}
; SI: ; NumVgprs: {{[0-9]+}}
define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
- %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid
%bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid
%outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
new file mode 100644
index 000000000000..47bdfba96530
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc -verify-machineinstrs -o /dev/null %s
+; Check that renameDisconnectedComponents() does not create vregs without a
+; definition on every path (there should at least be IMPLICIT_DEF instructions).
+target triple = "amdgcn--"
+
+define void @func() {
+B0:
+ br i1 undef, label %B1, label %B2
+
+B1:
+ br label %B2
+
+B2:
+ %v0 = phi <4 x float> [ zeroinitializer, %B1 ], [ <float 0.0, float 0.0, float 0.0, float undef>, %B0 ]
+ br i1 undef, label %B20.1, label %B20.2
+
+B20.1:
+ br label %B20.2
+
+B20.2:
+ %v2 = phi <4 x float> [ zeroinitializer, %B20.1 ], [ %v0, %B2 ]
+ br i1 undef, label %B30.1, label %B30.2
+
+B30.1:
+ %sub = fsub <4 x float> %v2, undef
+ br label %B30.2
+
+B30.2:
+ %v3 = phi <4 x float> [ %sub, %B30.1 ], [ %v2, %B20.2 ]
+ %ve0 = extractelement <4 x float> %v3, i32 0
+ store float %ve0, float addrspace(3)* undef, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/test/CodeGen/AMDGPU/rename-independent-subregs.mir
new file mode 100644
index 000000000000..2dd21ca51e46
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rename-independent-subregs.mir
@@ -0,0 +1,30 @@
+# RUN: llc -march=amdgcn -run-pass rename-independent-subregs -o - %s | FileCheck %s
+--- |
+ define void @test0() { ret void }
+...
+---
+# In the test below we have two independent def+use pairs of subregister sub1 which
+# can be moved to new virtual registers. The third def of sub1, however, is used
+# in combination with sub0 and needs to stay with the original vreg.
+# CHECK-LABEL: name: test0
+# CHECK: S_NOP 0, implicit-def undef %0:sub0
+# CHECK: S_NOP 0, implicit-def undef %2:sub1
+# CHECK: S_NOP 0, implicit %2:sub1
+# CHECK: S_NOP 0, implicit-def undef %1:sub1
+# CHECK: S_NOP 0, implicit %1:sub1
+# CHECK: S_NOP 0, implicit-def %0:sub1
+# CHECK: S_NOP 0, implicit %0
+name: test0
+isSSA: true
+registers:
+ - { id: 0, class: sreg_128 }
+body: |
+ bb.0:
+ S_NOP 0, implicit-def undef %0:sub0
+ S_NOP 0, implicit-def %0:sub1
+ S_NOP 0, implicit %0:sub1
+ S_NOP 0, implicit-def %0:sub1
+ S_NOP 0, implicit %0:sub1
+ S_NOP 0, implicit-def %0:sub1
+ S_NOP 0, implicit %0
+...
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index d5e10d0be883..ad8d00c36393 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -16,10 +16,8 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap
}
; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_read2_b64
+; SI: ds_write2_b64
; SI: s_endpgm
define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
%tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll
index 2bd9fd6858fe..915c4383ff49 100644
--- a/test/CodeGen/AMDGPU/ret.ll
+++ b/test/CodeGen/AMDGPU/ret.ll
@@ -1,8 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-attributes #0 = { "ShaderType"="1" }
-
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
; GCN-LABEL: {{^}}vgpr:
@@ -11,7 +9,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
%x = fadd float %3, 1.0
%a = insertvalue {float, float} undef, float %x, 0
@@ -20,15 +18,14 @@ define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32
}
; GCN-LABEL: {{^}}vgpr_literal:
-; GCN: v_mov_b32_e32 v4, v0
+; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
+; GCN: s_waitcnt expcnt(0)
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
; GCN-DAG: v_mov_b32_e32 v3, -1.0
-; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
-; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
}
@@ -45,8 +42,8 @@ define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)*
; GCN: v_mov_b32_e32 v3, v4
; GCN: v_mov_b32_e32 v4, v6
; GCN-NOT: s_endpgm
-attributes #1 = { "ShaderType"="0" "InitialPSInputAddr"="0" }
-define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+attributes #0 = { "InitialPSInputAddr"="0" }
+define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
%i0 = extractelement <2 x i32> %4, i32 0
%i1 = extractelement <2 x i32> %4, i32 1
%i2 = extractelement <2 x i32> %7, i32 0
@@ -71,7 +68,7 @@ define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrsp
; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
; GCN: v_mov_b32_e32 v0, 1.0
; GCN-NOT: s_endpgm
-define float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
ret float 1.0
}
@@ -85,7 +82,7 @@ define float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 in
; GCN-DAG: v_mov_b32_e32 v1, v2
; GCN: v_mov_b32_e32 v2, v3
; GCN-NOT: s_endpgm
-define {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
%f = bitcast <2 x i32> %8 to <2 x float>
%s = insertvalue {float, <2 x float>} undef, float %14, 0
%s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1
@@ -104,8 +101,8 @@ define {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* by
; GCN-DAG: v_mov_b32_e32 v3, v6
; GCN-DAG: v_mov_b32_e32 v4, v8
; GCN-NOT: s_endpgm
-attributes #2 = { "ShaderType"="0" "InitialPSInputAddr"="1" }
-define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
+attributes #1 = { "InitialPSInputAddr"="1" }
+define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
%i0 = extractelement <2 x i32> %4, i32 0
%i1 = extractelement <2 x i32> %4, i32 1
%i2 = extractelement <2 x i32> %7, i32 0
@@ -134,8 +131,8 @@ define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrsp
; GCN: v_mov_b32_e32 v3, v8
; GCN: v_mov_b32_e32 v4, v12
; GCN-NOT: s_endpgm
-attributes #3 = { "ShaderType"="0" "InitialPSInputAddr"="119" }
-define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
+attributes #2 = { "InitialPSInputAddr"="119" }
+define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
%i0 = extractelement <2 x i32> %4, i32 0
%i1 = extractelement <2 x i32> %4, i32 1
%i2 = extractelement <2 x i32> %7, i32 0
@@ -164,8 +161,8 @@ define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addr
; GCN: v_mov_b32_e32 v3, v4
; GCN: v_mov_b32_e32 v4, v8
; GCN-NOT: s_endpgm
-attributes #4 = { "ShaderType"="0" "InitialPSInputAddr"="418" }
-define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #4 {
+attributes #3 = { "InitialPSInputAddr"="418" }
+define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
%i0 = extractelement <2 x i32> %4, i32 0
%i1 = extractelement <2 x i32> %4, i32 1
%i2 = extractelement <2 x i32> %7, i32 0
@@ -187,7 +184,7 @@ define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addr
; GCN: s_add_i32 s0, s3, 2
; GCN: s_mov_b32 s2, s3
; GCN-NOT: s_endpgm
-define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
%x = add i32 %2, 2
%a = insertvalue {i32, i32, i32} undef, i32 %x, 0
%b = insertvalue {i32, i32, i32} %a, i32 %1, 1
@@ -203,7 +200,7 @@ define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32
; GCN-DAG: s_mov_b32 s2, 7
; GCN-DAG: s_mov_b32 s3, 8
; GCN-NOT: s_endpgm
-define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
%x = add i32 %2, 2
ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
}
@@ -218,7 +215,7 @@ define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i
; GCN: s_mov_b32 s2, s3
; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
-define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
%v = fadd float %3, 1.0
%s = add i32 %2, 2
@@ -232,14 +229,14 @@ define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval,
; GCN-LABEL: {{^}}structure_literal:
-; GCN: v_mov_b32_e32 v3, v0
+; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
+; GCN: s_waitcnt expcnt(0)
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: s_mov_b32 s0, 2
; GCN-DAG: s_mov_b32 s1, 3
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
-; GCN-DAG: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
-define {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
}
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
new file mode 100644
index 000000000000..f7380cd96921
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; This should end with a no-op sequence of exec mask manipulations
+; The mask should be back in its original state after the unreachable block executes
+
+; GCN-LABEL: {{^}}main:
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: [[RET_BB]]:
+; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: [[UNREACHABLE_BB]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]]
+; GCN-NEXT: [[FINAL_BB]]:
+; GCN-NEXT: .Lfunc_end0
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
+main_body:
+ %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
+ %p87 = fmul float undef, %p83
+ %p88 = fadd float %p87, undef
+ %p93 = fadd float %p88, undef
+ %p97 = fmul float %p93, undef
+ %p102 = fsub float %p97, undef
+ %p104 = fmul float %p102, undef
+ %p106 = fadd float 0.000000e+00, %p104
+ %p108 = fadd float undef, %p106
+ br i1 undef, label %ENDIF69, label %ELSE
+
+ELSE: ; preds = %main_body
+ %p124 = fmul float %p108, %p108
+ %p125 = fsub float %p124, undef
+ %p126 = fcmp olt float %p125, 0.000000e+00
+ br i1 %p126, label %ENDIF69, label %ELSE41
+
+ELSE41: ; preds = %ELSE
+ unreachable
+
+ENDIF69: ; preds = %ELSE, %main_body
+ ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.fabs.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.floor.f32(float) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/rotl.ll b/test/CodeGen/AMDGPU/rotl.ll
index 6c144cd56ea7..7d2b5538ca33 100644
--- a/test/CodeGen/AMDGPU/rotl.ll
+++ b/test/CodeGen/AMDGPU/rotl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}rotl_i32:
; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll
index b67b800c7374..8192b861b602 100644
--- a/test/CodeGen/AMDGPU/rsq.ll
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.sqrt.f32(float) nounwind readnone
declare double @llvm.sqrt.f64(double) nounwind readnone
@@ -56,15 +56,15 @@ define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind
; SI: s_endpgm
define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
- %a = load float, float addrspace(1)* %gep.0
- %b = load float, float addrspace(1)* %gep.1
- %c = load float, float addrspace(1)* %gep.2
+ %a = load volatile float, float addrspace(1)* %gep.0
+ %b = load volatile float, float addrspace(1)* %gep.1
+ %c = load volatile float, float addrspace(1)* %gep.2
%x = call float @llvm.sqrt.f32(float %a)
%y = fmul float %x, %b
diff --git a/test/CodeGen/AMDGPU/runtime-metadata.ll b/test/CodeGen/AMDGPU/runtime-metadata.ll
new file mode 100644
index 000000000000..052ad5b9c15b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/runtime-metadata.ll
@@ -0,0 +1,848 @@
+; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck %s
+
+%struct.A = type { i8, float }
+%opencl.image1d_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+%opencl.queue_t = type opaque
+%opencl.pipe_t = type opaque
+%struct.B = type { i32 addrspace(1)*}
+%opencl.clk_event_t = type opaque
+
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 256
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .short 200
+
+; CHECK-LABEL:{{^}}test_char:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "test_char"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .ascii "char"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_char(i8 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_ushort2:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 12
+; CHECK-NEXT: .ascii "test_ushort2"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 7
+; CHECK-NEXT: .ascii "ushort2"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_ushort2(<2 x i16> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_int3:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "test_int3"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .ascii "int3"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_int3(<3 x i32> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_ulong4:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 11
+; CHECK-NEXT: .ascii "test_ulong4"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 32
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 32
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 6
+; CHECK-NEXT: .ascii "ulong4"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 10
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_ulong4(<4 x i64> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_half8:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 10
+; CHECK-NEXT: .ascii "test_half8"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "half8"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 5
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_half8(<8 x half> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_float16:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 12
+; CHECK-NEXT: .ascii "test_float16"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 64
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 64
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 7
+; CHECK-NEXT: .ascii "float16"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_float16(<16 x float> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_double16:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 13
+; CHECK-NEXT: .ascii "test_double16"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 128
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 128
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .ascii "double16"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 11
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_double16(<16 x double> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_pointer:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 12
+; CHECK-NEXT: .ascii "test_pointer"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_image:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 10
+; CHECK-NEXT: .ascii "test_image"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "image2d_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17 !kernel_arg_base_type !17 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_sampler:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 12
+; CHECK-NEXT: .ascii "test_sampler"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "sampler_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_sampler(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_queue:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 10
+; CHECK-NEXT: .ascii "test_queue"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 7
+; CHECK-NEXT: .ascii "queue_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_struct:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 11
+; CHECK-NEXT: .ascii "test_struct"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .ascii "struct A"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_struct(%struct.A* byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 !kernel_arg_base_type !20 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_i128:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "test_i128"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .ascii "i128"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_i128(i128 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_multi_arg:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 14
+; CHECK-NEXT: .ascii "test_multi_arg"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .ascii "int"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 6
+; CHECK-NEXT: .ascii "short2"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 3
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "char3"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24 !kernel_arg_base_type !24 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_addr_space:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 15
+; CHECK-NEXT: .ascii "test_addr_space"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g, i32 addrspace(2)* %c, i32 addrspace(3)* %l) !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_type_qual:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 14
+; CHECK-NEXT: .ascii "test_type_qual"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 19
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 17
+; CHECK-NEXT: .byte 18
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "int *"
+; CHECK-NEXT: .byte 20
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a, i32 addrspace(1)* %b, %opencl.pipe_t addrspace(1)* %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !70 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_access_qual:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .ascii "test_access_qual"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "image1d_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "image2d_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 9
+; CHECK-NEXT: .ascii "image3d_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro, %opencl.image2d_t addrspace(1)* %wo, %opencl.image3d_t addrspace(1)* %rw) !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62 !kernel_arg_base_type !62 !kernel_arg_type_qual !25 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_reqd_wgs_vec_type_hint:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 27
+; CHECK-NEXT: .ascii "test_reqd_wgs_vec_type_hint"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .ascii "int"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 21
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 23
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .ascii "int"
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5 !reqd_work_group_size !6 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_wgs_hint_vec_type_hint:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 27
+; CHECK-NEXT: .ascii "test_wgs_hint_vec_type_hint"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .ascii "int"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 22
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .long 32
+; CHECK-NEXT: .byte 23
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .ascii "uint4"
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7 !work_group_size_hint !8 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_arg_ptr_to_ptr:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 19
+; CHECK-NEXT: .ascii "test_arg_ptr_to_ptr"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 6
+; CHECK-NEXT: .ascii "int **"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 * addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80 !kernel_arg_base_type !80 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_arg_struct_contains_ptr:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 28
+; CHECK-NEXT: .ascii "test_arg_struct_contains_ptr"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .ascii "struct B"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B * byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82 !kernel_arg_base_type !82 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_arg_vector_of_ptr:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 22
+; CHECK-NEXT: .ascii "test_arg_vector_of_ptr"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 16
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 47
+; CHECK-NEXT: .ascii "global int* __attribute__((ext_vector_type(2)))"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83 !kernel_arg_base_type !83 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK-LABEL:{{^}}test_arg_unknown_builtin_type:
+; CHECK: .section .AMDGPU.runtime_metadata
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .long 29
+; CHECK-NEXT: .ascii "test_arg_unknown_builtin_type"
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .long 8
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .long 11
+; CHECK-NEXT: .ascii "clk_event_t"
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 5
+
+define amdgpu_kernel void @test_arg_unknown_builtin_type(%opencl.clk_event_t addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84 !kernel_arg_base_type !84 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"int"}
+!4 = !{!""}
+!5 = !{i32 undef, i32 1}
+!6 = !{i32 1, i32 2, i32 4}
+!7 = !{<4 x i32> undef, i32 0}
+!8 = !{i32 8, i32 16, i32 32}
+!9 = !{!"char"}
+!10 = !{!"ushort2"}
+!11 = !{!"int3"}
+!12 = !{!"ulong4"}
+!13 = !{!"half8"}
+!14 = !{!"float16"}
+!15 = !{!"double16"}
+!16 = !{!"int *"}
+!17 = !{!"image2d_t"}
+!18 = !{!"sampler_t"}
+!19 = !{!"queue_t"}
+!20 = !{!"struct A"}
+!21 = !{!"i128"}
+!22 = !{i32 0, i32 0, i32 0}
+!23 = !{!"none", !"none", !"none"}
+!24 = !{!"int", !"short2", !"char3"}
+!25 = !{!"", !"", !""}
+!50 = !{i32 1, i32 2, i32 3}
+!51 = !{!"int *", !"int *", !"int *"}
+!60 = !{i32 1, i32 1, i32 1}
+!61 = !{!"read_only", !"write_only", !"read_write"}
+!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"}
+!70 = !{!"volatile", !"const restrict", !"pipe"}
+!80 = !{!"int **"}
+!81 = !{i32 1}
+!82 = !{!"struct B"}
+!83 = !{!"global int* __attribute__((ext_vector_type(2)))"}
+!84 = !{!"clk_event_t"}
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
diff --git a/test/CodeGen/AMDGPU/rv7x0_count3.ll b/test/CodeGen/AMDGPU/rv7x0_count3.ll
index c3fd923e4593..50df64bf5471 100644
--- a/test/CodeGen/AMDGPU/rv7x0_count3.ll
+++ b/test/CodeGen/AMDGPU/rv7x0_count3.ll
@@ -1,41 +1,52 @@
; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s
; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
-
-define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
- %1 = extractelement <4 x float> %reg1, i32 0
- %2 = extractelement <4 x float> %reg1, i32 1
- %3 = extractelement <4 x float> %reg1, i32 2
- %4 = extractelement <4 x float> %reg1, i32 3
- %5 = insertelement <4 x float> undef, float %1, i32 0
- %6 = insertelement <4 x float> %5, float %2, i32 1
- %7 = insertelement <4 x float> %6, float %3, i32 2
- %8 = insertelement <4 x float> %7, float %4, i32 3
- %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
- %10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1)
- %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1)
- %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1)
- %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1)
- %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1)
- %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1)
- %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1)
- %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1)
- %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1)
- %19 = fadd <4 x float> %9, %10
- %20 = fadd <4 x float> %19, %11
- %21 = fadd <4 x float> %20, %12
- %22 = fadd <4 x float> %21, %13
- %23 = fadd <4 x float> %22, %14
- %24 = fadd <4 x float> %23, %15
- %25 = fadd <4 x float> %24, %16
- %26 = fadd <4 x float> %25, %17
- %27 = fadd <4 x float> %26, %18
- call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2)
- ret void
+define amdgpu_vs void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
+bb:
+ %tmp = extractelement <4 x float> %reg1, i32 0
+ %tmp1 = extractelement <4 x float> %reg1, i32 1
+ %tmp2 = extractelement <4 x float> %reg1, i32 2
+ %tmp3 = extractelement <4 x float> %reg1, i32 3
+ %tmp4 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 1
+ %tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 2
+ %tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 3
+ %tmp8 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp9 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp10 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp11 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp10, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp12 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp13 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp12, i32 0, i32 0, i32 0, i32 2, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp14 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp15 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp14, i32 0, i32 0, i32 0, i32 3, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp16 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp17 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp16, i32 0, i32 0, i32 0, i32 4, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp18 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp19 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp18, i32 0, i32 0, i32 0, i32 5, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp20 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp21 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp20, i32 0, i32 0, i32 0, i32 6, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp22 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp23 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp22, i32 0, i32 0, i32 0, i32 7, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp24 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp25 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp24, i32 0, i32 0, i32 0, i32 8, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp26 = shufflevector <4 x float> %tmp7, <4 x float> %tmp7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp27 = call <4 x float> @llvm.r600.tex(<4 x float> %tmp26, i32 0, i32 0, i32 0, i32 9, i32 0, i32 1, i32 1, i32 1, i32 1)
+ %tmp28 = fadd <4 x float> %tmp9, %tmp11
+ %tmp29 = fadd <4 x float> %tmp28, %tmp13
+ %tmp30 = fadd <4 x float> %tmp29, %tmp15
+ %tmp31 = fadd <4 x float> %tmp30, %tmp17
+ %tmp32 = fadd <4 x float> %tmp31, %tmp19
+ %tmp33 = fadd <4 x float> %tmp32, %tmp21
+ %tmp34 = fadd <4 x float> %tmp33, %tmp23
+ %tmp35 = fadd <4 x float> %tmp34, %tmp25
+ %tmp36 = fadd <4 x float> %tmp35, %tmp27
+ call void @llvm.r600.store.swizzle(<4 x float> %tmp36, i32 0, i32 2)
+ ret void
}
-declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-attributes #0 = { "ShaderType"="1" }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/s_addk_i32.ll b/test/CodeGen/AMDGPU/s_addk_i32.ll
new file mode 100644
index 000000000000..987056010e69
--- /dev/null
+++ b/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_addk_i32_k0:
+; SI: s_load_dword [[VAL:s[0-9]+]]
+; SI: s_addk_i32 [[VAL]], 0x41
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VRESULT]]
+; SI: s_endpgm
+define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, 65
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: This should be folded with any number of uses.
+; SI-LABEL: {{^}}s_addk_i32_k0_x2:
+; SI: s_movk_i32 [[K:s[0-9]+]], 0x41
+; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
+; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
+; SI: s_endpgm
+define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
+ %add0 = add i32 %a, 65
+ %add1 = add i32 %b, 65
+ store i32 %add0, i32 addrspace(1)* %out0
+ store i32 %add1, i32 addrspace(1)* %out1
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_i32_k1:
+; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}}
+; SI: s_endpgm
+define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, 32767 ; (1 << 15) - 1
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_i32_k2:
+; SI: s_addk_i32 {{s[0-9]+}}, 0xffef{{$}}
+; SI: s_endpgm
+define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, -17
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_v2i32_k0:
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
+; SI: s_endpgm
+define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
+ %add = add <2 x i32> %b, <i32 65, i32 66>
+ store <2 x i32> %add, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_v4i32_k0:
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44
+; SI: s_endpgm
+define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
+ %add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68>
+ store <4 x i32> %add, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_addk_v8i32_k0:
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x45
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x46
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47
+; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48
+; SI: s_endpgm
+define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
+ %add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72>
+ store <8 x i32> %add, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}no_s_addk_i32_k0:
+; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}}
+; SI: s_endpgm
+define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+ %add = add i32 %b, 32768 ; 1 << 15
+ store i32 %add, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/s_mulk_i32.ll b/test/CodeGen/AMDGPU/s_mulk_i32.ll
new file mode 100644
index 000000000000..33d7eeacdb83
--- /dev/null
+++ b/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_mulk_i32_k0:
+; SI: s_load_dword [[VAL:s[0-9]+]]
+; SI: s_mulk_i32 [[VAL]], 0x41
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VRESULT]]
+; SI: s_endpgm
+define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+ %mul = mul i32 %b, 65
+ store i32 %mul, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_mulk_i32_k1:
+; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}}
+; SI: s_endpgm
+define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+ %mul = mul i32 %b, 32767 ; (1 << 15) - 1
+ store i32 %mul, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_mulk_i32_k2:
+; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}}
+; SI: s_endpgm
+define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+ %mul = mul i32 %b, -17
+ store i32 %mul, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}no_s_mulk_i32_k0:
+; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}
+; SI: s_endpgm
+define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+ %mul = mul i32 %b, 32769 ; (1 << 15) + 1
+ store i32 %mul, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index 551f34339a12..52f3cceac2a0 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -2,8 +2,8 @@
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
-declare i32 @llvm.r600.read.tidig.x() #0
-declare i32 @llvm.r600.read.tidig.y() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
; In this test both the pointer and the offset operands to the
; BUFFER_LOAD instructions end up being stored in vgprs. This
@@ -26,8 +26,8 @@ declare i32 @llvm.r600.read.tidig.y() #0
define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x()
- %tmp1 = call i32 @llvm.r600.read.tidig.y()
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
%tmp2 = sext i32 %tmp to i64
%tmp3 = sext i32 %tmp1 to i64
br label %loop
@@ -51,12 +51,20 @@ done: ; preds = %loop
}
; Test moving an SMRD instruction to the VALU
+; FIXME: The movs could be moved before the nop to reduce the nop count
; GCN-LABEL: {{^}}smrd_valu:
-; FIXME: We should be using flat load for HSA.
-; GCN: buffer_load_dword [[OUT:v[0-9]+]]
-; GCN-NOHSA: buffer_store_dword [[OUT]]
-; GCN-HSA: flat_store_dword [[OUT]]
+; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
+; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_nop 3
+; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
+; SI: s_mov_b32
+
+; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
+; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[V_OUT]]
+; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
entry:
%tmp = icmp ne i32 %a, 0
@@ -87,7 +95,7 @@ endif: ; preds = %else, %if
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
%tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4
%tmp3 = load i32, i32 addrspace(2)* %tmp2
@@ -104,10 +112,10 @@ entry:
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-; GCN-HSA: flat_store_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr i32, i32 addrspace(2)* %tmp2, i32 5000
%tmp4 = load i32, i32 addrspace(2)* %tmp3
@@ -127,7 +135,7 @@ entry:
; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr i64, i64 addrspace(2)* %tmp2, i32 5000
%tmp4 = load i64, i64 addrspace(2)* %tmp3
@@ -149,7 +157,7 @@ entry:
; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %tmp2, i32 1234
%tmp4 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp3
@@ -185,7 +193,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %tmp2, i32 1234
%tmp4 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp3
@@ -196,22 +204,14 @@ entry:
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
+; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
+; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@@ -234,7 +234,7 @@ entry:
; GCN: s_endpgm
define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp
%tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %tmp2, i32 1234
%tmp4 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp3
@@ -248,10 +248,10 @@ entry:
; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
; GCN-NOHSA: buffer_store_dword [[ADD]]
-; GCN-HSA: flat_store_dword [[ADD]]
+; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
%tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4
%tmp3 = load i32, i32 addrspace(2)* %tmp2
@@ -265,7 +265,7 @@ entry:
; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
%tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 255
%tmp3 = load i32, i32 addrspace(2)* %tmp2
@@ -279,7 +279,7 @@ entry:
; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = add i32 %tmp, 4
%tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 256
%tmp3 = load i32, i32 addrspace(2)* %tmp2
@@ -294,7 +294,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
- %tmp0 = tail call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
%tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
%tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
@@ -317,7 +317,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
- %tmp0 = tail call i32 @llvm.r600.read.tidig.x()
+ %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
%tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
%tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
@@ -354,7 +354,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
- %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
%tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
%tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
@@ -389,7 +389,7 @@ entry:
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
- %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
%tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
%tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
@@ -431,5 +431,33 @@ entry:
ret void
}
+; Make sure we legalize vopc operands after moving an sopc to the VALU.
+
+; GCN-LABEL: {{^}}sopc_vopc_legalize_bug:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
+; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN-NOHSA: buffer_store_dword [[ONE]]
+; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
+; GCN: {{^}}[[EXIT]]:
+; GCN: s_endpgm
+define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+bb3:
+ %tmp0 = bitcast i32 %cond to float
+ %tmp1 = fadd float %tmp0, 2.500000e-01
+ %tmp2 = bitcast float %tmp1 to i32
+ %tmp3 = icmp ult i32 %tmp2, %cond
+ br i1 %tmp3, label %bb6, label %bb7
+
+bb6:
+ store i32 1, i32 addrspace(1)* %out
+ br label %bb7
+
+bb7: ; preds = %bb3, %bb6
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 0970e5d30630..55b392a32729 100644
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,15 +1,14 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
+; XXX - Why the packing?
; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: s_endpgm
+; SI: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
+; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
+; SI: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
+; SI: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
+; SI: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%tmp1 = load i32, i32 addrspace(1)* %in, align 4
%bc = bitcast i32 %tmp1 to <2 x i16>
@@ -21,11 +20,7 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(
; FUNC-LABEL: {{^}}scalar_to_vector_v2f32:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: s_endpgm
+; SI: buffer_store_dwordx2
define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%tmp1 = load float, float addrspace(1)* %in, align 4
%bc = bitcast float %tmp1 to <2 x i16>
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
index 11e8f5176f44..e040639a2d94 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -44,15 +44,15 @@ ENDIF: ; preds = %ELSE17, %ELSE, %IF
%temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
%temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
%temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
- %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00)
- %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00)
- %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00)
- %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %27 = call float @llvm.AMDGPU.clamp.f32(float %temp.0, float 0.000000e+00, float 1.000000e+00)
+ %28 = call float @llvm.AMDGPU.clamp.f32(float %temp1.0, float 0.000000e+00, float 1.000000e+00)
+ %29 = call float @llvm.AMDGPU.clamp.f32(float %temp2.0, float 0.000000e+00, float 1.000000e+00)
+ %30 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
%31 = insertelement <4 x float> undef, float %27, i32 0
%32 = insertelement <4 x float> %31, float %28, i32 1
%33 = insertelement <4 x float> %32, float %29, i32 2
%34 = insertelement <4 x float> %33, float %30, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %34, i32 0, i32 0)
ret void
ELSE17: ; preds = %ELSE
@@ -74,9 +74,8 @@ ELSE17: ; preds = %ELSE
br label %ENDIF
}
-declare float @llvm.AMDIL.clamp.(float, float, float) #0
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readnone }
-attributes #1 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
index 759197ca61f7..f907e154f962 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
@@ -43,15 +43,15 @@ LOOP: ; preds = %IF31, %main_body
br i1 %29, label %IF, label %LOOP29
IF: ; preds = %LOOP
- %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
- %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
- %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
- %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %30 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+ %31 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+ %32 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
%34 = insertelement <4 x float> undef, float %30, i32 0
%35 = insertelement <4 x float> %34, float %31, i32 1
%36 = insertelement <4 x float> %35, float %32, i32 2
%37 = insertelement <4 x float> %36, float %33, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %37, i32 0, i32 0)
ret void
LOOP29: ; preds = %LOOP, %ENDIF30
@@ -81,8 +81,8 @@ ENDIF30: ; preds = %LOOP29
br label %LOOP29
}
-declare float @llvm.AMDIL.clamp.(float, float, float) #0
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
index 28cc08abc022..5839785f00d5 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
@@ -30,15 +30,15 @@ LOOP: ; preds = %ENDIF, %main_body
br i1 %16, label %IF, label %ENDIF
IF: ; preds = %LOOP
- %17 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
- %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
- %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
- %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %17 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+ %18 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+ %19 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+ %20 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
%21 = insertelement <4 x float> undef, float %17, i32 0
%22 = insertelement <4 x float> %21, float %18, i32 1
%23 = insertelement <4 x float> %22, float %19, i32 2
%24 = insertelement <4 x float> %23, float %20, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %24, i32 0, i32 0)
ret void
ENDIF: ; preds = %LOOP
@@ -48,8 +48,8 @@ ENDIF: ; preds = %LOOP
br label %LOOP
}
-declare float @llvm.AMDIL.clamp.(float, float, float) #0
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 3f728fd873b3..1bf109dec032 100644
--- a/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -1,21 +1,19 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() #1
-
; FIXME: This currently doesn't do a great job of clustering the
; loads, which end up with extra moves between them. Right now, it
; seems the only thing areLoadsFromSameBasePtr is accomplishing is
; ordering the loads so that the lower address loads come first.
; FUNC-LABEL: {{^}}cluster_global_arg_loads:
-; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
+; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; SI: buffer_store_dword [[REG0]]
; SI: buffer_store_dword [[REG1]]
define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
%load0 = load i32, i32 addrspace(1)* %ptr, align 4
- %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2
%load1 = load i32, i32 addrspace(1)* %gep, align 4
store i32 %load0, i32 addrspace(1)* %out0, align 4
store i32 %load1, i32 addrspace(1)* %out1, align 4
diff --git a/test/CodeGen/AMDGPU/schedule-if-2.ll b/test/CodeGen/AMDGPU/schedule-if-2.ll
index 549465096833..aa67b2e0f7db 100644
--- a/test/CodeGen/AMDGPU/schedule-if-2.ll
+++ b/test/CodeGen/AMDGPU/schedule-if-2.ll
@@ -66,7 +66,7 @@ ENDIF: ; preds = %IF23, %ELSE, %IF
%45 = insertelement <4 x float> %44, float %temp5.0, i32 1
%46 = insertelement <4 x float> %45, float %temp6.0, i32 2
%47 = insertelement <4 x float> %46, float %temp7.0, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %47, i32 0, i32 0)
ret void
IF23: ; preds = %ELSE
@@ -89,6 +89,6 @@ IF23: ; preds = %ELSE
declare float @fabs(float) #0
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readonly }
diff --git a/test/CodeGen/AMDGPU/schedule-if.ll b/test/CodeGen/AMDGPU/schedule-if.ll
index 94c653c8f25b..6637b3897717 100644
--- a/test/CodeGen/AMDGPU/schedule-if.ll
+++ b/test/CodeGen/AMDGPU/schedule-if.ll
@@ -32,7 +32,7 @@ ENDIF: ; preds = %IF13, %ELSE, %main_
%17 = insertelement <4 x float> %16, float %temp1.0, i32 1
%18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2
%19 = insertelement <4 x float> %18, float %temp3.0, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %19, i32 0, i32 0)
ret void
IF13: ; preds = %ELSE
@@ -43,4 +43,4 @@ IF13: ; preds = %ELSE
br label %ENDIF
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index 6b3e0814c380..886d4a1dcb5c 100644
--- a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -1,18 +1,17 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
; FUNC-LABEL: {{^}}cluster_arg_loads:
+; FIXME: Due to changes in the load clustering heuristics, we no longer
+; cluster all argument loads together on SI.
+; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
+; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
+; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
store i32 %x, i32 addrspace(1)* %out0, align 4
store i32 %y, i32 addrspace(1)* %out1, align 4
diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
index e4b16c0a165f..9b490bb3a731 100644
--- a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -1,13 +1,10 @@
-; XFAIL: *
-; REQUIRES: asserts
-; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
-declare void @llvm.AMDGPU.barrier.local() nounwind convergent
+declare void @llvm.amdgcn.s.barrier() nounwind convergent
-
-; SI-LABEL: {{^}}main(
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+; GCN-LABEL: {{^}}main:
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 2
@@ -39,63 +36,63 @@ ENDIF: ; preds = %main_body, %Flow2
%temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
%15 = extractelement <4 x float> %reg1, i32 1
%16 = extractelement <4 x float> %reg1, i32 3
- %17 = load <4 x float>, <4 x float> addrspace(9)* null
+ %17 = load <4 x float>, <4 x float> addrspace(2)* null
%18 = extractelement <4 x float> %17, i32 0
%19 = fmul float %18, %0
- %20 = load <4 x float>, <4 x float> addrspace(9)* null
+ %20 = load <4 x float>, <4 x float> addrspace(2)* null
%21 = extractelement <4 x float> %20, i32 1
%22 = fmul float %21, %0
- %23 = load <4 x float>, <4 x float> addrspace(9)* null
+ %23 = load <4 x float>, <4 x float> addrspace(2)* null
%24 = extractelement <4 x float> %23, i32 2
%25 = fmul float %24, %0
- %26 = load <4 x float>, <4 x float> addrspace(9)* null
+ %26 = load <4 x float>, <4 x float> addrspace(2)* null
%27 = extractelement <4 x float> %26, i32 3
%28 = fmul float %27, %0
- %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %29 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
%30 = extractelement <4 x float> %29, i32 0
%31 = fmul float %30, %15
%32 = fadd float %31, %19
- %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %33 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
%34 = extractelement <4 x float> %33, i32 1
%35 = fmul float %34, %15
%36 = fadd float %35, %22
- %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %37 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
%38 = extractelement <4 x float> %37, i32 2
%39 = fmul float %38, %15
%40 = fadd float %39, %25
- %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %41 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
%42 = extractelement <4 x float> %41, i32 3
%43 = fmul float %42, %15
%44 = fadd float %43, %28
- %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %45 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
%46 = extractelement <4 x float> %45, i32 0
%47 = fmul float %46, %1
%48 = fadd float %47, %32
- %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %49 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
%50 = extractelement <4 x float> %49, i32 1
%51 = fmul float %50, %1
%52 = fadd float %51, %36
- %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %53 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
%54 = extractelement <4 x float> %53, i32 2
%55 = fmul float %54, %1
%56 = fadd float %55, %40
- %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %57 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
%58 = extractelement <4 x float> %57, i32 3
%59 = fmul float %58, %1
%60 = fadd float %59, %44
- %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %61 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
%62 = extractelement <4 x float> %61, i32 0
%63 = fmul float %62, %16
%64 = fadd float %63, %48
- %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %65 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
%66 = extractelement <4 x float> %65, i32 1
%67 = fmul float %66, %16
%68 = fadd float %67, %52
- %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %69 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
%70 = extractelement <4 x float> %69, i32 2
%71 = fmul float %70, %16
%72 = fadd float %71, %56
- %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %73 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
%74 = extractelement <4 x float> %73, i32 3
%75 = fmul float %74, %16
%76 = fadd float %75, %60
@@ -103,12 +100,12 @@ ENDIF: ; preds = %main_body, %Flow2
%78 = insertelement <4 x float> %77, float %68, i32 1
%79 = insertelement <4 x float> %78, float %72, i32 2
%80 = insertelement <4 x float> %79, float %76, i32 3
- call void @llvm.AMDGPU.barrier.local()
+ call void @llvm.amdgcn.s.barrier()
%81 = insertelement <4 x float> undef, float %temp.0, i32 0
%82 = insertelement <4 x float> %81, float %temp1.0, i32 1
%83 = insertelement <4 x float> %82, float %temp2.0, i32 2
%84 = insertelement <4 x float> %83, float %temp3.0, i32 3
- call void @llvm.AMDGPU.barrier.local()
+ call void @llvm.amdgcn.s.barrier()
ret void
LOOP: ; preds = %main_body, %Flow
@@ -159,5 +156,3 @@ ENDIF19: ; preds = %ENDIF16
%115 = fadd float %temp4.0, 1.000000e+00
br label %Flow1
}
-
-attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
index 8d980dbf8995..00d4ba66913d 100644
--- a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
+++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
;REQUIRES: asserts
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -85,12 +85,12 @@ ENDIF: ; preds = %ENDIF16, %LOOP, %ma
%72 = insertelement <4 x float> %71, float %62, i32 1
%73 = insertelement <4 x float> %72, float %66, i32 2
%74 = insertelement <4 x float> %73, float %70, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %74, i32 60, i32 1)
%75 = insertelement <4 x float> undef, float %temp.0, i32 0
%76 = insertelement <4 x float> %75, float %temp1.0, i32 1
%77 = insertelement <4 x float> %76, float %temp2.0, i32 2
%78 = insertelement <4 x float> %77, float %temp3.0, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %78, i32 0, i32 2)
ret void
LOOP: ; preds = %main_body, %ENDIF19
@@ -127,6 +127,4 @@ ENDIF19: ; preds = %ENDIF16
br label %LOOP
}
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
index d43de4766057..a66f074123c1 100644
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,7 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
+; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; When a frame index offset is more than 12-bits, make sure we don't store
; it in mubuf's offset field.
@@ -49,8 +47,8 @@ done:
}
-; GCN-LABEL: {{^}}legal_offset_fi_offset
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; GCN-LABEL: {{^}}legal_offset_fi_offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
; GCN: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8000
; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
@@ -87,11 +85,8 @@ done:
ret void
}
-; GCN-LABEL: @neg_vaddr_offset
-; We can't prove %offset is positive, so we must do the computation with the
-; immediate in an add instruction instead of folding offset and the immediate into
-; the store instruction.
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN-LABEL: {{^}}neg_vaddr_offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
define void @neg_vaddr_offset(i32 %offset) {
entry:
%array = alloca [8192 x i32]
@@ -101,9 +96,8 @@ entry:
ret void
}
-; GCN-LABEL: @pos_vaddr_offse
-; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
-; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
+; GCN-LABEL: {{^}}pos_vaddr_offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
entry:
%array = alloca [8192 x i32]
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index de645353a401..29d893414c07 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -34,8 +34,8 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; working.
; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
; SI: v_add_i32
; SI: v_lshrrev_b32
@@ -82,6 +82,60 @@ define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
ret void
}
+; FUNC-LABEL: {{^}}v_sdiv_i8:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = sdiv i8 %num, %den
+ %result.ext = sext i8 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i23:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+ %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
+ %num = load i23, i23 addrspace(1) * %in
+ %den = load i23, i23 addrspace(1) * %den_ptr
+ %result = sdiv i23 %num, %den
+ %result.ext = sext i23 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i24:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+ %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
+ %num = load i24, i24 addrspace(1) * %in
+ %den = load i24, i24 addrspace(1) * %den_ptr
+ %result = sdiv i24 %num, %den
+ %result.ext = sext i24 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i25:
+; SI-NOT: v_rcp_f32
+define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+ %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
+ %num = load i25, i25 addrspace(1) * %in
+ %den = load i25, i25 addrspace(1) * %den_ptr
+ %result = sdiv i25 %num, %den
+ %result.ext = sext i25 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
; Tests for 64-bit divide bypass.
; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %result = sdiv i64 %a, %b
diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll
index ad5df39f5505..ccabd3c2a969 100644
--- a/test/CodeGen/AMDGPU/sdivrem24.ll
+++ b/test/CodeGen/AMDGPU/sdivrem24.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -181,13 +181,13 @@ define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
-; FUNC-LABEL: {{^}}srem25_i32:
+; FUNC-LABEL: {{^}}no_srem25_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -200,40 +200,138 @@ define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
-; FUNC-LABEL: {{^}}test_no_srem24_i32_1:
+; FUNC-LABEL: {{^}}no_sdiv25_i24_i25_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
%num.i24.0 = shl i32 %num, 8
- %den.i24.0 = shl i32 %den, 7
+ %den.i25.0 = shl i32 %den, 7
%num.i24 = ashr i32 %num.i24.0, 8
- %den.i24 = ashr i32 %den.i24.0, 7
- %result = srem i32 %num.i24, %den.i24
+ %den.i25 = ashr i32 %den.i25.0, 7
+ %result = sdiv i32 %num.i24, %den.i25
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}test_no_srem24_i32_2:
+; FUNC-LABEL: {{^}}no_sdiv25_i25_i24_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
- %num.i24.0 = shl i32 %num, 7
+ %num.i25.0 = shl i32 %num, 7
%den.i24.0 = shl i32 %den, 8
- %num.i24 = ashr i32 %num.i24.0, 7
+ %num.i25 = ashr i32 %num.i25.0, 7
%den.i24 = ashr i32 %den.i24.0, 8
- %result = srem i32 %num.i24, %den.i24
+ %result = sdiv i32 %num.i25, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_srem25_i24_i25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i25.0 = shl i32 %den, 7
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i25 = ashr i32 %den.i25.0, 7
+ %result = srem i32 %num.i24, %den.i25
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_srem25_i25_i24_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i25.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 8
+ %num.i25 = ashr i32 %num.i25.0, 7
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = srem i32 %num.i25, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i24_i11_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i11.0 = shl i32 %den, 21
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i11 = ashr i32 %den.i11.0, 21
+ %result = srem i32 %num.i24, %den.i11
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i11_i24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i11.0 = shl i32 %num, 21
+ %den.i24.0 = shl i32 %den, 8
+ %num.i11 = ashr i32 %num.i11.0, 21
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = srem i32 %num.i11, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i17_i12_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 17
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i17.0 = shl i32 %num, 15
+ %den.i12.0 = shl i32 %den, 20
+ %num.i17 = ashr i32 %num.i17.0, 15
+ %den.i12 = ashr i32 %den.i12.0, 20
+ %result = sdiv i32 %num.i17, %den.i12
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll
index a9b2b7f9df55..a7ce948acd4f 100644
--- a/test/CodeGen/AMDGPU/sdivrem64.ll
+++ b/test/CodeGen/AMDGPU/sdivrem64.ll
@@ -1,8 +1,8 @@
-;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-;FUNC-LABEL: {{^}}test_sdiv:
+;FUNC-LABEL: {{^}}s_test_sdiv:
;EG: RECIP_UINT
;EG: LSHL {{.*}}, 1,
;EG: BFE_UINT
@@ -36,47 +36,47 @@
;EG: BFE_UINT
;EG: BFE_UINT
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN: v_bfe_u32
-;GCN-NOT: v_mad_f32
-;SI-NOT: v_lshr_b64
-;VI-NOT: v_lshrrev_b64
-;GCN: s_endpgm
-define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN: s_bfe_u32
+; GCN-NOT: v_mad_f32
+; SI-NOT: v_lshr_b64
+; VI-NOT: v_lshrrev_b64
+; GCN: s_endpgm
+define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%result = sdiv i64 %x, %y
store i64 %result, i64 addrspace(1)* %out
ret void
}
-;FUNC-LABEL: {{^}}test_srem:
+;FUNC-LABEL: {{^}}s_test_srem:
;EG: RECIP_UINT
;EG: BFE_UINT
;EG: BFE_UINT
@@ -144,7 +144,7 @@ define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;SI-NOT: v_lshr_b64
;VI-NOT: v_lshrrev_b64
;GCN: s_endpgm
-define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
%result = urem i64 %x, %y
store i64 %result, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll
index 6735394e93a9..2406831b94c5 100644
--- a/test/CodeGen/AMDGPU/select-i1.ll
+++ b/test/CodeGen/AMDGPU/select-i1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI
@@ -13,3 +13,15 @@ define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind
ret void
}
+; FUNC-LABEL: {{^}}s_minmax_i1:
+; SI-DAG: buffer_load_ubyte [[COND:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; SI-DAG: buffer_load_ubyte [[A:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:45
+; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[COND]]
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+define void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
+ %cmp = icmp slt i1 %cond, false
+ %sel = select i1 %cmp, i1 %a, i1 %b
+ store i1 %sel, i1 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 94758ad84c18..faf8d8a12c25 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; Test expansion of scalar selects on vectors.
@@ -29,30 +29,50 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16>
ret void
}
-; FUNC-LABEL: {{^}}select_v2i32:
+; FIXME: Expansion with bitwise operations may be better if doing a
+; vector select with SGPR inputs.
+
+; FUNC-LABEL: {{^}}s_select_v2i32:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
-define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
+define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
ret void
}
-; FUNC-LABEL: {{^}}select_v4i32:
+; FUNC-LABEL: {{^}}s_select_v4i32:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx4
-define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
+define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
ret void
}
+; FUNC-LABEL: {{^}}v_select_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_cmp_gt_u32_e64 vcc, 32, s{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: buffer_store_dwordx4
+define void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
+bb:
+ %tmp2 = icmp ult i32 %cond, 32
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
+ store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
; FUNC-LABEL: {{^}}select_v8i32:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
@@ -69,24 +89,61 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32>
ret void
}
-; FUNC-LABEL: {{^}}select_v2f32:
+; FUNC-LABEL: {{^}}s_select_v2f32:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
+
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
+; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
+
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
-define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
+define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <2 x float> %a, <2 x float> %b
store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
ret void
}
-; FUNC-LABEL: {{^}}select_v4f32:
+; FUNC-LABEL: {{^}}s_select_v4f32:
+; SI: s_load_dwordx4
+; SI: s_load_dwordx4
+; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
+
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+
; SI: buffer_store_dwordx4
-define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
+define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, <4 x float> %a, <4 x float> %b
store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
ret void
}
+; FUNC-LABEL: {{^}}v_select_v4f32:
+; SI: buffer_load_dwordx4
+; SI: v_cmp_gt_u32_e64 vcc, 32, s{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; SI: buffer_store_dwordx4
+define void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
+bb:
+ %tmp2 = icmp ult i32 %cond, 32
+ %val = load <4 x float>, <4 x float> addrspace(1)* %in
+ %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
+ store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
; FUNC-LABEL: {{^}}select_v8f32:
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
@@ -154,3 +211,9 @@ define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x
store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
ret void
}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/selected-stack-object.ll b/test/CodeGen/AMDGPU/selected-stack-object.ll
new file mode 100644
index 000000000000..37f2747d9815
--- /dev/null
+++ b/test/CodeGen/AMDGPU/selected-stack-object.ll
@@ -0,0 +1,15 @@
+; "Assertion failure" should be caught with both XFAIL:* and +Asserts.
+; XFAIL: *
+; REQUIRES: asserts
+
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+
+; See also local-stack-slot-bug.ll
+; This fails because a stack object is created during instruction selection.
+
+; CHECK-LABEL: {{^}}main:
+define amdgpu_ps float @main(i32 %idx) {
+main_body:
+ %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
+ ret float %v1
+}
diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll
index 63d74820f961..d2c57a810c2c 100644
--- a/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -36,34 +36,30 @@ define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
ret void
}
-; This really folds away to false
-; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
+; FUNC-LABEL: {{^}}sext_bool_icmp_eq_neg1:
+; GCN-NOT: v_cmp
; GCN: v_cmp_eq_i32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
-; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
-; GCN-NEXT: buffer_store_byte [[TMP]]
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp eq i32 %a, %b
%ext = sext i1 %icmp0 to i32
- %icmp1 = icmp eq i32 %ext, 1
+ %icmp1 = icmp eq i32 %ext, -1
store i1 %icmp1, i1 addrspace(1)* %out
ret void
}
-; This really folds away to true
-; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
-; GCN: v_cmp_ne_i32_e32 vcc,
-; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
-; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}}
-; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
-; GCN-NEXT: buffer_store_byte [[TMP]]
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_neg1:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
%ext = sext i1 %icmp0 to i32
- %icmp1 = icmp ne i32 %ext, 1
+ %icmp1 = icmp ne i32 %ext, -1
store i1 %icmp1, i1 addrspace(1)* %out
ret void
}
@@ -123,20 +119,28 @@ define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
ret void
}
-; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
-; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}}
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; GCN: buffer_store_byte
-; GCN: s_endpgm
-define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+; Reduces to false:
+; FUNC-LABEL: {{^}}zext_bool_icmp_eq_neg1:
+; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp eq i32 %a, %b
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, -1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; Reduces to true:
+; FUNC-LABEL: {{^}}zext_bool_icmp_ne_neg1:
+; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
- %ext = sext i1 %icmp0 to i32
- %icmp1 = icmp ne i32 %ext, 2
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, -1
store i1 %icmp1, i1 addrspace(1)* %out
ret void
}
@@ -145,10 +149,10 @@ define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff
-; GCN: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]]
-; GCN: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
+; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]]
+; GCN-DAG: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK255]]
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
@@ -162,7 +166,7 @@ define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
; GCN: buffer_load_sbyte [[B:v[0-9]+]]
; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}}
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
%b = load i8, i8 addrspace(1)* %b.ptr
@@ -193,10 +197,10 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff
-; GCN: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]]
-; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
+; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]]
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK]]{{$}}
-; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
@@ -240,3 +244,40 @@ define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
store i1 %icmp1, i1 addrspace(1)* %out
ret void
}
+
+; FIXME: These cases should really be able to fold to true/false in
+; DAGCombiner
+
+; This really folds away to false
+; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_byte [[K]]
+define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp eq i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, 1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[K]]
+define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[K]]
+define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 2
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
index f33a82df5ffb..c89e712e4cb0 100644
--- a/test/CodeGen/AMDGPU/setcc.ll
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -1,5 +1,5 @@
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -375,3 +375,37 @@ define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra,
store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out
ret void
}
+
+; Make sure we don't try to emit i1 setcc ops
+; FUNC-LABEL: setcc-i1
+; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1
+; SI: s_cmp_eq_i32 [[AND]], 0
+define void @setcc-i1(i32 %in) {
+ %and = and i32 %in, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %endif, label %if
+if:
+ unreachable
+endif:
+ ret void
+}
+
+; FUNC-LABEL: setcc-i1-and-xor
+; SI-DAG: v_cmp_le_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
+; SI-DAG: v_cmp_ge_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], 1.0, s{{[0-9]+}}
+; SI: s_and_b64 s[2:3], [[A]], [[B]]
+define void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
+bb0:
+ %tmp5 = fcmp oge float %cond, 0.000000e+00
+ %tmp7 = fcmp ole float %cond, 1.000000e+00
+ %tmp9 = and i1 %tmp5, %tmp7
+ %tmp11 = xor i1 %tmp9, 1
+ br i1 %tmp11, label %bb2, label %bb1
+
+bb1:
+ store i32 0, i32 addrspace(1)* %out
+ br label %bb2
+
+bb2:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll
index 231be7aa3da7..15db03cf906e 100644
--- a/test/CodeGen/AMDGPU/setcc64.ll
+++ b/test/CodeGen/AMDGPU/setcc64.ll
@@ -59,7 +59,7 @@ entry:
; FUNC-LABEL: {{^}}f64_one:
; SI: v_cmp_lg_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp one double %a, %b
@@ -80,7 +80,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ueq:
; SI: v_cmp_nlg_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ueq double %a, %b
@@ -92,7 +92,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ugt:
; SI: v_cmp_nle_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ugt double %a, %b
@@ -103,7 +103,7 @@ entry:
; FUNC-LABEL: {{^}}f64_uge:
; SI: v_cmp_nlt_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp uge double %a, %b
@@ -114,7 +114,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ult:
; SI: v_cmp_nge_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ult double %a, %b
@@ -125,7 +125,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ule:
; SI: v_cmp_ngt_f64_e32 vcc
-; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ule double %a, %b
diff --git a/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
new file mode 100644
index 000000000000..08bdc3aba555
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -0,0 +1,22 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+; XUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+;
+; EG-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+
+; Works with the align 2 removed
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b
+ %x = shl <2 x i32> %c, <i32 6, i32 6>
+ %y = ashr <2 x i32> %x, <i32 7, i32 7>
+ store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 23ae3b967971..a6c72a5165d6 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
@@ -95,17 +95,6 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: LSHL
-; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
-; EG: ASHR [[RES_HI]]
-; EG-NOT: BFE_INT
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = shl i64 %a, %b
%shl = shl i64 %c, 56
@@ -121,16 +110,6 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: LSHL
-; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
-; EG: ASHR [[RES_HI]]
-; EG-NOT: BFE_INT
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = shl i64 %a, %b
%shl = shl i64 %c, 48
@@ -145,17 +124,6 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG-NOT: BFE_INT
-
-; EG: ASHR [[RES_HI]]
-
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%c = shl i64 %a, %b
%shl = shl i64 %c, 32
@@ -300,7 +268,7 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
%y = ashr <2 x i32> %x, <i32 7, i32 7>
- store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+ store <2 x i32> %y, <2 x i32> addrspace(1)* %out
ret void
}
@@ -458,7 +426,8 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
%tmp5 = load i8, i8 addrspace(1)* %src, align 1
%tmp2 = sext i8 %tmp5 to i32
- %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone
+ %tmp2.5 = icmp sgt i32 %tmp2, 0
+ %tmp3 = select i1 %tmp2.5, i32 %tmp2, i32 0
%tmp4 = trunc i32 %tmp3 to i8
%tmp6 = sext i8 %tmp4 to i16
store i16 %tmp6, i16 addrspace(1)* %out, align 2
diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 38289ced632a..f1b8e8eec85d 100644
--- a/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
;
;
; Most SALU instructions ignore control flow, so we need to make sure
@@ -40,7 +40,7 @@ endif:
define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%tid_f = uitofp i32 %tid to float
%tmp1 = fcmp ueq float %tid_f, 0.0
br i1 %tmp1, label %if, label %else
@@ -67,7 +67,7 @@ endif:
; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
-; SI: BB2_1:
+; SI: BB2_2:
; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
@@ -77,7 +77,7 @@ endif:
; SI: buffer_store_dword [[RESULT]]
define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
entry:
- %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp1 = icmp eq i32 %tid, 0
br i1 %tmp1, label %if, label %else
@@ -100,6 +100,6 @@ endif:
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index b849c4038bc7..da270c533ece 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -3,189 +3,193 @@
; This test checks that no VGPR to SGPR copies are created by the register
; allocator.
+
+
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+
; CHECK-LABEL: {{^}}phi1:
; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
-
-define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
- %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
- %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
- %25 = fptosi float %23 to i32
- %26 = icmp ne i32 %25, 0
- br i1 %26, label %ENDIF, label %ELSE
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 0)
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 32)
+ %tmp24 = fptosi float %tmp22 to i32
+ %tmp25 = icmp ne i32 %tmp24, 0
+ br i1 %tmp25, label %ENDIF, label %ELSE
ELSE: ; preds = %main_body
- %27 = fsub float -0.000000e+00, %22
+ %tmp26 = fsub float -0.000000e+00, %tmp21
br label %ENDIF
-ENDIF: ; preds = %main_body, %ELSE
- %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ]
- %28 = fadd float %temp.0, %24
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00)
+ENDIF: ; preds = %ELSE, %main_body
+ %temp.0 = phi float [ %tmp26, %ELSE ], [ %tmp21, %main_body ]
+ %tmp27 = fadd float %temp.0, %tmp23
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00)
ret void
}
; Make sure this program doesn't crash
; CHECK-LABEL: {{^}}phi2:
-define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
- %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
- %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36)
- %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40)
- %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48)
- %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52)
- %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56)
- %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64)
- %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68)
- %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72)
- %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76)
- %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80)
- %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84)
- %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88)
- %36 = call float @llvm.SI.load.const(<16 x i8> %21, i32 92)
- %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
- %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1
- %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
- %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1
- %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
- %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
- %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5)
- %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5)
- %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5)
- %46 = bitcast float %41 to i32
- %47 = bitcast float %42 to i32
- %48 = insertelement <2 x i32> undef, i32 %46, i32 0
- %49 = insertelement <2 x i32> %48, i32 %47, i32 1
- %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2)
- %51 = extractelement <4 x float> %50, i32 2
- %52 = call float @fabs(float %51)
- %53 = fmul float %43, %43
- %54 = fmul float %44, %44
- %55 = fadd float %54, %53
- %56 = fmul float %45, %45
- %57 = fadd float %55, %56
- %58 = call float @llvm.AMDGPU.rsq.f32(float %57)
- %59 = fmul float %43, %58
- %60 = fmul float %44, %58
- %61 = fmul float %45, %58
- %62 = fmul float %59, %23
- %63 = fmul float %60, %24
- %64 = fadd float %63, %62
- %65 = fmul float %61, %25
- %66 = fadd float %64, %65
- %67 = fsub float -0.000000e+00, %26
- %68 = fmul float %66, %52
- %69 = fadd float %68, %67
- %70 = fmul float %27, %69
- %71 = fmul float %28, %69
- %72 = call float @fabs(float %70)
- %73 = fcmp olt float 0x3EE4F8B580000000, %72
- %74 = sext i1 %73 to i32
- %75 = bitcast i32 %74 to float
- %76 = bitcast float %75 to i32
- %77 = icmp ne i32 %76, 0
- br i1 %77, label %IF, label %ENDIF
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16)
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 32)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 36)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 40)
+ %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 48)
+ %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 52)
+ %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 56)
+ %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 64)
+ %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 68)
+ %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 72)
+ %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 76)
+ %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 80)
+ %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 84)
+ %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 88)
+ %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 92)
+ %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0
+ %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0
+ %tmp38 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0
+ %tmp39 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp38, !tbaa !0
+ %tmp40 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %tmp41 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %tmp42 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg3, <2 x i32> %arg5)
+ %tmp43 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg3, <2 x i32> %arg5)
+ %tmp44 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg3, <2 x i32> %arg5)
+ %tmp45 = bitcast float %tmp40 to i32
+ %tmp46 = bitcast float %tmp41 to i32
+ %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0
+ %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1
+ %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32>
+ %tmp49 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp48, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp50 = extractelement <4 x float> %tmp49, i32 2
+ %tmp51 = call float @fabs(float %tmp50)
+ %tmp52 = fmul float %tmp42, %tmp42
+ %tmp53 = fmul float %tmp43, %tmp43
+ %tmp54 = fadd float %tmp53, %tmp52
+ %tmp55 = fmul float %tmp44, %tmp44
+ %tmp56 = fadd float %tmp54, %tmp55
+ %tmp57 = call float @llvm.amdgcn.rsq.f32(float %tmp56)
+ %tmp58 = fmul float %tmp42, %tmp57
+ %tmp59 = fmul float %tmp43, %tmp57
+ %tmp60 = fmul float %tmp44, %tmp57
+ %tmp61 = fmul float %tmp58, %tmp22
+ %tmp62 = fmul float %tmp59, %tmp23
+ %tmp63 = fadd float %tmp62, %tmp61
+ %tmp64 = fmul float %tmp60, %tmp24
+ %tmp65 = fadd float %tmp63, %tmp64
+ %tmp66 = fsub float -0.000000e+00, %tmp25
+ %tmp67 = fmul float %tmp65, %tmp51
+ %tmp68 = fadd float %tmp67, %tmp66
+ %tmp69 = fmul float %tmp26, %tmp68
+ %tmp70 = fmul float %tmp27, %tmp68
+ %tmp71 = call float @fabs(float %tmp69)
+ %tmp72 = fcmp olt float 0x3EE4F8B580000000, %tmp71
+ %tmp73 = sext i1 %tmp72 to i32
+ %tmp74 = bitcast i32 %tmp73 to float
+ %tmp75 = bitcast float %tmp74 to i32
+ %tmp76 = icmp ne i32 %tmp75, 0
+ br i1 %tmp76, label %IF, label %ENDIF
IF: ; preds = %main_body
- %78 = fsub float -0.000000e+00, %70
- %79 = call float @llvm.AMDIL.exp.(float %78)
- %80 = fsub float -0.000000e+00, %79
- %81 = fadd float 1.000000e+00, %80
- %82 = fdiv float 1.000000e+00, %70
- %83 = fmul float %81, %82
- %84 = fmul float %32, %83
+ %tmp77 = fsub float -0.000000e+00, %tmp69
+ %tmp78 = call float @llvm.exp2.f32(float %tmp77)
+ %tmp79 = fsub float -0.000000e+00, %tmp78
+ %tmp80 = fadd float 1.000000e+00, %tmp79
+ %tmp81 = fdiv float 1.000000e+00, %tmp69
+ %tmp82 = fmul float %tmp80, %tmp81
+ %tmp83 = fmul float %tmp31, %tmp82
br label %ENDIF
-ENDIF: ; preds = %main_body, %IF
- %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ]
- %85 = call float @fabs(float %71)
- %86 = fcmp olt float 0x3EE4F8B580000000, %85
- %87 = sext i1 %86 to i32
- %88 = bitcast i32 %87 to float
- %89 = bitcast float %88 to i32
- %90 = icmp ne i32 %89, 0
- br i1 %90, label %IF25, label %ENDIF24
+ENDIF: ; preds = %IF, %main_body
+ %temp4.0 = phi float [ %tmp83, %IF ], [ %tmp31, %main_body ]
+ %tmp84 = call float @fabs(float %tmp70)
+ %tmp85 = fcmp olt float 0x3EE4F8B580000000, %tmp84
+ %tmp86 = sext i1 %tmp85 to i32
+ %tmp87 = bitcast i32 %tmp86 to float
+ %tmp88 = bitcast float %tmp87 to i32
+ %tmp89 = icmp ne i32 %tmp88, 0
+ br i1 %tmp89, label %IF25, label %ENDIF24
IF25: ; preds = %ENDIF
- %91 = fsub float -0.000000e+00, %71
- %92 = call float @llvm.AMDIL.exp.(float %91)
- %93 = fsub float -0.000000e+00, %92
- %94 = fadd float 1.000000e+00, %93
- %95 = fdiv float 1.000000e+00, %71
- %96 = fmul float %94, %95
- %97 = fmul float %36, %96
+ %tmp90 = fsub float -0.000000e+00, %tmp70
+ %tmp91 = call float @llvm.exp2.f32(float %tmp90)
+ %tmp92 = fsub float -0.000000e+00, %tmp91
+ %tmp93 = fadd float 1.000000e+00, %tmp92
+ %tmp94 = fdiv float 1.000000e+00, %tmp70
+ %tmp95 = fmul float %tmp93, %tmp94
+ %tmp96 = fmul float %tmp35, %tmp95
br label %ENDIF24
-ENDIF24: ; preds = %ENDIF, %IF25
- %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ]
- %98 = fmul float %29, %temp4.0
- %99 = fmul float %30, %temp4.0
- %100 = fmul float %31, %temp4.0
- %101 = fmul float %33, %temp8.0
- %102 = fadd float %101, %98
- %103 = fmul float %34, %temp8.0
- %104 = fadd float %103, %99
- %105 = fmul float %35, %temp8.0
- %106 = fadd float %105, %100
- %107 = call float @llvm.pow.f32(float %52, float %22)
- %108 = fsub float -0.000000e+00, %102
- %109 = fmul float %108, %107
- %110 = fsub float -0.000000e+00, %104
- %111 = fmul float %110, %107
- %112 = fsub float -0.000000e+00, %106
- %113 = fmul float %112, %107
- %114 = call i32 @llvm.SI.packf16(float %109, float %111)
- %115 = bitcast i32 %114 to float
- %116 = call i32 @llvm.SI.packf16(float %113, float 1.000000e+00)
- %117 = bitcast i32 %116 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117)
+ENDIF24: ; preds = %IF25, %ENDIF
+ %temp8.0 = phi float [ %tmp96, %IF25 ], [ %tmp35, %ENDIF ]
+ %tmp97 = fmul float %tmp28, %temp4.0
+ %tmp98 = fmul float %tmp29, %temp4.0
+ %tmp99 = fmul float %tmp30, %temp4.0
+ %tmp100 = fmul float %tmp32, %temp8.0
+ %tmp101 = fadd float %tmp100, %tmp97
+ %tmp102 = fmul float %tmp33, %temp8.0
+ %tmp103 = fadd float %tmp102, %tmp98
+ %tmp104 = fmul float %tmp34, %temp8.0
+ %tmp105 = fadd float %tmp104, %tmp99
+ %tmp106 = call float @llvm.pow.f32(float %tmp51, float %tmp21)
+ %tmp107 = fsub float -0.000000e+00, %tmp101
+ %tmp108 = fmul float %tmp107, %tmp106
+ %tmp109 = fsub float -0.000000e+00, %tmp103
+ %tmp110 = fmul float %tmp109, %tmp106
+ %tmp111 = fsub float -0.000000e+00, %tmp105
+ %tmp112 = fmul float %tmp111, %tmp106
+ %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110)
+ %tmp114 = bitcast i32 %tmp113 to float
+ %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00)
+ %tmp116 = bitcast i32 %tmp115 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp114, float %tmp116, float %tmp114, float %tmp116)
ret void
}
; We just want to make sure the program doesn't crash
; CHECK-LABEL: {{^}}loop:
-
-define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
- %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
- %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
- %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12)
- %26 = fptosi float %25 to i32
- %27 = bitcast i32 %26 to float
- %28 = bitcast float %27 to i32
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 0)
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 4)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 8)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 12)
+ %tmp25 = fptosi float %tmp24 to i32
+ %tmp26 = bitcast i32 %tmp25 to float
+ %tmp27 = bitcast float %tmp26 to i32
br label %LOOP
LOOP: ; preds = %ENDIF, %main_body
- %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ]
- %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ]
- %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ]
- %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ]
- %29 = bitcast float %temp8.0 to i32
- %30 = icmp sge i32 %29, %28
- %31 = sext i1 %30 to i32
- %32 = bitcast i32 %31 to float
- %33 = bitcast float %32 to i32
- %34 = icmp ne i32 %33, 0
- br i1 %34, label %IF, label %ENDIF
+ %temp4.0 = phi float [ %tmp21, %main_body ], [ %temp5.0, %ENDIF ]
+ %temp5.0 = phi float [ %tmp22, %main_body ], [ %temp6.0, %ENDIF ]
+ %temp6.0 = phi float [ %tmp23, %main_body ], [ %temp4.0, %ENDIF ]
+ %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %tmp36, %ENDIF ]
+ %tmp28 = bitcast float %temp8.0 to i32
+ %tmp29 = icmp sge i32 %tmp28, %tmp27
+ %tmp30 = sext i1 %tmp29 to i32
+ %tmp31 = bitcast i32 %tmp30 to float
+ %tmp32 = bitcast float %tmp31 to i32
+ %tmp33 = icmp ne i32 %tmp32, 0
+ br i1 %tmp33, label %IF, label %ENDIF
IF: ; preds = %LOOP
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
ret void
ENDIF: ; preds = %LOOP
- %35 = bitcast float %temp8.0 to i32
- %36 = add i32 %35, 1
- %37 = bitcast i32 %36 to float
+ %tmp34 = bitcast float %temp8.0 to i32
+ %tmp35 = add i32 %tmp34, 1
+ %tmp36 = bitcast i32 %tmp35 to float
br label %LOOP
}
@@ -197,29 +201,19 @@ declare float @fabs(float) #2
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readonly }
-attributes #3 = { readnone }
-attributes #4 = { nounwind readonly }
-
-!0 = !{!"const", null}
-!1 = !{!0, !0, i64 0, i32 1}
-
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <8 x i32>, <16 x i8>, i32) #1
; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq.f32(float) #3
+declare float @llvm.amdgcn.rsq.f32(float) #1
-; Function Attrs: readnone
-declare float @llvm.AMDIL.exp.(float) #3
+declare float @llvm.exp2.f32(float) #1
-; Function Attrs: nounwind readonly
-declare float @llvm.pow.f32(float, float) #4
+; Function Attrs: nounwind readnone
+declare float @llvm.pow.f32(float, float) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
@@ -233,114 +227,109 @@ declare i32 @llvm.SI.packf16(float, float) #1
; CHECK: image_sample
; CHECK: exp
; CHECK: s_endpgm
-define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-
+define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
entry:
- %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
- %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2
- %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16)
- %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
- %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2
- %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
- %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2
- %28 = fcmp oeq float %23, 0.0
- br i1 %28, label %if, label %else
-
-if:
- %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 0, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+ %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
+ %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 16)
+ %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
+ %tmp24 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp23, !tbaa !0
+ %tmp25 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0
+ %tmp26 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp25, !tbaa !0
+ %tmp27 = fcmp oeq float %tmp22, 0.000000e+00
+ %tmp26.bc = bitcast <16 x i8> %tmp26 to <4 x i32>
+ br i1 %tmp27, label %if, label %else
+
+if: ; preds = %entry
+ %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> zeroinitializer, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%val.if.0 = extractelement <4 x float> %val.if, i32 0
%val.if.1 = extractelement <4 x float> %val.if, i32 1
%val.if.2 = extractelement <4 x float> %val.if, i32 2
br label %endif
-else:
- %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 1, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+else: ; preds = %entry
+ %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1, i32 0>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%val.else.0 = extractelement <4 x float> %val.else, i32 0
%val.else.1 = extractelement <4 x float> %val.else, i32 1
%val.else.2 = extractelement <4 x float> %val.else, i32 2
br label %endif
-endif:
- %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else]
- %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else]
- %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else]
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0)
+endif: ; preds = %else, %if
+ %val.0 = phi float [ %val.if.0, %if ], [ %val.else.0, %else ]
+ %val.1 = phi float [ %val.if.1, %if ], [ %val.else.1, %else ]
+ %val.2 = phi float [ %val.if.2, %if ], [ %val.else.2, %else ]
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.000000e+00)
ret void
}
-!2 = !{!"const", null, i32 1}
-
; CHECK-LABEL: {{^}}copy1:
; CHECK: buffer_load_dword
; CHECK: v_add
; CHECK: s_endpgm
define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
entry:
- %0 = load float, float addrspace(1)* %in0
- %1 = fcmp oeq float %0, 0.0
- br i1 %1, label %if0, label %endif
+ %tmp = load float, float addrspace(1)* %in0
+ %tmp1 = fcmp oeq float %tmp, 0.000000e+00
+ br i1 %tmp1, label %if0, label %endif
-if0:
- %2 = bitcast float %0 to i32
- %3 = fcmp olt float %0, 0.0
- br i1 %3, label %if1, label %endif
+if0: ; preds = %entry
+ %tmp2 = bitcast float %tmp to i32
+ %tmp3 = fcmp olt float %tmp, 0.000000e+00
+ br i1 %tmp3, label %if1, label %endif
-if1:
- %4 = add i32 %2, 1
+if1: ; preds = %if0
+ %tmp4 = add i32 %tmp2, 1
br label %endif
-endif:
- %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ]
- %6 = bitcast i32 %5 to float
- store float %6, float addrspace(1)* %out
+endif: ; preds = %if1, %if0, %entry
+ %tmp5 = phi i32 [ 0, %entry ], [ %tmp2, %if0 ], [ %tmp4, %if1 ]
+ %tmp6 = bitcast i32 %tmp5 to float
+ store float %tmp6, float addrspace(1)* %out
ret void
}
; This test is just checking that we don't crash or hit an assertion failure.
; CHECK-LABEL: {{^}}copy2:
; CHECK: s_endpgm
-
-define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
entry:
br label %LOOP68
-LOOP68:
+LOOP68: ; preds = %ENDIF69, %entry
%temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ]
%t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ]
%g = icmp eq i32 0, %t
%l = bitcast float %temp4.7 to i32
br i1 %g, label %IF70, label %ENDIF69
-IF70:
+IF70: ; preds = %LOOP68
%q = icmp ne i32 %l, 13
%temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
ret void
-ENDIF69:
+ENDIF69: ; preds = %LOOP68
%u = add i32 %l, %t
%v = bitcast i32 %u to float
%x = add i32 %t, -1
br label %LOOP68
}
-attributes #0 = { "ShaderType"="0" }
-
; This test checks that image_sample resource descriptors aren't loaded into
; vgprs. The verifier will fail if this happens.
; CHECK-LABEL:{{^}}sample_rsrc:
; CHECK: image_sample
; CHECK: image_sample
; CHECK: s_endpgm
-define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
bb:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
- %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !2
%tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16)
%tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
- %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0
+ %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !2
%tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
- %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0
+ %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !2
%tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7)
%tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7)
%tmp31 = bitcast float %tmp23 to i32
@@ -352,9 +341,8 @@ bb38: ; preds = %bb
%tmp53 = bitcast float %tmp30 to i32
%tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0
%tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1
- %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8>
- %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8>
- %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2)
+ %tmp56 = bitcast <8 x i32> %tmp26 to <8 x i32>
+ %tmp58 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp55, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
br label %bb71
bb80: ; preds = %bb
@@ -363,9 +351,8 @@ bb80: ; preds = %bb
%tmp82.2 = add i32 %tmp82, 1
%tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0
%tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1
- %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8>
- %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8>
- %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2)
+ %tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32>
+ %tmp87 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
br label %bb71
bb71: ; preds = %bb80, %bb38
@@ -375,5 +362,42 @@ bb71: ; preds = %bb80, %bb38
ret void
}
-attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" }
+; Check that the resource descriptor is stored in an sgpr.
+; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
+; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
+define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
+ %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
+ %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp10 = extractelement <4 x float> %tmp9, i32 0
+ %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10)
+ %tmp13 = bitcast i32 %tmp12 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+ ret void
+}
+
+; Check that the sampler is stored in an sgpr.
+; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
+; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
+define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
+ %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
+ %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp10 = extractelement <4 x float> %tmp9, i32 0
+ %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
+ %tmp13 = bitcast i32 %tmp12 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
+!2 = !{!1, !1, i64 0}
diff --git a/test/CodeGen/AMDGPU/shared-op-cycle.ll b/test/CodeGen/AMDGPU/shared-op-cycle.ll
index f52a9baf4d18..f9a72b47cc99 100644
--- a/test/CodeGen/AMDGPU/shared-op-cycle.ll
+++ b/test/CodeGen/AMDGPU/shared-op-cycle.ll
@@ -4,7 +4,7 @@
; CHECK: MULADD_IEEE *
; CHECK-NOT: MULADD_IEEE *
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) {
%w0 = extractelement <4 x float> %reg0, i32 3
%w1 = extractelement <4 x float> %reg1, i32 3
%w2 = extractelement <4 x float> %reg2, i32 3
@@ -17,16 +17,15 @@ define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float>
%v0 = insertelement <4 x float> undef, float %r0, i32 0
%v1 = insertelement <4 x float> %v0, float %r1, i32 1
%v2 = insertelement <4 x float> %v1, float %r2, i32 2
- %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2)
+ %res = call float @llvm.r600.dot4(<4 x float> %v2, <4 x float> %v2)
%vecres = insertelement <4 x float> undef, float %res, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
ret void
}
; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
\ No newline at end of file
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
new file mode 100644
index 000000000000..a6555a197388
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -0,0 +1,118 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Extract the high bit of the 1st quarter
+; GCN-LABEL: {{^}}v_uextract_bit_31_i128:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+
+; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 31
+ %bit = and i128 %srl, 1
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+; Extract the high bit of the 2nd quarter
+; GCN-LABEL: {{^}}v_uextract_bit_63_i128:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+
+; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 63
+ %bit = and i128 %srl, 1
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+; Extract the high bit of the 3rd quarter
+; GCN-LABEL: {{^}}v_uextract_bit_95_i128:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 95
+ %bit = and i128 %srl, 1
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+; Extract the high bit of the 4th quarter
+; GCN-LABEL: {{^}}v_uextract_bit_127_i128:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 127
+ %bit = and i128 %srl, 1
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+; Spans more than 2 dword boundaries
+; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128:
+; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+
+; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 30
+; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
+; GCN-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], 0, v[[SHLHI]]{{$}}
+
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: s_endpgm
+define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i128, i128 addrspace(1)* %in.gep
+ %srl = lshr i128 %ld.64, 34
+ %bit = and i128 %srl, 73786976294838206463
+ store i128 %bit, i128 addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
new file mode 100644
index 000000000000..c5dbfd9589a6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -0,0 +1,386 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
+
+; Extract the high bit of the low half
+; GCN-LABEL: {{^}}v_uextract_bit_31_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; Extract the high bit of the high half
+; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 63
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_1_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 1
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_20_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 20
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 32
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %bit = and i64 %srl, 1
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 20
+ %bit = and i64 %srl, 3
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 1
+ %bit = and i64 %srl, 1073741823
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 1
+ %bit = and i64 %srl, 2147483647
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; Spans the dword boundary, so requires full shift
+; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
+; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %bit = and i64 %srl, 3
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %bit = and i64 %srl, 3
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 30
+; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 30
+ %bit = and i64 %srl, 1073741823
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %bit = and i64 %srl, 1073741823
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
+; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
+define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %and = and i64 %srl, 4294967295
+ store i64 %and, i64 addrspace(1)* %out
+ ret void
+}
+
+; trunc applied before the and mask
+; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+; GCN: buffer_store_dword v[[SHIFT]]
+define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %trunc = trunc i64 %srl to i32
+ %bit = and i32 %trunc, 1
+ store i32 %bit, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
+; GCN: buffer_store_dword [[BFE]]
+define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 3
+ %trunc = trunc i64 %srl to i32
+ %bit = and i32 %trunc, 1
+ store i32 %bit, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
+; GCN: buffer_store_dword [[BFE]]
+define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %trunc = trunc i64 %srl to i32
+ %bit = and i32 %trunc, 1
+ store i32 %bit, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
+; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
+; GCN-NOT: v[[SHRLO]]
+; GCN: buffer_store_dword v[[SHRLO]]
+define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 31
+ %trunc = trunc i64 %srl to i32
+ %bit = and i32 %trunc, 3
+ store i32 %bit, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_not_mask_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
+; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}}
+; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
+; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
+; GCN-NOT: v[[SHRLO]]
+; GCN-NOT: v[[SHRHI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
+define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 20
+ %bit = and i64 %srl, 4
+ store i64 %bit, i64 addrspace(1)* %out.gep
+ ret void
+}
+
+; The instruction count is the same with/without hasOneUse, but
+; keeping the 32-bit and has a smaller encoding size than the bfe.
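+; (On GCN the 32-bit and with an inline-immediate mask uses the 4-byte VOP2
+; encoding, while v_bfe_u32 is VOP3-only and takes an 8-byte encoding.)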
+
+; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27
+; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 27
+ %bit = and i64 %srl, 3
+ store volatile i64 %srl, i64 addrspace(1)* %out
+ store volatile i64 %bit, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 34
+ %bit = and i64 %srl, 7
+ store volatile i64 %srl, i64 addrspace(1)* %out
+ store volatile i64 %bit, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+; GCN: buffer_store_dword v[[ZERO]]
+define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
+ %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x
+ %out1.gep = getelementptr i32, i32 addrspace(1)* %out1, i32 %id.x
+ %ld.64 = load i64, i64 addrspace(1)* %in.gep
+ %srl = lshr i64 %ld.64, 33
+ %bit = and i64 %srl, 7
+ store volatile i64 %bit, i64 addrspace(1)* %out0.gep
+
+ %srl.srl32 = lshr i64 %srl, 32
+ %srl.hi = trunc i64 %srl.srl32 to i32
+ store volatile i32 %srl.hi, i32 addrspace(1)* %out1.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll
new file mode 100644
index 000000000000..28a7b924904d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -0,0 +1,245 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
+
+
+; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0
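+; For example, for c = 35 the expected split is roughly:
+;   lshr i64 %x, 35  ->  lo_32(result) = lshr i32 hi_32(%x), 3 ; hi_32(result) = 0
+; which is the v_lshrrev_b32 / v_mov_b32 pair the first test below looks for.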
+; GCN-LABEL: {{^}}lshr_i64_35:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = lshr i64 %val, 35
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_63:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = lshr i64 %val, 63
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_33:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = lshr i64 %val, 33
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_32:
+; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = lshr i64 %val, 32
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; Make sure the and with the constant doesn't prevent the bfe from forming
+; after the 64-bit shift is split.
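+; Worked out for the case below: the mask 0x7fffffffffffffff only clears bit 63,
+; so the lshr by 40 still extracts bits [40,62], i.e. bits [8,30] of the high
+; dword, which is the "bfe offset 8, width 23" pattern checked for.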
+
+; GCN-LABEL: {{^}}lshr_and_i64_35:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff
+ %shl = lshr i64 %and, 40
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; lshl (i64 x), c: c > 32 => reg_sequence lshl 0, (i32 lo_32(x)), (c - 32)
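+; For example, for c = 35 the expected split is roughly:
+;   shl i64 %x, 35  ->  hi_32(result) = shl i32 lo_32(%x), 3 ; lo_32(result) = 0
+; matching the v_lshlrev_b32 / v_mov_b32 pair checked in shl_i64_const_35.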
+
+; GCN-LABEL: {{^}}shl_i64_const_35:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 35
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_i64_const_32:
+; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 32
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_i64_const_63:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 63
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; ashr (i64 x), 63 => reg_sequence (ashr (i32 hi_32(x)), 31), (ashr (i32 hi_32(x)), 31)
+
+; GCN-LABEL: {{^}}ashr_i64_const_32:
+define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = ashr i64 %val, 32
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}ashr_i64_const_63:
+define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = ashr i64 %val, 63
+ store i64 %shl, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_31_i32_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]]
+; GCN: buffer_store_dword [[SHL]]
+define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 31
+ %trunc = trunc i64 %shl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_15_i16_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
+; GCN: buffer_store_short [[SHL]]
+define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 15
+ %trunc = trunc i64 %shl to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_15_i16_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
+; GCN: buffer_store_short [[SHL]]
+define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %val = load i32, i32 addrspace(1)* %in
+ %shl = shl i32 %val, 15
+ %trunc = trunc i32 %shl to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_7_i8_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]]
+; GCN: buffer_store_byte [[SHL]]
+define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 7
+ %trunc = trunc i64 %shl to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_1_i2_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
+; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]]
+; GCN: buffer_store_byte [[AND]]
+define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 1
+ %trunc = trunc i64 %shl to i2
+ store i2 %trunc, i2 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_1_i32_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
+; GCN: buffer_store_dword [[SHL]]
+define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 1
+ %trunc = trunc i64 %shl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_16_i32_i64:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]]
+; GCN: buffer_store_dword [[SHL]]
+define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 16
+ %trunc = trunc i64 %shl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_33_i32_i64:
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[ZERO]]
+define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 33
+ %trunc = trunc i64 %shl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_16_v2i32_v2i64:
+; GCN: buffer_load_dwordx4 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in
+ %shl = shl <2 x i64> %val, <i64 16, i64 16>
+ %trunc = trunc <2 x i64> %shl to <2 x i32>
+ store <2 x i32> %trunc, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}trunc_shl_31_i32_i64_multi_use:
+; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31
+; GCN: buffer_store_dword v[[RESLO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
+define void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %val = load i64, i64 addrspace(1)* %in
+ %shl = shl i64 %val, 31
+ %trunc = trunc i64 %shl to i32
+ store volatile i32 %trunc, i32 addrspace(1)* %out
+ store volatile i64 %shl, i64 addrspace(1)* %in
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index 55db80731c90..5a2b03bff990 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
declare i32 @llvm.r600.read.tidig.x() #0
@@ -208,4 +208,173 @@ define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
ret void
}
+; FUNC-LABEL: {{^}}s_shl_constant_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+ %shl = shl i64 281474976710655, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_shl_constant_i64:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0xab19b207
+; SI-DAG: s_movk_i32 s[[KHI:[0-9]+]], 0x11e{{$}}
+; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
+; SI: buffer_store_dwordx2
+define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %shl = shl i64 1231231234567, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_shl_i64_32_bit_constant:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}}
+; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}}
+; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
+define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %shl = shl i64 1234567, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_shl_inline_imm_64_i64:
+; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}}
+define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %shl = shl i64 64, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_64_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 64, s{{[0-9]+}}
+define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 64, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, s{{[0-9]+}}
+define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 1, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_1.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4607182418800017408, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_1.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13830554455654793216, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_0.5_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0.5, s{{[0-9]+}}
+define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4602678819172646912, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_0.5_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -0.5, s{{[0-9]+}}
+define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13826050856027422720, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_2.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 2.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4611686018427387904, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_2.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -2.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13835058055282163712, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_4.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 4.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4616189618054758400, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_4.0_i64:
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -4.0, s{{[0-9]+}}
+define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13839561654909534208, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+
+; Test with the 64-bit integer bit pattern for a 32-bit float in the
+; low 32 bits, which is not a valid 64-bit inline immediate.
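+; For example, 1082130432 is 0x40800000, the bit pattern of 4.0f; as an i64 this
+; is 0x0000000040800000, which has no 64-bit inline-immediate encoding, so it is
+; expected to be materialized as an SGPR pair (4.0 in the low half, 0 in the high).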
+
+; FUNC-LABEL: {{^}}s_shl_inline_imm_f32_4.0_i64:
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
+define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 1082130432, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: Copy of -1 register
+; FUNC-LABEL: {{^}}s_shl_inline_imm_f32_neg_4.0_i64:
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
+; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}}
+define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 -1065353216, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; Shift into upper 32-bits
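+; For example, 4647714815446351872 is 0x4080000000000000: the high dword is the
+; 4.0f bit pattern (0x40800000) and the low dword is 0, matching the s_mov_b32
+; pair checked below.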
+; FUNC-LABEL: {{^}}s_shl_inline_high_imm_f32_4.0_i64:
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
+define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 4647714815446351872, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_shl_inline_high_imm_f32_neg_4.0_i64:
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
+define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %shl = shl i64 13871086852301127680, %a
+ store i64 %shl, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll
index dfb2bf3383fc..13254d0bcf74 100644
--- a/test/CodeGen/AMDGPU/shl_add_constant.ll
+++ b/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Test with inline immediate
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%add = add i32 %val, 9
@@ -26,7 +26,7 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; SI-DAG: buffer_store_dword [[SHLREG]]
; SI: s_endpgm
define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%add = add i32 %val, 9
@@ -44,7 +44,7 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
%val = load i32, i32 addrspace(1)* %ptr, align 4
%shl = add i32 %val, 999
@@ -57,8 +57,8 @@ define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0
; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
-; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
-; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_addk_i32 [[RESULT]], 0x3d8
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
; SI: buffer_store_dword [[VRESULT]]
define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
@@ -73,7 +73,7 @@ define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #
; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
-; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[Y]], [[SHL3]]
; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
; SI: buffer_store_dword [[VRESULT]]
diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll
index ac94824bd61f..a6be2eda33b3 100644
--- a/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -7,7 +7,7 @@
; LDS globals.
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
@lds0 = addrspace(3) global [512 x float] undef, align 4
@lds1 = addrspace(3) global [512 x float] undef, align 4
@@ -20,7 +20,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
; SI: s_endpgm
define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -40,7 +40,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad
; SI-DAG: buffer_store_dword [[ADDUSE]]
; SI: s_endpgm
define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -56,7 +56,7 @@ define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %ad
; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
; SI: s_endpgm
define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 65535
%arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
%val0 = load i8, i8 addrspace(3)* %arrayidx0
@@ -74,7 +74,7 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
; SI: s_endpgm
define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -90,7 +90,7 @@ define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
store float 1.0, float addrspace(3)* %arrayidx0, align 4
@@ -105,7 +105,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
@lds2 = addrspace(3) global [512 x i32] undef, align 4
; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
-; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; %idx.0 = add nsw i32 %tid.x, 2
; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4
@@ -120,7 +120,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
@@ -135,7 +135,7 @@ define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace
; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -149,7 +149,7 @@ define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -163,7 +163,7 @@ define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -177,7 +177,7 @@ define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -191,7 +191,7 @@ define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -205,7 +205,7 @@ define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -215,7 +215,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
}
; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
-; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
; %idx.0 = add nsw i32 %tid.x, 2
; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -229,7 +229,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -243,7 +243,7 @@ define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -257,7 +257,7 @@ define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
@@ -271,7 +271,7 @@ define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
; SI: s_endpgm
define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
- %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
%val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll b/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
deleted file mode 100644
index 69d719385acd..000000000000
--- a/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s
-
-
-define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
-; CHECK-LABEL: {{^}}test:
-
-entry:
- switch i32 %x, label %sw.default [
- i32 0, label %sw.bb
- i32 60, label %sw.bb
- ]
-
-sw.bb:
- unreachable
-
-sw.default:
- unreachable
-
-sw.epilog:
- ret void
-}
-
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll
index bbcb861f37dc..133fd480e599 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -10,9 +10,10 @@
; SI: s_andn2_b64
; s_cbranch_execnz [[LOOP_LABEL]]
; SI: s_endpgm
-define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
main_body:
- %0 = and i32 %a, %b
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %0 = and i32 %a, %tid
%1 = trunc i32 %0 to i1
br label %ENDIF
@@ -39,9 +40,10 @@ ENDIF:
; SI: s_cbranch_execnz [[LOOP_LABEL]]
; SI: s_endpgm
-define void @phi_cond_outside_loop(i32 %a, i32 %b) {
+define void @phi_cond_outside_loop(i32 %b) {
entry:
- %0 = icmp eq i32 %a , 0
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %0 = icmp eq i32 %tid , 0
br i1 %0, label %if, label %else
if:
@@ -61,3 +63,88 @@ loop:
exit:
ret void
}
+
+; FIXME: should emit s_endpgm
+; CHECK-LABEL: {{^}}switch_unreachable:
+; CHECK-NOT: s_endpgm
+; CHECK: .Lfunc_end2
+define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+centry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.bb
+ i32 60, label %sw.bb
+ ]
+
+sw.bb:
+ unreachable
+
+sw.default:
+ unreachable
+
+sw.epilog:
+ ret void
+}
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; This broke the old AMDIL cfg structurizer
+; FUNC-LABEL: {{^}}loop_land_info_assert:
+; SI: s_cmp_gt_i32
+; SI-NEXT: s_cbranch_scc0 [[ENDPGM:BB[0-9]+_[0-9]+]]
+
+; SI: s_cmp_gt_i32
+; SI-NEXT: s_cbranch_scc1 [[ENDPGM]]
+
+; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]
+; SI: s_branch [[INFLOOP]]
+
+; SI: [[ENDPGM]]:
+; SI: s_endpgm
+define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
+entry:
+ %cmp = icmp sgt i32 %c0, 0
+ br label %while.cond.outer
+
+while.cond.outer:
+ %tmp = load float, float addrspace(1)* undef
+ br label %while.cond
+
+while.cond:
+ %cmp1 = icmp slt i32 %c1, 4
+ br i1 %cmp1, label %convex.exit, label %for.cond
+
+convex.exit:
+ %or = or i1 %cmp, %cmp1
+ br i1 %or, label %return, label %if.end
+
+if.end:
+ %tmp3 = call float @llvm.fabs.f32(float %tmp) nounwind readnone
+ %cmp2 = fcmp olt float %tmp3, 0x3E80000000000000
+ br i1 %cmp2, label %if.else, label %while.cond.outer
+
+if.else:
+ store volatile i32 3, i32 addrspace(1)* undef, align 4
+ br label %while.cond
+
+for.cond:
+ %cmp3 = icmp slt i32 %c3, 1000
+ br i1 %cmp3, label %for.body, label %return
+
+for.body:
+ br i1 %cmp3, label %self.loop, label %if.end.2
+
+if.end.2:
+ %or.cond2 = or i1 %cmp3, %arg
+ br i1 %or.cond2, label %return, label %for.cond
+
+self.loop:
+ br label %self.loop
+
+return:
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
new file mode 100644
index 000000000000..025a3d8fca2e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test:
+; CHECK: s_and_saveexec_b64
+; CHECK: s_xor_b64
+; CHECK: s_or_b64 exec, exec
+; CHECK: s_andn2_b64 exec, exec
+; CHECK: s_cbranch_execnz
+define void @test(i32 %arg, i32 %arg1) {
+bb:
+ %tmp = icmp ne i32 %arg, 0
+ %tmp7 = icmp ne i32 %arg1, 0
+ %tmp8 = and i1 %tmp, %tmp7
+ br i1 %tmp8, label %bb9, label %bb11
+
+bb9: ; preds = %bb
+ br label %bb10
+
+bb10: ; preds = %bb10, %bb9
+ br label %bb10
+
+bb11: ; preds = %bb
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
index 27a8e70aae13..98d1bb7cf9a2 100644
--- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -1,15 +1,15 @@
-; RUN: llc -o /dev/null %s -march=amdgcn -mcpu=verde -verify-machineinstrs -stop-after expand-isel-pseudos 2>&1 | FileCheck %s
+; RUN: llc -o - %s -march=amdgcn -mcpu=verde -verify-machineinstrs -stop-after expand-isel-pseudos | FileCheck %s
; This test verifies that the instruction selection will add the implicit
; register operands in the correct order when modifying the opcode of an
; instruction to V_ADD_I32_e32.
-; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec
+; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 killed %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec
define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
- %a = load i32, i32 addrspace(1)* %in
- %b = load i32, i32 addrspace(1)* %b_ptr
+ %a = load volatile i32, i32 addrspace(1)* %in
+ %b = load volatile i32, i32 addrspace(1)* %b_ptr
%result = add i32 %a, %b
store i32 %result, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/si-literal-folding.ll b/test/CodeGen/AMDGPU/si-literal-folding.ll
index 901b3c3453fc..d5030adc89be 100644
--- a/test/CodeGen/AMDGPU/si-literal-folding.ll
+++ b/test/CodeGen/AMDGPU/si-literal-folding.ll
@@ -4,7 +4,7 @@
; CHECK-LABEL: {{^}}main:
; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0xbf4353f8
-define void @main(float) #0 {
+define amdgpu_vs void @main(float) {
main_body:
%1 = fmul float %0, 0x3FE86A7F00000000
%2 = fmul float %0, 0xBFE86A7F00000000
@@ -13,5 +13,3 @@ main_body:
}
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll
index 944499a11461..8df0a64a2b7c 100644
--- a/test/CodeGen/AMDGPU/si-lod-bias.ll
+++ b/test/CodeGen/AMDGPU/si-lod-bias.ll
@@ -5,32 +5,32 @@
; the wrong register class is used for the REG_SEQUENCE instructions.
; CHECK: {{^}}main:
-; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
+define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) {
main_body:
- %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
- %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
- %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
- %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1
- %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
- %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1
- %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
- %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
- %29 = bitcast float %22 to i32
- %30 = bitcast float %27 to i32
- %31 = bitcast float %28 to i32
- %32 = insertelement <4 x i32> undef, i32 %29, i32 0
- %33 = insertelement <4 x i32> %32, i32 %30, i32 1
- %34 = insertelement <4 x i32> %33, i32 %31, i32 2
- %35 = insertelement <4 x i32> %34, i32 undef, i32 3
- %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2)
- %37 = extractelement <4 x float> %36, i32 0
- %38 = extractelement <4 x float> %36, i32 1
- %39 = extractelement <4 x float> %36, i32 2
- %40 = extractelement <4 x float> %36, i32 3
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40)
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+ %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16)
+ %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0
+ %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0
+ %tmp24 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0
+ %tmp25 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp24, !tbaa !0
+ %tmp26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %tmp27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5)
+ %tmp28 = bitcast float %tmp21 to i32
+ %tmp29 = bitcast float %tmp26 to i32
+ %tmp30 = bitcast float %tmp27 to i32
+ %tmp31 = insertelement <4 x i32> undef, i32 %tmp28, i32 0
+ %tmp32 = insertelement <4 x i32> %tmp31, i32 %tmp29, i32 1
+ %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2
+ %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3
+ %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32>
+ %tmp35 = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> %tmp34, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp36 = extractelement <4 x float> %tmp35, i32 0
+ %tmp37 = extractelement <4 x float> %tmp35, i32 1
+ %tmp38 = extractelement <4 x float> %tmp35, i32 2
+ %tmp39 = extractelement <4 x float> %tmp35, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp36, float %tmp37, float %tmp38, float %tmp39)
ret void
}
@@ -40,13 +40,13 @@ declare float @llvm.SI.load.const(<16 x i8>, i32) #1
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
+
attributes #1 = { nounwind readnone }
-!0 = !{!"const", null}
-!1 = !{!0, !0, i64 0, i32 1}
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
new file mode 100644
index 000000000000..ea506e6b3b3f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
+; GCN: v_cmp_eq_i32
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+; GCN: s_branch BB0_1
+
+; GCN: s_or_b64 exec, exec
+; GCN: s_endpgm
+
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+define void @lower_control_flow_unreachable_terminator() #0 {
+bb:
+ %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %tmp63 = icmp eq i32 %tmp15, 32
+ br i1 %tmp63, label %bb64, label %bb68
+
+bb64:
+ store volatile i32 0, i32 addrspace(3)* undef, align 4
+ unreachable
+
+bb68:
+ ret void
+}
+
+; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
+; GCN: v_cmp_eq_i32
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+; GCN: s_endpgm
+
+; GCN: s_or_b64 exec, exec
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
+bb:
+ %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %tmp63 = icmp eq i32 %tmp15, 32
+ br i1 %tmp63, label %bb68, label %bb64
+
+bb68:
+ ret void
+
+bb64:
+ store volatile i32 0, i32 addrspace(3)* undef, align 4
+ unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.y() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/si-scheduler.ll b/test/CodeGen/AMDGPU/si-scheduler.ll
index 66a9571d75bf..5520fe61d867 100644
--- a/test/CodeGen/AMDGPU/si-scheduler.ll
+++ b/test/CodeGen/AMDGPU/si-scheduler.ll
@@ -1,4 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=SI --misched=si < %s | FileCheck %s
+; FIXME: The si scheduler crashes when lane mask tracking is enabled, so
+; we need to disable it when the si scheduler is being used.
+; The only way the subtarget knows that the si machine scheduler is being used
+; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend
+; won't know what scheduler we are using.
+; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s
; The test checks that the "si" machine scheduler pass works correctly.
@@ -11,45 +16,45 @@
; CHECK: s_waitcnt vmcnt(0)
; CHECK: exp
; CHECK: s_endpgm
-
-define void @main([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>,
-<2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
+define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
main_body:
- %22 = bitcast [34 x <8 x i32>] addrspace(2)* %3 to <32 x i8> addrspace(2)*
- %23 = load <32 x i8>, <32 x i8> addrspace(2)* %22, align 32, !tbaa !0
- %24 = bitcast [17 x <4 x i32>] addrspace(2)* %2 to <16 x i8> addrspace(2)*
- %25 = load <16 x i8>, <16 x i8> addrspace(2)* %24, align 16, !tbaa !0
- %26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %11)
- %27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %11)
- %28 = bitcast float %26 to i32
- %29 = bitcast float %27 to i32
- %30 = insertelement <2 x i32> undef, i32 %28, i32 0
- %31 = insertelement <2 x i32> %30, i32 %29, i32 1
- %32 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %31, <32 x i8> %23, <16 x i8> %25, i32 2)
- %33 = extractelement <4 x float> %32, i32 0
- %34 = extractelement <4 x float> %32, i32 1
- %35 = extractelement <4 x float> %32, i32 2
- %36 = extractelement <4 x float> %32, i32 3
- %37 = call i32 @llvm.SI.packf16(float %33, float %34)
- %38 = bitcast i32 %37 to float
- %39 = call i32 @llvm.SI.packf16(float %35, float %36)
- %40 = bitcast i32 %39 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %38, float %40, float %38, float %40)
+ %tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)*
+ %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0
+ %tmp23 = bitcast [17 x <4 x i32>] addrspace(2)* %arg2 to <16 x i8> addrspace(2)*
+ %tmp24 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp23, align 16, !tbaa !0
+ %tmp25 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg11)
+ %tmp26 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg11)
+ %tmp27 = bitcast float %tmp25 to i32
+ %tmp28 = bitcast float %tmp26 to i32
+ %tmp29 = insertelement <2 x i32> undef, i32 %tmp27, i32 0
+ %tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1
+ %tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32>
+ %tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32>
+ %tmp31 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp30, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp32 = extractelement <4 x float> %tmp31, i32 0
+ %tmp33 = extractelement <4 x float> %tmp31, i32 1
+ %tmp34 = extractelement <4 x float> %tmp31, i32 2
+ %tmp35 = extractelement <4 x float> %tmp31, i32 3
+ %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33)
+ %tmp37 = bitcast i32 %tmp36 to float
+ %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35)
+ %tmp39 = bitcast i32 %tmp38 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float %tmp39)
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }
-!0 = !{!"const", null, i32 1}
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index d7b35fc631eb..63b1b71a8cb7 100644
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,9 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; These tests check that the compiler won't crash when it needs to spill
; SGPRs.
+
+@ddxy_lds = external addrspace(3) global [64 x i32]
+
; CHECK-LABEL: {{^}}main:
; CHECK: s_wqm
@@ -19,1560 +22,1601 @@
; Writing to M0 from an SMRD instruction will hang the GPU.
; CHECK-NOT: s_buffer_load_dword m0
; CHECK: s_endpgm
-@ddxy_lds = external addrspace(3) global [64 x i32]
-
-define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:
- %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
- %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0
- %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96)
- %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100)
- %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104)
- %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112)
- %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116)
- %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120)
- %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
- %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
- %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140)
- %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
- %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
- %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
- %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
- %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
- %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
- %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
- %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
- %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
- %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
- %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
- %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224)
- %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
- %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
- %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
- %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
- %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
- %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
- %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
- %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
- %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
- %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296)
- %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304)
- %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308)
- %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312)
- %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368)
- %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372)
- %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376)
- %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384)
- %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
- %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0
- %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
- %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0
- %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
- %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0
- %67 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
- %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0
- %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
- %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0
- %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
- %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0
- %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
- %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0
- %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
- %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0
- %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
- %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0
- %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
- %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0
- %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
- %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0
- %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
- %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0
- %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
- %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0
- %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
- %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0
- %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
- %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0
- %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
- %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0
- %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
- %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
- %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
- %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
- %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
- %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
- %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
- %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
- %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
- %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
- %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
- %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
- %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
- %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
- %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
- %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
- %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
- %110 = call i32 @llvm.SI.tid()
- %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110
- %112 = bitcast float %93 to i32
- store i32 %112, i32 addrspace(3)* %111
- %113 = bitcast float %94 to i32
- store i32 %113, i32 addrspace(3)* %111
- %114 = call i32 @llvm.SI.tid()
- %115 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114
- %116 = and i32 %114, -4
- %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116
- %118 = add i32 %116, 1
- %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118
- %120 = bitcast float %93 to i32
- store i32 %120, i32 addrspace(3)* %115
- %121 = load i32, i32 addrspace(3)* %117
- %122 = bitcast i32 %121 to float
- %123 = load i32, i32 addrspace(3)* %119
- %124 = bitcast i32 %123 to float
- %125 = fsub float %124, %122
- %126 = bitcast float %94 to i32
- store i32 %126, i32 addrspace(3)* %115
- %127 = load i32, i32 addrspace(3)* %117
- %128 = bitcast i32 %127 to float
- %129 = load i32, i32 addrspace(3)* %119
- %130 = bitcast i32 %129 to float
- %131 = fsub float %130, %128
- %132 = insertelement <4 x float> undef, float %125, i32 0
- %133 = insertelement <4 x float> %132, float %131, i32 1
- %134 = insertelement <4 x float> %133, float %131, i32 2
- %135 = insertelement <4 x float> %134, float %131, i32 3
- %136 = extractelement <4 x float> %135, i32 0
- %137 = extractelement <4 x float> %135, i32 1
- %138 = fmul float %60, %93
- %139 = fmul float %60, %94
- %140 = fmul float %60, %94
- %141 = fmul float %60, %94
- %142 = call i32 @llvm.SI.tid()
- %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142
- %144 = bitcast float %138 to i32
- store i32 %144, i32 addrspace(3)* %143
- %145 = bitcast float %139 to i32
- store i32 %145, i32 addrspace(3)* %143
- %146 = bitcast float %140 to i32
- store i32 %146, i32 addrspace(3)* %143
- %147 = bitcast float %141 to i32
- store i32 %147, i32 addrspace(3)* %143
- %148 = call i32 @llvm.SI.tid()
- %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148
- %150 = and i32 %148, -4
- %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150
- %152 = add i32 %150, 2
- %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152
- %154 = bitcast float %138 to i32
- store i32 %154, i32 addrspace(3)* %149
- %155 = load i32, i32 addrspace(3)* %151
- %156 = bitcast i32 %155 to float
- %157 = load i32, i32 addrspace(3)* %153
- %158 = bitcast i32 %157 to float
- %159 = fsub float %158, %156
- %160 = bitcast float %139 to i32
- store i32 %160, i32 addrspace(3)* %149
- %161 = load i32, i32 addrspace(3)* %151
- %162 = bitcast i32 %161 to float
- %163 = load i32, i32 addrspace(3)* %153
- %164 = bitcast i32 %163 to float
- %165 = fsub float %164, %162
- %166 = bitcast float %140 to i32
- store i32 %166, i32 addrspace(3)* %149
- %167 = load i32, i32 addrspace(3)* %151
- %168 = bitcast i32 %167 to float
- %169 = load i32, i32 addrspace(3)* %153
- %170 = bitcast i32 %169 to float
- %171 = fsub float %170, %168
- %172 = bitcast float %141 to i32
- store i32 %172, i32 addrspace(3)* %149
- %173 = load i32, i32 addrspace(3)* %151
- %174 = bitcast i32 %173 to float
- %175 = load i32, i32 addrspace(3)* %153
- %176 = bitcast i32 %175 to float
- %177 = fsub float %176, %174
- %178 = insertelement <4 x float> undef, float %159, i32 0
- %179 = insertelement <4 x float> %178, float %165, i32 1
- %180 = insertelement <4 x float> %179, float %171, i32 2
- %181 = insertelement <4 x float> %180, float %177, i32 3
- %182 = extractelement <4 x float> %181, i32 0
- %183 = extractelement <4 x float> %181, i32 1
- %184 = fdiv float 1.000000e+00, %97
- %185 = fmul float %33, %184
- %186 = fcmp uge float 1.000000e+00, %185
- %187 = select i1 %186, float %185, float 1.000000e+00
- %188 = fmul float %187, %30
- %189 = call float @ceil(float %188)
- %190 = fcmp uge float 3.000000e+00, %189
- %191 = select i1 %190, float 3.000000e+00, float %189
- %192 = fdiv float 1.000000e+00, %191
- %193 = fdiv float 1.000000e+00, %30
- %194 = fmul float %191, %193
- %195 = fmul float %31, %194
- %196 = fmul float %95, %95
- %197 = fmul float %96, %96
- %198 = fadd float %197, %196
- %199 = fmul float %97, %97
- %200 = fadd float %198, %199
- %201 = call float @llvm.AMDGPU.rsq.f32(float %200)
- %202 = fmul float %95, %201
- %203 = fmul float %96, %201
- %204 = fmul float %202, %29
- %205 = fmul float %203, %29
- %206 = fmul float %204, -1.000000e+00
- %207 = fmul float %205, 1.000000e+00
- %208 = fmul float %206, %32
- %209 = fmul float %207, %32
- %210 = fsub float -0.000000e+00, %208
- %211 = fadd float %93, %210
- %212 = fsub float -0.000000e+00, %209
- %213 = fadd float %94, %212
- %214 = fmul float %206, %192
- %215 = fmul float %207, %192
- %216 = fmul float -1.000000e+00, %192
- %217 = bitcast float %136 to i32
- %218 = bitcast float %182 to i32
- %219 = bitcast float %137 to i32
- %220 = bitcast float %183 to i32
- %221 = insertelement <8 x i32> undef, i32 %217, i32 0
- %222 = insertelement <8 x i32> %221, i32 %218, i32 1
- %223 = insertelement <8 x i32> %222, i32 %219, i32 2
- %224 = insertelement <8 x i32> %223, i32 %220, i32 3
+ %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
+ %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 96)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 100)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 104)
+ %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 112)
+ %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 116)
+ %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 120)
+ %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 128)
+ %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 132)
+ %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 140)
+ %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 144)
+ %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 160)
+ %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 176)
+ %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 180)
+ %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 184)
+ %tmp36 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 192)
+ %tmp37 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 196)
+ %tmp38 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 200)
+ %tmp39 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 208)
+ %tmp40 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 212)
+ %tmp41 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 216)
+ %tmp42 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 224)
+ %tmp43 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 240)
+ %tmp44 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 244)
+ %tmp45 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 248)
+ %tmp46 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 256)
+ %tmp47 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 272)
+ %tmp48 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 276)
+ %tmp49 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 280)
+ %tmp50 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 288)
+ %tmp51 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 292)
+ %tmp52 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 296)
+ %tmp53 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 304)
+ %tmp54 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 308)
+ %tmp55 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 312)
+ %tmp56 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 368)
+ %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 372)
+ %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 376)
+ %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 384)
+ %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
+ %tmp61 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp60, !tbaa !0
+ %tmp62 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0
+ %tmp63 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp62, !tbaa !0
+ %tmp63.bc = bitcast <16 x i8> %tmp63 to <4 x i32>
+ %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1
+ %tmp65 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp64, !tbaa !0
+ %tmp66 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1
+ %tmp67 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp66, !tbaa !0
+ %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2
+ %tmp69 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp68, !tbaa !0
+ %tmp70 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2
+ %tmp71 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp70, !tbaa !0
+ %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3
+ %tmp73 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp72, !tbaa !0
+ %tmp74 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3
+ %tmp75 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp74, !tbaa !0
+ %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4
+ %tmp77 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp76, !tbaa !0
+ %tmp78 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4
+ %tmp79 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp78, !tbaa !0
+ %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5
+ %tmp81 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp80, !tbaa !0
+ %tmp82 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5
+ %tmp83 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp82, !tbaa !0
+ %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6
+ %tmp85 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp84, !tbaa !0
+ %tmp86 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6
+ %tmp87 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp86, !tbaa !0
+ %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7
+ %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0
+ %tmp90 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7
+ %tmp91 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp90, !tbaa !0
+ %tmp92 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp93 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp94 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp95 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp96 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp97 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp98 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp99 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp100 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp101 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp102 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp103 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp104 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp105 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp106 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp107 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp108 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
+ %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
+ %tmp111 = bitcast float %tmp92 to i32
+ store i32 %tmp111, i32 addrspace(3)* %tmp110
+ %tmp112 = bitcast float %tmp93 to i32
+ store i32 %tmp112, i32 addrspace(3)* %tmp110
+ %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
+ %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113
+ %tmp115 = and i32 %tmp113, -4
+ %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
+ %tmp117 = add i32 %tmp115, 1
+ %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
+ %tmp119 = bitcast float %tmp92 to i32
+ store i32 %tmp119, i32 addrspace(3)* %tmp114
+ %tmp120 = load i32, i32 addrspace(3)* %tmp116
+ %tmp121 = bitcast i32 %tmp120 to float
+ %tmp122 = load i32, i32 addrspace(3)* %tmp118
+ %tmp123 = bitcast i32 %tmp122 to float
+ %tmp124 = fsub float %tmp123, %tmp121
+ %tmp125 = bitcast float %tmp93 to i32
+ store i32 %tmp125, i32 addrspace(3)* %tmp114
+ %tmp126 = load i32, i32 addrspace(3)* %tmp116
+ %tmp127 = bitcast i32 %tmp126 to float
+ %tmp128 = load i32, i32 addrspace(3)* %tmp118
+ %tmp129 = bitcast i32 %tmp128 to float
+ %tmp130 = fsub float %tmp129, %tmp127
+ %tmp131 = insertelement <4 x float> undef, float %tmp124, i32 0
+ %tmp132 = insertelement <4 x float> %tmp131, float %tmp130, i32 1
+ %tmp133 = insertelement <4 x float> %tmp132, float %tmp130, i32 2
+ %tmp134 = insertelement <4 x float> %tmp133, float %tmp130, i32 3
+ %tmp135 = extractelement <4 x float> %tmp134, i32 0
+ %tmp136 = extractelement <4 x float> %tmp134, i32 1
+ %tmp137 = fmul float %tmp59, %tmp92
+ %tmp138 = fmul float %tmp59, %tmp93
+ %tmp139 = fmul float %tmp59, %tmp93
+ %tmp140 = fmul float %tmp59, %tmp93
+ %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
+ %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
+ %tmp143 = bitcast float %tmp137 to i32
+ store i32 %tmp143, i32 addrspace(3)* %tmp142
+ %tmp144 = bitcast float %tmp138 to i32
+ store i32 %tmp144, i32 addrspace(3)* %tmp142
+ %tmp145 = bitcast float %tmp139 to i32
+ store i32 %tmp145, i32 addrspace(3)* %tmp142
+ %tmp146 = bitcast float %tmp140 to i32
+ store i32 %tmp146, i32 addrspace(3)* %tmp142
+ %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
+ %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147
+ %tmp149 = and i32 %tmp147, -4
+ %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149
+ %tmp151 = add i32 %tmp149, 2
+ %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151
+ %tmp153 = bitcast float %tmp137 to i32
+ store i32 %tmp153, i32 addrspace(3)* %tmp148
+ %tmp154 = load i32, i32 addrspace(3)* %tmp150
+ %tmp155 = bitcast i32 %tmp154 to float
+ %tmp156 = load i32, i32 addrspace(3)* %tmp152
+ %tmp157 = bitcast i32 %tmp156 to float
+ %tmp158 = fsub float %tmp157, %tmp155
+ %tmp159 = bitcast float %tmp138 to i32
+ store i32 %tmp159, i32 addrspace(3)* %tmp148
+ %tmp160 = load i32, i32 addrspace(3)* %tmp150
+ %tmp161 = bitcast i32 %tmp160 to float
+ %tmp162 = load i32, i32 addrspace(3)* %tmp152
+ %tmp163 = bitcast i32 %tmp162 to float
+ %tmp164 = fsub float %tmp163, %tmp161
+ %tmp165 = bitcast float %tmp139 to i32
+ store i32 %tmp165, i32 addrspace(3)* %tmp148
+ %tmp166 = load i32, i32 addrspace(3)* %tmp150
+ %tmp167 = bitcast i32 %tmp166 to float
+ %tmp168 = load i32, i32 addrspace(3)* %tmp152
+ %tmp169 = bitcast i32 %tmp168 to float
+ %tmp170 = fsub float %tmp169, %tmp167
+ %tmp171 = bitcast float %tmp140 to i32
+ store i32 %tmp171, i32 addrspace(3)* %tmp148
+ %tmp172 = load i32, i32 addrspace(3)* %tmp150
+ %tmp173 = bitcast i32 %tmp172 to float
+ %tmp174 = load i32, i32 addrspace(3)* %tmp152
+ %tmp175 = bitcast i32 %tmp174 to float
+ %tmp176 = fsub float %tmp175, %tmp173
+ %tmp177 = insertelement <4 x float> undef, float %tmp158, i32 0
+ %tmp178 = insertelement <4 x float> %tmp177, float %tmp164, i32 1
+ %tmp179 = insertelement <4 x float> %tmp178, float %tmp170, i32 2
+ %tmp180 = insertelement <4 x float> %tmp179, float %tmp176, i32 3
+ %tmp181 = extractelement <4 x float> %tmp180, i32 0
+ %tmp182 = extractelement <4 x float> %tmp180, i32 1
+ %tmp183 = fdiv float 1.000000e+00, %tmp96
+ %tmp184 = fmul float %tmp32, %tmp183
+ %tmp185 = fcmp uge float 1.000000e+00, %tmp184
+ %tmp186 = select i1 %tmp185, float %tmp184, float 1.000000e+00
+ %tmp187 = fmul float %tmp186, %tmp29
+ %tmp188 = call float @ceil(float %tmp187)
+ %tmp189 = fcmp uge float 3.000000e+00, %tmp188
+ %tmp190 = select i1 %tmp189, float 3.000000e+00, float %tmp188
+ %tmp191 = fdiv float 1.000000e+00, %tmp190
+ %tmp192 = fdiv float 1.000000e+00, %tmp29
+ %tmp193 = fmul float %tmp190, %tmp192
+ %tmp194 = fmul float %tmp30, %tmp193
+ %tmp195 = fmul float %tmp94, %tmp94
+ %tmp196 = fmul float %tmp95, %tmp95
+ %tmp197 = fadd float %tmp196, %tmp195
+ %tmp198 = fmul float %tmp96, %tmp96
+ %tmp199 = fadd float %tmp197, %tmp198
+ %tmp200 = call float @llvm.amdgcn.rsq.f32(float %tmp199)
+ %tmp201 = fmul float %tmp94, %tmp200
+ %tmp202 = fmul float %tmp95, %tmp200
+ %tmp203 = fmul float %tmp201, %tmp28
+ %tmp204 = fmul float %tmp202, %tmp28
+ %tmp205 = fmul float %tmp203, -1.000000e+00
+ %tmp206 = fmul float %tmp204, 1.000000e+00
+ %tmp207 = fmul float %tmp205, %tmp31
+ %tmp208 = fmul float %tmp206, %tmp31
+ %tmp209 = fsub float -0.000000e+00, %tmp207
+ %tmp210 = fadd float %tmp92, %tmp209
+ %tmp211 = fsub float -0.000000e+00, %tmp208
+ %tmp212 = fadd float %tmp93, %tmp211
+ %tmp213 = fmul float %tmp205, %tmp191
+ %tmp214 = fmul float %tmp206, %tmp191
+ %tmp215 = fmul float -1.000000e+00, %tmp191
+ %tmp216 = bitcast float %tmp135 to i32
+ %tmp217 = bitcast float %tmp181 to i32
+ %tmp218 = bitcast float %tmp136 to i32
+ %tmp219 = bitcast float %tmp182 to i32
+ %tmp220 = insertelement <8 x i32> undef, i32 %tmp216, i32 0
+ %tmp221 = insertelement <8 x i32> %tmp220, i32 %tmp217, i32 1
+ %tmp222 = insertelement <8 x i32> %tmp221, i32 %tmp218, i32 2
+ %tmp223 = insertelement <8 x i32> %tmp222, i32 %tmp219, i32 3
br label %LOOP
LOOP: ; preds = %ENDIF, %main_body
- %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ]
- %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ]
- %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ]
- %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ]
- %225 = fcmp oge float %temp24.0, %191
- %226 = sext i1 %225 to i32
- %227 = bitcast i32 %226 to float
- %228 = bitcast float %227 to i32
- %229 = icmp ne i32 %228, 0
- br i1 %229, label %IF, label %ENDIF
+ %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp257, %ENDIF ]
+ %temp28.0 = phi float [ %tmp210, %main_body ], [ %tmp252, %ENDIF ]
+ %temp29.0 = phi float [ %tmp212, %main_body ], [ %tmp254, %ENDIF ]
+ %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp256, %ENDIF ]
+ %tmp224 = fcmp oge float %temp24.0, %tmp190
+ %tmp225 = sext i1 %tmp224 to i32
+ %tmp226 = bitcast i32 %tmp225 to float
+ %tmp227 = bitcast float %tmp226 to i32
+ %tmp228 = icmp ne i32 %tmp227, 0
+ br i1 %tmp228, label %IF, label %ENDIF
IF: ; preds = %LOOP
- %230 = bitcast float %136 to i32
- %231 = bitcast float %182 to i32
- %232 = bitcast float %137 to i32
- %233 = bitcast float %183 to i32
- %234 = insertelement <8 x i32> undef, i32 %230, i32 0
- %235 = insertelement <8 x i32> %234, i32 %231, i32 1
- %236 = insertelement <8 x i32> %235, i32 %232, i32 2
- %237 = insertelement <8 x i32> %236, i32 %233, i32 3
+ %tmp229 = bitcast float %tmp135 to i32
+ %tmp230 = bitcast float %tmp181 to i32
+ %tmp231 = bitcast float %tmp136 to i32
+ %tmp232 = bitcast float %tmp182 to i32
+ %tmp233 = insertelement <8 x i32> undef, i32 %tmp229, i32 0
+ %tmp234 = insertelement <8 x i32> %tmp233, i32 %tmp230, i32 1
+ %tmp235 = insertelement <8 x i32> %tmp234, i32 %tmp231, i32 2
+ %tmp236 = insertelement <8 x i32> %tmp235, i32 %tmp232, i32 3
br label %LOOP65
ENDIF: ; preds = %LOOP
- %238 = bitcast float %temp28.0 to i32
- %239 = bitcast float %temp29.0 to i32
- %240 = insertelement <8 x i32> %224, i32 %238, i32 4
- %241 = insertelement <8 x i32> %240, i32 %239, i32 5
- %242 = insertelement <8 x i32> %241, i32 undef, i32 6
- %243 = insertelement <8 x i32> %242, i32 undef, i32 7
- %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2)
- %245 = extractelement <4 x float> %244, i32 3
- %246 = fcmp oge float %temp30.0, %245
- %247 = sext i1 %246 to i32
- %248 = bitcast i32 %247 to float
- %249 = bitcast float %248 to i32
- %250 = and i32 %249, 1065353216
- %251 = bitcast i32 %250 to float
- %252 = fmul float %214, %251
- %253 = fadd float %252, %temp28.0
- %254 = fmul float %215, %251
- %255 = fadd float %254, %temp29.0
- %256 = fmul float %216, %251
- %257 = fadd float %256, %temp30.0
- %258 = fadd float %temp24.0, 1.000000e+00
+ %tmp237 = bitcast float %temp28.0 to i32
+ %tmp238 = bitcast float %temp29.0 to i32
+ %tmp239 = insertelement <8 x i32> %tmp223, i32 %tmp237, i32 4
+ %tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5
+ %tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6
+ %tmp242 = insertelement <8 x i32> %tmp241, i32 undef, i32 7
+ %tmp243 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp242, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp244 = extractelement <4 x float> %tmp243, i32 3
+ %tmp245 = fcmp oge float %temp30.0, %tmp244
+ %tmp246 = sext i1 %tmp245 to i32
+ %tmp247 = bitcast i32 %tmp246 to float
+ %tmp248 = bitcast float %tmp247 to i32
+ %tmp249 = and i32 %tmp248, 1065353216
+ %tmp250 = bitcast i32 %tmp249 to float
+ %tmp251 = fmul float %tmp213, %tmp250
+ %tmp252 = fadd float %tmp251, %temp28.0
+ %tmp253 = fmul float %tmp214, %tmp250
+ %tmp254 = fadd float %tmp253, %temp29.0
+ %tmp255 = fmul float %tmp215, %tmp250
+ %tmp256 = fadd float %tmp255, %temp30.0
+ %tmp257 = fadd float %temp24.0, 1.000000e+00
br label %LOOP
LOOP65: ; preds = %ENDIF66, %IF
- %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ]
- %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ]
- %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ]
- %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ]
- %temp32.0 = phi float [ 1.000000e+00, %IF ], [ %611, %ENDIF66 ]
- %259 = fcmp oge float %temp24.1, %195
- %260 = sext i1 %259 to i32
- %261 = bitcast i32 %260 to float
- %262 = bitcast float %261 to i32
- %263 = icmp ne i32 %262, 0
- br i1 %263, label %IF67, label %ENDIF66
+ %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %tmp609, %ENDIF66 ]
+ %temp28.1 = phi float [ %temp28.0, %IF ], [ %tmp604, %ENDIF66 ]
+ %temp29.1 = phi float [ %temp29.0, %IF ], [ %tmp606, %ENDIF66 ]
+ %temp30.1 = phi float [ %temp30.0, %IF ], [ %tmp608, %ENDIF66 ]
+ %temp32.0 = phi float [ 1.000000e+00, %IF ], [ %tmp610, %ENDIF66 ]
+ %tmp258 = fcmp oge float %temp24.1, %tmp194
+ %tmp259 = sext i1 %tmp258 to i32
+ %tmp260 = bitcast i32 %tmp259 to float
+ %tmp261 = bitcast float %tmp260 to i32
+ %tmp262 = icmp ne i32 %tmp261, 0
+ br i1 %tmp262, label %IF67, label %ENDIF66
IF67: ; preds = %LOOP65
- %264 = bitcast float %136 to i32
- %265 = bitcast float %182 to i32
- %266 = bitcast float %137 to i32
- %267 = bitcast float %183 to i32
- %268 = bitcast float %temp28.1 to i32
- %269 = bitcast float %temp29.1 to i32
- %270 = insertelement <8 x i32> undef, i32 %264, i32 0
- %271 = insertelement <8 x i32> %270, i32 %265, i32 1
- %272 = insertelement <8 x i32> %271, i32 %266, i32 2
- %273 = insertelement <8 x i32> %272, i32 %267, i32 3
- %274 = insertelement <8 x i32> %273, i32 %268, i32 4
- %275 = insertelement <8 x i32> %274, i32 %269, i32 5
- %276 = insertelement <8 x i32> %275, i32 undef, i32 6
- %277 = insertelement <8 x i32> %276, i32 undef, i32 7
- %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2)
- %279 = extractelement <4 x float> %278, i32 0
- %280 = extractelement <4 x float> %278, i32 1
- %281 = extractelement <4 x float> %278, i32 2
- %282 = extractelement <4 x float> %278, i32 3
- %283 = fmul float %282, %47
- %284 = bitcast float %136 to i32
- %285 = bitcast float %182 to i32
- %286 = bitcast float %137 to i32
- %287 = bitcast float %183 to i32
- %288 = bitcast float %temp28.1 to i32
- %289 = bitcast float %temp29.1 to i32
- %290 = insertelement <8 x i32> undef, i32 %284, i32 0
- %291 = insertelement <8 x i32> %290, i32 %285, i32 1
- %292 = insertelement <8 x i32> %291, i32 %286, i32 2
- %293 = insertelement <8 x i32> %292, i32 %287, i32 3
- %294 = insertelement <8 x i32> %293, i32 %288, i32 4
- %295 = insertelement <8 x i32> %294, i32 %289, i32 5
- %296 = insertelement <8 x i32> %295, i32 undef, i32 6
- %297 = insertelement <8 x i32> %296, i32 undef, i32 7
- %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2)
- %299 = extractelement <4 x float> %298, i32 0
- %300 = extractelement <4 x float> %298, i32 1
- %301 = extractelement <4 x float> %298, i32 2
- %302 = bitcast float %136 to i32
- %303 = bitcast float %182 to i32
- %304 = bitcast float %137 to i32
- %305 = bitcast float %183 to i32
- %306 = bitcast float %temp28.1 to i32
- %307 = bitcast float %temp29.1 to i32
- %308 = insertelement <8 x i32> undef, i32 %302, i32 0
- %309 = insertelement <8 x i32> %308, i32 %303, i32 1
- %310 = insertelement <8 x i32> %309, i32 %304, i32 2
- %311 = insertelement <8 x i32> %310, i32 %305, i32 3
- %312 = insertelement <8 x i32> %311, i32 %306, i32 4
- %313 = insertelement <8 x i32> %312, i32 %307, i32 5
- %314 = insertelement <8 x i32> %313, i32 undef, i32 6
- %315 = insertelement <8 x i32> %314, i32 undef, i32 7
- %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2)
- %317 = extractelement <4 x float> %316, i32 0
- %318 = extractelement <4 x float> %316, i32 1
- %319 = extractelement <4 x float> %316, i32 2
- %320 = fmul float %317, %23
- %321 = fmul float %318, %24
- %322 = fmul float %319, %25
- %323 = fmul float %299, %26
- %324 = fadd float %323, %320
- %325 = fmul float %300, %27
- %326 = fadd float %325, %321
- %327 = fmul float %301, %28
- %328 = fadd float %327, %322
- %329 = fadd float %279, %324
- %330 = fadd float %280, %326
- %331 = fadd float %281, %328
- %332 = bitcast float %136 to i32
- %333 = bitcast float %182 to i32
- %334 = bitcast float %137 to i32
- %335 = bitcast float %183 to i32
- %336 = bitcast float %temp28.1 to i32
- %337 = bitcast float %temp29.1 to i32
- %338 = insertelement <8 x i32> undef, i32 %332, i32 0
- %339 = insertelement <8 x i32> %338, i32 %333, i32 1
- %340 = insertelement <8 x i32> %339, i32 %334, i32 2
- %341 = insertelement <8 x i32> %340, i32 %335, i32 3
- %342 = insertelement <8 x i32> %341, i32 %336, i32 4
- %343 = insertelement <8 x i32> %342, i32 %337, i32 5
- %344 = insertelement <8 x i32> %343, i32 undef, i32 6
- %345 = insertelement <8 x i32> %344, i32 undef, i32 7
- %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2)
- %347 = extractelement <4 x float> %346, i32 0
- %348 = extractelement <4 x float> %346, i32 1
- %349 = extractelement <4 x float> %346, i32 2
- %350 = fadd float %347, -5.000000e-01
- %351 = fadd float %348, -5.000000e-01
- %352 = fadd float %349, -5.000000e-01
- %353 = fmul float %350, %350
- %354 = fmul float %351, %351
- %355 = fadd float %354, %353
- %356 = fmul float %352, %352
- %357 = fadd float %355, %356
- %358 = call float @llvm.AMDGPU.rsq.f32(float %357)
- %359 = fmul float %350, %358
- %360 = fmul float %351, %358
- %361 = fmul float %352, %358
- %362 = bitcast float %136 to i32
- %363 = bitcast float %182 to i32
- %364 = bitcast float %137 to i32
- %365 = bitcast float %183 to i32
- %366 = bitcast float %temp28.1 to i32
- %367 = bitcast float %temp29.1 to i32
- %368 = insertelement <8 x i32> undef, i32 %362, i32 0
- %369 = insertelement <8 x i32> %368, i32 %363, i32 1
- %370 = insertelement <8 x i32> %369, i32 %364, i32 2
- %371 = insertelement <8 x i32> %370, i32 %365, i32 3
- %372 = insertelement <8 x i32> %371, i32 %366, i32 4
- %373 = insertelement <8 x i32> %372, i32 %367, i32 5
- %374 = insertelement <8 x i32> %373, i32 undef, i32 6
- %375 = insertelement <8 x i32> %374, i32 undef, i32 7
- %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2)
- %377 = extractelement <4 x float> %376, i32 0
- %378 = extractelement <4 x float> %376, i32 1
- %379 = extractelement <4 x float> %376, i32 2
- %380 = extractelement <4 x float> %376, i32 3
- %381 = fsub float -0.000000e+00, %95
- %382 = fsub float -0.000000e+00, %96
- %383 = fsub float -0.000000e+00, %97
- %384 = fmul float %359, %381
- %385 = fmul float %360, %382
- %386 = fadd float %385, %384
- %387 = fmul float %361, %383
- %388 = fadd float %386, %387
- %389 = fmul float %388, %359
- %390 = fmul float %388, %360
- %391 = fmul float %388, %361
- %392 = fmul float 2.000000e+00, %389
- %393 = fmul float 2.000000e+00, %390
- %394 = fmul float 2.000000e+00, %391
- %395 = fsub float -0.000000e+00, %392
- %396 = fadd float %381, %395
- %397 = fsub float -0.000000e+00, %393
- %398 = fadd float %382, %397
- %399 = fsub float -0.000000e+00, %394
- %400 = fadd float %383, %399
- %401 = fmul float %396, %98
- %402 = fmul float %396, %99
- %403 = fmul float %396, %100
- %404 = fmul float %398, %101
- %405 = fadd float %404, %401
- %406 = fmul float %398, %102
- %407 = fadd float %406, %402
- %408 = fmul float %398, %103
- %409 = fadd float %408, %403
- %410 = fmul float %400, %104
- %411 = fadd float %410, %405
- %412 = fmul float %400, %105
- %413 = fadd float %412, %407
- %414 = fmul float %400, %106
- %415 = fadd float %414, %409
- %416 = bitcast float %136 to i32
- %417 = bitcast float %182 to i32
- %418 = bitcast float %137 to i32
- %419 = bitcast float %183 to i32
- %420 = bitcast float %temp28.1 to i32
- %421 = bitcast float %temp29.1 to i32
- %422 = insertelement <8 x i32> undef, i32 %416, i32 0
- %423 = insertelement <8 x i32> %422, i32 %417, i32 1
- %424 = insertelement <8 x i32> %423, i32 %418, i32 2
- %425 = insertelement <8 x i32> %424, i32 %419, i32 3
- %426 = insertelement <8 x i32> %425, i32 %420, i32 4
- %427 = insertelement <8 x i32> %426, i32 %421, i32 5
- %428 = insertelement <8 x i32> %427, i32 undef, i32 6
- %429 = insertelement <8 x i32> %428, i32 undef, i32 7
- %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2)
- %431 = extractelement <4 x float> %430, i32 0
- %432 = extractelement <4 x float> %430, i32 1
- %433 = extractelement <4 x float> %430, i32 2
- %434 = fmul float %48, %411
- %435 = fmul float %49, %411
- %436 = fmul float %50, %411
- %437 = fmul float %51, %413
- %438 = fadd float %437, %434
- %439 = fmul float %52, %413
- %440 = fadd float %439, %435
- %441 = fmul float %53, %413
- %442 = fadd float %441, %436
- %443 = fmul float %54, %415
- %444 = fadd float %443, %438
- %445 = fmul float %55, %415
- %446 = fadd float %445, %440
- %447 = fmul float %56, %415
- %448 = fadd float %447, %442
- %449 = insertelement <4 x float> undef, float %444, i32 0
- %450 = insertelement <4 x float> %449, float %446, i32 1
- %451 = insertelement <4 x float> %450, float %448, i32 2
- %452 = insertelement <4 x float> %451, float %195, i32 3
- %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452)
- %454 = extractelement <4 x float> %453, i32 0
- %455 = extractelement <4 x float> %453, i32 1
- %456 = extractelement <4 x float> %453, i32 2
- %457 = extractelement <4 x float> %453, i32 3
- %458 = call float @fabs(float %456)
- %459 = fdiv float 1.000000e+00, %458
- %460 = fmul float %454, %459
- %461 = fadd float %460, 1.500000e+00
- %462 = fmul float %455, %459
- %463 = fadd float %462, 1.500000e+00
- %464 = bitcast float %463 to i32
- %465 = bitcast float %461 to i32
- %466 = bitcast float %457 to i32
- %467 = insertelement <4 x i32> undef, i32 %464, i32 0
- %468 = insertelement <4 x i32> %467, i32 %465, i32 1
- %469 = insertelement <4 x i32> %468, i32 %466, i32 2
- %470 = insertelement <4 x i32> %469, i32 undef, i32 3
- %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4)
- %472 = extractelement <4 x float> %471, i32 0
- %473 = extractelement <4 x float> %471, i32 1
- %474 = extractelement <4 x float> %471, i32 2
- %475 = fmul float %431, %472
- %476 = fadd float %475, %329
- %477 = fmul float %432, %473
- %478 = fadd float %477, %330
- %479 = fmul float %433, %474
- %480 = fadd float %479, %331
- %481 = fmul float %107, %107
- %482 = fmul float %108, %108
- %483 = fadd float %482, %481
- %484 = fmul float %109, %109
- %485 = fadd float %483, %484
- %486 = call float @llvm.AMDGPU.rsq.f32(float %485)
- %487 = fmul float %107, %486
- %488 = fmul float %108, %486
- %489 = fmul float %109, %486
- %490 = fmul float %377, %40
- %491 = fmul float %378, %41
- %492 = fmul float %379, %42
- %493 = fmul float %359, %487
- %494 = fmul float %360, %488
- %495 = fadd float %494, %493
- %496 = fmul float %361, %489
- %497 = fadd float %495, %496
- %498 = fmul float %497, %359
- %499 = fmul float %497, %360
- %500 = fmul float %497, %361
- %501 = fmul float 2.000000e+00, %498
- %502 = fmul float 2.000000e+00, %499
- %503 = fmul float 2.000000e+00, %500
- %504 = fsub float -0.000000e+00, %501
- %505 = fadd float %487, %504
- %506 = fsub float -0.000000e+00, %502
- %507 = fadd float %488, %506
- %508 = fsub float -0.000000e+00, %503
- %509 = fadd float %489, %508
- %510 = fmul float %95, %95
- %511 = fmul float %96, %96
- %512 = fadd float %511, %510
- %513 = fmul float %97, %97
- %514 = fadd float %512, %513
- %515 = call float @llvm.AMDGPU.rsq.f32(float %514)
- %516 = fmul float %95, %515
- %517 = fmul float %96, %515
- %518 = fmul float %97, %515
- %519 = fmul float %505, %516
- %520 = fmul float %507, %517
- %521 = fadd float %520, %519
- %522 = fmul float %509, %518
- %523 = fadd float %521, %522
- %524 = fsub float -0.000000e+00, %523
- %525 = fcmp uge float %524, 0.000000e+00
- %526 = select i1 %525, float %524, float 0.000000e+00
- %527 = fmul float %43, %380
- %528 = fadd float %527, 1.000000e+00
- %529 = call float @llvm.pow.f32(float %526, float %528)
- %530 = fmul float %476, %37
- %531 = fmul float %478, %38
- %532 = fmul float %480, %39
- %533 = fmul float %359, %487
- %534 = fmul float %360, %488
- %535 = fadd float %534, %533
- %536 = fmul float %361, %489
- %537 = fadd float %535, %536
- %538 = fcmp uge float %537, 0.000000e+00
- %539 = select i1 %538, float %537, float 0.000000e+00
- %540 = fmul float %530, %539
- %541 = fmul float %531, %539
- %542 = fmul float %532, %539
- %543 = fmul float %490, %529
- %544 = fadd float %543, %540
- %545 = fmul float %491, %529
- %546 = fadd float %545, %541
- %547 = fmul float %492, %529
- %548 = fadd float %547, %542
- %549 = fmul float %476, %34
- %550 = fmul float %478, %35
- %551 = fmul float %480, %36
- %552 = fmul float %544, %57
- %553 = fadd float %552, %549
- %554 = fmul float %546, %58
- %555 = fadd float %554, %550
- %556 = fmul float %548, %59
- %557 = fadd float %556, %551
- %558 = bitcast float %136 to i32
- %559 = bitcast float %182 to i32
- %560 = bitcast float %137 to i32
- %561 = bitcast float %183 to i32
- %562 = bitcast float %temp28.1 to i32
- %563 = bitcast float %temp29.1 to i32
- %564 = insertelement <8 x i32> undef, i32 %558, i32 0
- %565 = insertelement <8 x i32> %564, i32 %559, i32 1
- %566 = insertelement <8 x i32> %565, i32 %560, i32 2
- %567 = insertelement <8 x i32> %566, i32 %561, i32 3
- %568 = insertelement <8 x i32> %567, i32 %562, i32 4
- %569 = insertelement <8 x i32> %568, i32 %563, i32 5
- %570 = insertelement <8 x i32> %569, i32 undef, i32 6
- %571 = insertelement <8 x i32> %570, i32 undef, i32 7
- %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2)
- %573 = extractelement <4 x float> %572, i32 0
- %574 = extractelement <4 x float> %572, i32 1
- %575 = extractelement <4 x float> %572, i32 2
- %576 = fmul float %573, %44
- %577 = fadd float %576, %553
- %578 = fmul float %574, %45
- %579 = fadd float %578, %555
- %580 = fmul float %575, %46
- %581 = fadd float %580, %557
- %582 = call i32 @llvm.SI.packf16(float %577, float %579)
- %583 = bitcast i32 %582 to float
- %584 = call i32 @llvm.SI.packf16(float %581, float %283)
- %585 = bitcast i32 %584 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585)
+ %tmp263 = bitcast float %tmp135 to i32
+ %tmp264 = bitcast float %tmp181 to i32
+ %tmp265 = bitcast float %tmp136 to i32
+ %tmp266 = bitcast float %tmp182 to i32
+ %tmp267 = bitcast float %temp28.1 to i32
+ %tmp268 = bitcast float %temp29.1 to i32
+ %tmp269 = insertelement <8 x i32> undef, i32 %tmp263, i32 0
+ %tmp270 = insertelement <8 x i32> %tmp269, i32 %tmp264, i32 1
+ %tmp271 = insertelement <8 x i32> %tmp270, i32 %tmp265, i32 2
+ %tmp272 = insertelement <8 x i32> %tmp271, i32 %tmp266, i32 3
+ %tmp273 = insertelement <8 x i32> %tmp272, i32 %tmp267, i32 4
+ %tmp274 = insertelement <8 x i32> %tmp273, i32 %tmp268, i32 5
+ %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6
+ %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7
+ %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32>
+ %tmp277 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp276, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp278 = extractelement <4 x float> %tmp277, i32 0
+ %tmp279 = extractelement <4 x float> %tmp277, i32 1
+ %tmp280 = extractelement <4 x float> %tmp277, i32 2
+ %tmp281 = extractelement <4 x float> %tmp277, i32 3
+ %tmp282 = fmul float %tmp281, %tmp46
+ %tmp283 = bitcast float %tmp135 to i32
+ %tmp284 = bitcast float %tmp181 to i32
+ %tmp285 = bitcast float %tmp136 to i32
+ %tmp286 = bitcast float %tmp182 to i32
+ %tmp287 = bitcast float %temp28.1 to i32
+ %tmp288 = bitcast float %temp29.1 to i32
+ %tmp289 = insertelement <8 x i32> undef, i32 %tmp283, i32 0
+ %tmp290 = insertelement <8 x i32> %tmp289, i32 %tmp284, i32 1
+ %tmp291 = insertelement <8 x i32> %tmp290, i32 %tmp285, i32 2
+ %tmp292 = insertelement <8 x i32> %tmp291, i32 %tmp286, i32 3
+ %tmp293 = insertelement <8 x i32> %tmp292, i32 %tmp287, i32 4
+ %tmp294 = insertelement <8 x i32> %tmp293, i32 %tmp288, i32 5
+ %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6
+ %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7
+ %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32>
+ %tmp297 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp296, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp298 = extractelement <4 x float> %tmp297, i32 0
+ %tmp299 = extractelement <4 x float> %tmp297, i32 1
+ %tmp300 = extractelement <4 x float> %tmp297, i32 2
+ %tmp301 = bitcast float %tmp135 to i32
+ %tmp302 = bitcast float %tmp181 to i32
+ %tmp303 = bitcast float %tmp136 to i32
+ %tmp304 = bitcast float %tmp182 to i32
+ %tmp305 = bitcast float %temp28.1 to i32
+ %tmp306 = bitcast float %temp29.1 to i32
+ %tmp307 = insertelement <8 x i32> undef, i32 %tmp301, i32 0
+ %tmp308 = insertelement <8 x i32> %tmp307, i32 %tmp302, i32 1
+ %tmp309 = insertelement <8 x i32> %tmp308, i32 %tmp303, i32 2
+ %tmp310 = insertelement <8 x i32> %tmp309, i32 %tmp304, i32 3
+ %tmp311 = insertelement <8 x i32> %tmp310, i32 %tmp305, i32 4
+ %tmp312 = insertelement <8 x i32> %tmp311, i32 %tmp306, i32 5
+ %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6
+ %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7
+ %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32>
+ %tmp315 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp314, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp316 = extractelement <4 x float> %tmp315, i32 0
+ %tmp317 = extractelement <4 x float> %tmp315, i32 1
+ %tmp318 = extractelement <4 x float> %tmp315, i32 2
+ %tmp319 = fmul float %tmp316, %tmp22
+ %tmp320 = fmul float %tmp317, %tmp23
+ %tmp321 = fmul float %tmp318, %tmp24
+ %tmp322 = fmul float %tmp298, %tmp25
+ %tmp323 = fadd float %tmp322, %tmp319
+ %tmp324 = fmul float %tmp299, %tmp26
+ %tmp325 = fadd float %tmp324, %tmp320
+ %tmp326 = fmul float %tmp300, %tmp27
+ %tmp327 = fadd float %tmp326, %tmp321
+ %tmp328 = fadd float %tmp278, %tmp323
+ %tmp329 = fadd float %tmp279, %tmp325
+ %tmp330 = fadd float %tmp280, %tmp327
+ %tmp331 = bitcast float %tmp135 to i32
+ %tmp332 = bitcast float %tmp181 to i32
+ %tmp333 = bitcast float %tmp136 to i32
+ %tmp334 = bitcast float %tmp182 to i32
+ %tmp335 = bitcast float %temp28.1 to i32
+ %tmp336 = bitcast float %temp29.1 to i32
+ %tmp337 = insertelement <8 x i32> undef, i32 %tmp331, i32 0
+ %tmp338 = insertelement <8 x i32> %tmp337, i32 %tmp332, i32 1
+ %tmp339 = insertelement <8 x i32> %tmp338, i32 %tmp333, i32 2
+ %tmp340 = insertelement <8 x i32> %tmp339, i32 %tmp334, i32 3
+ %tmp341 = insertelement <8 x i32> %tmp340, i32 %tmp335, i32 4
+ %tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5
+ %tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6
+ %tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7
+ %tmp345 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp344, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp346 = extractelement <4 x float> %tmp345, i32 0
+ %tmp347 = extractelement <4 x float> %tmp345, i32 1
+ %tmp348 = extractelement <4 x float> %tmp345, i32 2
+ %tmp349 = fadd float %tmp346, -5.000000e-01
+ %tmp350 = fadd float %tmp347, -5.000000e-01
+ %tmp351 = fadd float %tmp348, -5.000000e-01
+ %tmp352 = fmul float %tmp349, %tmp349
+ %tmp353 = fmul float %tmp350, %tmp350
+ %tmp354 = fadd float %tmp353, %tmp352
+ %tmp355 = fmul float %tmp351, %tmp351
+ %tmp356 = fadd float %tmp354, %tmp355
+ %tmp357 = call float @llvm.amdgcn.rsq.f32(float %tmp356)
+ %tmp358 = fmul float %tmp349, %tmp357
+ %tmp359 = fmul float %tmp350, %tmp357
+ %tmp360 = fmul float %tmp351, %tmp357
+ %tmp361 = bitcast float %tmp135 to i32
+ %tmp362 = bitcast float %tmp181 to i32
+ %tmp363 = bitcast float %tmp136 to i32
+ %tmp364 = bitcast float %tmp182 to i32
+ %tmp365 = bitcast float %temp28.1 to i32
+ %tmp366 = bitcast float %temp29.1 to i32
+ %tmp367 = insertelement <8 x i32> undef, i32 %tmp361, i32 0
+ %tmp368 = insertelement <8 x i32> %tmp367, i32 %tmp362, i32 1
+ %tmp369 = insertelement <8 x i32> %tmp368, i32 %tmp363, i32 2
+ %tmp370 = insertelement <8 x i32> %tmp369, i32 %tmp364, i32 3
+ %tmp371 = insertelement <8 x i32> %tmp370, i32 %tmp365, i32 4
+ %tmp372 = insertelement <8 x i32> %tmp371, i32 %tmp366, i32 5
+ %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6
+ %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7
+ %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32>
+ %tmp375 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp374, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp376 = extractelement <4 x float> %tmp375, i32 0
+ %tmp377 = extractelement <4 x float> %tmp375, i32 1
+ %tmp378 = extractelement <4 x float> %tmp375, i32 2
+ %tmp379 = extractelement <4 x float> %tmp375, i32 3
+ %tmp380 = fsub float -0.000000e+00, %tmp94
+ %tmp381 = fsub float -0.000000e+00, %tmp95
+ %tmp382 = fsub float -0.000000e+00, %tmp96
+ %tmp383 = fmul float %tmp358, %tmp380
+ %tmp384 = fmul float %tmp359, %tmp381
+ %tmp385 = fadd float %tmp384, %tmp383
+ %tmp386 = fmul float %tmp360, %tmp382
+ %tmp387 = fadd float %tmp385, %tmp386
+ %tmp388 = fmul float %tmp387, %tmp358
+ %tmp389 = fmul float %tmp387, %tmp359
+ %tmp390 = fmul float %tmp387, %tmp360
+ %tmp391 = fmul float 2.000000e+00, %tmp388
+ %tmp392 = fmul float 2.000000e+00, %tmp389
+ %tmp393 = fmul float 2.000000e+00, %tmp390
+ %tmp394 = fsub float -0.000000e+00, %tmp391
+ %tmp395 = fadd float %tmp380, %tmp394
+ %tmp396 = fsub float -0.000000e+00, %tmp392
+ %tmp397 = fadd float %tmp381, %tmp396
+ %tmp398 = fsub float -0.000000e+00, %tmp393
+ %tmp399 = fadd float %tmp382, %tmp398
+ %tmp400 = fmul float %tmp395, %tmp97
+ %tmp401 = fmul float %tmp395, %tmp98
+ %tmp402 = fmul float %tmp395, %tmp99
+ %tmp403 = fmul float %tmp397, %tmp100
+ %tmp404 = fadd float %tmp403, %tmp400
+ %tmp405 = fmul float %tmp397, %tmp101
+ %tmp406 = fadd float %tmp405, %tmp401
+ %tmp407 = fmul float %tmp397, %tmp102
+ %tmp408 = fadd float %tmp407, %tmp402
+ %tmp409 = fmul float %tmp399, %tmp103
+ %tmp410 = fadd float %tmp409, %tmp404
+ %tmp411 = fmul float %tmp399, %tmp104
+ %tmp412 = fadd float %tmp411, %tmp406
+ %tmp413 = fmul float %tmp399, %tmp105
+ %tmp414 = fadd float %tmp413, %tmp408
+ %tmp415 = bitcast float %tmp135 to i32
+ %tmp416 = bitcast float %tmp181 to i32
+ %tmp417 = bitcast float %tmp136 to i32
+ %tmp418 = bitcast float %tmp182 to i32
+ %tmp419 = bitcast float %temp28.1 to i32
+ %tmp420 = bitcast float %temp29.1 to i32
+ %tmp421 = insertelement <8 x i32> undef, i32 %tmp415, i32 0
+ %tmp422 = insertelement <8 x i32> %tmp421, i32 %tmp416, i32 1
+ %tmp423 = insertelement <8 x i32> %tmp422, i32 %tmp417, i32 2
+ %tmp424 = insertelement <8 x i32> %tmp423, i32 %tmp418, i32 3
+ %tmp425 = insertelement <8 x i32> %tmp424, i32 %tmp419, i32 4
+ %tmp426 = insertelement <8 x i32> %tmp425, i32 %tmp420, i32 5
+ %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6
+ %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7
+ %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32>
+ %tmp429 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp428, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp430 = extractelement <4 x float> %tmp429, i32 0
+ %tmp431 = extractelement <4 x float> %tmp429, i32 1
+ %tmp432 = extractelement <4 x float> %tmp429, i32 2
+ %tmp433 = fmul float %tmp47, %tmp410
+ %tmp434 = fmul float %tmp48, %tmp410
+ %tmp435 = fmul float %tmp49, %tmp410
+ %tmp436 = fmul float %tmp50, %tmp412
+ %tmp437 = fadd float %tmp436, %tmp433
+ %tmp438 = fmul float %tmp51, %tmp412
+ %tmp439 = fadd float %tmp438, %tmp434
+ %tmp440 = fmul float %tmp52, %tmp412
+ %tmp441 = fadd float %tmp440, %tmp435
+ %tmp442 = fmul float %tmp53, %tmp414
+ %tmp443 = fadd float %tmp442, %tmp437
+ %tmp444 = fmul float %tmp54, %tmp414
+ %tmp445 = fadd float %tmp444, %tmp439
+ %tmp446 = fmul float %tmp55, %tmp414
+ %tmp447 = fadd float %tmp446, %tmp441
+ %tmp448 = insertelement <4 x float> undef, float %tmp443, i32 0
+ %tmp449 = insertelement <4 x float> %tmp448, float %tmp445, i32 1
+ %tmp450 = insertelement <4 x float> %tmp449, float %tmp447, i32 2
+ %tmp451 = insertelement <4 x float> %tmp450, float %tmp194, i32 3
+ %tmp452 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp451)
+ %tmp453 = extractelement <4 x float> %tmp452, i32 0
+ %tmp454 = extractelement <4 x float> %tmp452, i32 1
+ %tmp455 = extractelement <4 x float> %tmp452, i32 2
+ %tmp456 = extractelement <4 x float> %tmp452, i32 3
+ %tmp457 = call float @fabs(float %tmp455)
+ %tmp458 = fdiv float 1.000000e+00, %tmp457
+ %tmp459 = fmul float %tmp453, %tmp458
+ %tmp460 = fadd float %tmp459, 1.500000e+00
+ %tmp461 = fmul float %tmp454, %tmp458
+ %tmp462 = fadd float %tmp461, 1.500000e+00
+ %tmp463 = bitcast float %tmp462 to i32
+ %tmp464 = bitcast float %tmp460 to i32
+ %tmp465 = bitcast float %tmp456 to i32
+ %tmp466 = insertelement <4 x i32> undef, i32 %tmp463, i32 0
+ %tmp467 = insertelement <4 x i32> %tmp466, i32 %tmp464, i32 1
+ %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2
+ %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3
+ %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32>
+ %tmp470 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp469, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp471 = extractelement <4 x float> %tmp470, i32 0
+ %tmp472 = extractelement <4 x float> %tmp470, i32 1
+ %tmp473 = extractelement <4 x float> %tmp470, i32 2
+ %tmp474 = fmul float %tmp430, %tmp471
+ %tmp475 = fadd float %tmp474, %tmp328
+ %tmp476 = fmul float %tmp431, %tmp472
+ %tmp477 = fadd float %tmp476, %tmp329
+ %tmp478 = fmul float %tmp432, %tmp473
+ %tmp479 = fadd float %tmp478, %tmp330
+ %tmp480 = fmul float %tmp106, %tmp106
+ %tmp481 = fmul float %tmp107, %tmp107
+ %tmp482 = fadd float %tmp481, %tmp480
+ %tmp483 = fmul float %tmp108, %tmp108
+ %tmp484 = fadd float %tmp482, %tmp483
+ %tmp485 = call float @llvm.amdgcn.rsq.f32(float %tmp484)
+ %tmp486 = fmul float %tmp106, %tmp485
+ %tmp487 = fmul float %tmp107, %tmp485
+ %tmp488 = fmul float %tmp108, %tmp485
+ %tmp489 = fmul float %tmp376, %tmp39
+ %tmp490 = fmul float %tmp377, %tmp40
+ %tmp491 = fmul float %tmp378, %tmp41
+ %tmp492 = fmul float %tmp358, %tmp486
+ %tmp493 = fmul float %tmp359, %tmp487
+ %tmp494 = fadd float %tmp493, %tmp492
+ %tmp495 = fmul float %tmp360, %tmp488
+ %tmp496 = fadd float %tmp494, %tmp495
+ %tmp497 = fmul float %tmp496, %tmp358
+ %tmp498 = fmul float %tmp496, %tmp359
+ %tmp499 = fmul float %tmp496, %tmp360
+ %tmp500 = fmul float 2.000000e+00, %tmp497
+ %tmp501 = fmul float 2.000000e+00, %tmp498
+ %tmp502 = fmul float 2.000000e+00, %tmp499
+ %tmp503 = fsub float -0.000000e+00, %tmp500
+ %tmp504 = fadd float %tmp486, %tmp503
+ %tmp505 = fsub float -0.000000e+00, %tmp501
+ %tmp506 = fadd float %tmp487, %tmp505
+ %tmp507 = fsub float -0.000000e+00, %tmp502
+ %tmp508 = fadd float %tmp488, %tmp507
+ %tmp509 = fmul float %tmp94, %tmp94
+ %tmp510 = fmul float %tmp95, %tmp95
+ %tmp511 = fadd float %tmp510, %tmp509
+ %tmp512 = fmul float %tmp96, %tmp96
+ %tmp513 = fadd float %tmp511, %tmp512
+ %tmp514 = call float @llvm.amdgcn.rsq.f32(float %tmp513)
+ %tmp515 = fmul float %tmp94, %tmp514
+ %tmp516 = fmul float %tmp95, %tmp514
+ %tmp517 = fmul float %tmp96, %tmp514
+ %tmp518 = fmul float %tmp504, %tmp515
+ %tmp519 = fmul float %tmp506, %tmp516
+ %tmp520 = fadd float %tmp519, %tmp518
+ %tmp521 = fmul float %tmp508, %tmp517
+ %tmp522 = fadd float %tmp520, %tmp521
+ %tmp523 = fsub float -0.000000e+00, %tmp522
+ %tmp524 = fcmp uge float %tmp523, 0.000000e+00
+ %tmp525 = select i1 %tmp524, float %tmp523, float 0.000000e+00
+ %tmp526 = fmul float %tmp42, %tmp379
+ %tmp527 = fadd float %tmp526, 1.000000e+00
+ %tmp528 = call float @llvm.pow.f32(float %tmp525, float %tmp527)
+ %tmp529 = fmul float %tmp475, %tmp36
+ %tmp530 = fmul float %tmp477, %tmp37
+ %tmp531 = fmul float %tmp479, %tmp38
+ %tmp532 = fmul float %tmp358, %tmp486
+ %tmp533 = fmul float %tmp359, %tmp487
+ %tmp534 = fadd float %tmp533, %tmp532
+ %tmp535 = fmul float %tmp360, %tmp488
+ %tmp536 = fadd float %tmp534, %tmp535
+ %tmp537 = fcmp uge float %tmp536, 0.000000e+00
+ %tmp538 = select i1 %tmp537, float %tmp536, float 0.000000e+00
+ %tmp539 = fmul float %tmp529, %tmp538
+ %tmp540 = fmul float %tmp530, %tmp538
+ %tmp541 = fmul float %tmp531, %tmp538
+ %tmp542 = fmul float %tmp489, %tmp528
+ %tmp543 = fadd float %tmp542, %tmp539
+ %tmp544 = fmul float %tmp490, %tmp528
+ %tmp545 = fadd float %tmp544, %tmp540
+ %tmp546 = fmul float %tmp491, %tmp528
+ %tmp547 = fadd float %tmp546, %tmp541
+ %tmp548 = fmul float %tmp475, %tmp33
+ %tmp549 = fmul float %tmp477, %tmp34
+ %tmp550 = fmul float %tmp479, %tmp35
+ %tmp551 = fmul float %tmp543, %tmp56
+ %tmp552 = fadd float %tmp551, %tmp548
+ %tmp553 = fmul float %tmp545, %tmp57
+ %tmp554 = fadd float %tmp553, %tmp549
+ %tmp555 = fmul float %tmp547, %tmp58
+ %tmp556 = fadd float %tmp555, %tmp550
+ %tmp557 = bitcast float %tmp135 to i32
+ %tmp558 = bitcast float %tmp181 to i32
+ %tmp559 = bitcast float %tmp136 to i32
+ %tmp560 = bitcast float %tmp182 to i32
+ %tmp561 = bitcast float %temp28.1 to i32
+ %tmp562 = bitcast float %temp29.1 to i32
+ %tmp563 = insertelement <8 x i32> undef, i32 %tmp557, i32 0
+ %tmp564 = insertelement <8 x i32> %tmp563, i32 %tmp558, i32 1
+ %tmp565 = insertelement <8 x i32> %tmp564, i32 %tmp559, i32 2
+ %tmp566 = insertelement <8 x i32> %tmp565, i32 %tmp560, i32 3
+ %tmp567 = insertelement <8 x i32> %tmp566, i32 %tmp561, i32 4
+ %tmp568 = insertelement <8 x i32> %tmp567, i32 %tmp562, i32 5
+ %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6
+ %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7
+ %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32>
+ %tmp571 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp570, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp572 = extractelement <4 x float> %tmp571, i32 0
+ %tmp573 = extractelement <4 x float> %tmp571, i32 1
+ %tmp574 = extractelement <4 x float> %tmp571, i32 2
+ %tmp575 = fmul float %tmp572, %tmp43
+ %tmp576 = fadd float %tmp575, %tmp552
+ %tmp577 = fmul float %tmp573, %tmp44
+ %tmp578 = fadd float %tmp577, %tmp554
+ %tmp579 = fmul float %tmp574, %tmp45
+ %tmp580 = fadd float %tmp579, %tmp556
+ %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578)
+ %tmp582 = bitcast i32 %tmp581 to float
+ %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282)
+ %tmp584 = bitcast i32 %tmp583 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp582, float %tmp584, float %tmp582, float %tmp584)
ret void
ENDIF66: ; preds = %LOOP65
- %586 = bitcast float %temp28.1 to i32
- %587 = bitcast float %temp29.1 to i32
- %588 = insertelement <8 x i32> %237, i32 %586, i32 4
- %589 = insertelement <8 x i32> %588, i32 %587, i32 5
- %590 = insertelement <8 x i32> %589, i32 undef, i32 6
- %591 = insertelement <8 x i32> %590, i32 undef, i32 7
- %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2)
- %593 = extractelement <4 x float> %592, i32 3
- %594 = fcmp oge float %temp30.1, %593
- %595 = sext i1 %594 to i32
- %596 = bitcast i32 %595 to float
- %597 = bitcast float %596 to i32
- %598 = and i32 %597, 1065353216
- %599 = bitcast i32 %598 to float
- %600 = fmul float 5.000000e-01, %temp32.0
- %601 = fsub float -0.000000e+00, %600
- %602 = fmul float %599, %temp32.0
- %603 = fadd float %602, %601
- %604 = fmul float %214, %603
- %605 = fadd float %604, %temp28.1
- %606 = fmul float %215, %603
- %607 = fadd float %606, %temp29.1
- %608 = fmul float %216, %603
- %609 = fadd float %608, %temp30.1
- %610 = fadd float %temp24.1, 1.000000e+00
- %611 = fmul float %temp32.0, 5.000000e-01
+ %tmp585 = bitcast float %temp28.1 to i32
+ %tmp586 = bitcast float %temp29.1 to i32
+ %tmp587 = insertelement <8 x i32> %tmp236, i32 %tmp585, i32 4
+ %tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5
+ %tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6
+ %tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7
+ %tmp591 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp590, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp592 = extractelement <4 x float> %tmp591, i32 3
+ %tmp593 = fcmp oge float %temp30.1, %tmp592
+ %tmp594 = sext i1 %tmp593 to i32
+ %tmp595 = bitcast i32 %tmp594 to float
+ %tmp596 = bitcast float %tmp595 to i32
+ %tmp597 = and i32 %tmp596, 1065353216
+ %tmp598 = bitcast i32 %tmp597 to float
+ %tmp599 = fmul float 5.000000e-01, %temp32.0
+ %tmp600 = fsub float -0.000000e+00, %tmp599
+ %tmp601 = fmul float %tmp598, %temp32.0
+ %tmp602 = fadd float %tmp601, %tmp600
+ %tmp603 = fmul float %tmp213, %tmp602
+ %tmp604 = fadd float %tmp603, %temp28.1
+ %tmp605 = fmul float %tmp214, %tmp602
+ %tmp606 = fadd float %tmp605, %temp29.1
+ %tmp607 = fmul float %tmp215, %tmp602
+ %tmp608 = fadd float %tmp607, %temp30.1
+ %tmp609 = fadd float %temp24.1, 1.000000e+00
+ %tmp610 = fmul float %temp32.0, 5.000000e-01
br label %LOOP65
}
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-; Function Attrs: readnone
-declare i32 @llvm.SI.tid() #2
-
-; Function Attrs: readonly
-declare float @ceil(float) #3
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq.f32(float) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1
-
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2
-
-; Function Attrs: readnone
-declare float @fabs(float) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
-
-; Function Attrs: nounwind readonly
-declare float @llvm.pow.f32(float, float) #4
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
-attributes #3 = { readonly }
-attributes #4 = { nounwind readonly }
-
-!0 = !{!"const", null, i32 1}
-
; CHECK-LABEL: {{^}}main1:
; CHECK: s_endpgm
-define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:
- %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
- %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0
- %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0)
- %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4)
- %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8)
- %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12)
- %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28)
- %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48)
- %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52)
- %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56)
- %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64)
- %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68)
- %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72)
- %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76)
- %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
- %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
- %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
- %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148)
- %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152)
- %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
- %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 164)
- %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168)
- %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172)
- %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
- %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
- %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
- %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
- %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
- %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
- %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
- %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
- %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
- %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220)
- %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236)
- %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
- %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
- %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
- %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252)
- %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
- %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260)
- %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264)
- %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268)
- %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
- %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
- %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
- %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284)
- %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
- %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
- %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464)
- %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468)
- %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472)
- %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496)
- %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500)
- %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504)
- %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512)
- %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516)
- %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524)
- %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532)
- %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536)
- %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540)
- %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544)
- %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548)
- %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552)
- %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556)
- %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560)
- %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564)
- %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568)
- %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572)
- %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576)
- %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580)
- %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584)
- %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588)
- %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592)
- %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596)
- %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600)
- %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604)
- %97 = call float @llvm.SI.load.const(<16 x i8> %22, i32 608)
- %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612)
- %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616)
- %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624)
- %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628)
- %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632)
- %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636)
- %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640)
- %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644)
- %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648)
- %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652)
- %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656)
- %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660)
- %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664)
- %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668)
- %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672)
- %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676)
- %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680)
- %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684)
- %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688)
- %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692)
- %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696)
- %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700)
- %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704)
- %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708)
- %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712)
- %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716)
- %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864)
- %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868)
- %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
- %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0
- %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
- %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0
- %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
- %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0
- %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
- %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0
- %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
- %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0
- %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
- %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0
- %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
- %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0
- %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
- %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0
- %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
- %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0
- %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
- %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0
- %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
- %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0
- %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
- %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0
- %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
- %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0
- %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
- %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0
- %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
- %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0
- %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
- %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0
- %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8
- %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0
- %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8
- %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0
- %162 = fcmp ugt float %17, 0.000000e+00
- %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00
- %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
- %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
- %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6)
- %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6)
- %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
- %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
- %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
- %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6)
- %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
- %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
- %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
- %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6)
- %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
- %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
- %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
- %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6)
- %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
- %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
- %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
- %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6)
- %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
- %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
- %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
- %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6)
- %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6)
- %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6)
- %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6)
- %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6)
- %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6)
- %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6)
- %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6)
- %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6)
- %196 = fmul float %14, %124
- %197 = fadd float %196, %125
- %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00)
- %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
- %202 = bitcast float %198 to i32
- %203 = icmp ne i32 %202, 0
- %. = select i1 %203, float -1.000000e+00, float 1.000000e+00
- %204 = fsub float -0.000000e+00, %164
- %205 = fadd float %44, %204
- %206 = fsub float -0.000000e+00, %165
- %207 = fadd float %45, %206
- %208 = fsub float -0.000000e+00, %166
- %209 = fadd float %46, %208
- %210 = fmul float %205, %205
- %211 = fmul float %207, %207
- %212 = fadd float %211, %210
- %213 = fmul float %209, %209
- %214 = fadd float %212, %213
- %215 = call float @llvm.AMDGPU.rsq.f32(float %214)
- %216 = fmul float %205, %215
- %217 = fmul float %207, %215
- %218 = fmul float %209, %215
- %219 = fmul float %., %54
- %220 = fmul float %13, %47
- %221 = fmul float %197, %48
- %222 = bitcast float %174 to i32
- %223 = bitcast float %175 to i32
- %224 = insertelement <2 x i32> undef, i32 %222, i32 0
- %225 = insertelement <2 x i32> %224, i32 %223, i32 1
- %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2)
- %227 = extractelement <4 x float> %226, i32 0
- %228 = extractelement <4 x float> %226, i32 1
- %229 = extractelement <4 x float> %226, i32 2
- %230 = extractelement <4 x float> %226, i32 3
- %231 = fmul float %227, 0x4012611180000000
- %232 = fmul float %228, 0x4012611180000000
- %233 = fmul float %229, 0x4012611180000000
- %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00)
- %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00)
- %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00)
- %237 = fmul float %216, %184
- %238 = fmul float %217, %185
- %239 = fadd float %238, %237
- %240 = fmul float %218, %186
- %241 = fadd float %239, %240
- %242 = fmul float %216, %187
- %243 = fmul float %217, %188
- %244 = fadd float %243, %242
- %245 = fmul float %218, %189
- %246 = fadd float %244, %245
- %247 = fmul float %216, %190
- %248 = fmul float %217, %191
- %249 = fadd float %248, %247
- %250 = fmul float %218, %192
- %251 = fadd float %249, %250
- %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00)
- %253 = fmul float %214, 0x3F5A36E2E0000000
- %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00)
- %255 = fsub float -0.000000e+00, %254
- %256 = fadd float 1.000000e+00, %255
- %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01)
- %258 = fmul float %39, %257
- %259 = fmul float %241, %258
- %260 = fmul float %246, %258
- %261 = fmul float %259, %230
- %262 = fmul float %260, %230
- %263 = fadd float %252, 0x3EE4F8B580000000
- %264 = fsub float -0.000000e+00, %252
- %265 = fadd float 1.000000e+00, %264
- %266 = fmul float 1.200000e+01, %265
- %267 = fadd float %266, 4.000000e+00
- %268 = fsub float -0.000000e+00, %267
- %269 = fmul float %268, %263
- %270 = fsub float -0.000000e+00, %267
- %271 = fmul float %270, %263
- %272 = fsub float -0.000000e+00, %267
- %273 = fmul float %272, %263
- %274 = fdiv float 1.000000e+00, %269
- %275 = fdiv float 1.000000e+00, %271
- %276 = fdiv float 1.000000e+00, %273
- %277 = fmul float %261, %274
- %278 = fmul float %262, %275
- %279 = fmul float %263, %276
+ %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
+ %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 0)
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 4)
+ %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 8)
+ %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 12)
+ %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 28)
+ %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 48)
+ %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 52)
+ %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 56)
+ %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 64)
+ %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 68)
+ %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 72)
+ %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 76)
+ %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 128)
+ %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 132)
+ %tmp36 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 144)
+ %tmp37 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 148)
+ %tmp38 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 152)
+ %tmp39 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 160)
+ %tmp40 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 164)
+ %tmp41 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 168)
+ %tmp42 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 172)
+ %tmp43 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 176)
+ %tmp44 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 180)
+ %tmp45 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 184)
+ %tmp46 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 192)
+ %tmp47 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 196)
+ %tmp48 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 200)
+ %tmp49 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 208)
+ %tmp50 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 212)
+ %tmp51 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 216)
+ %tmp52 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 220)
+ %tmp53 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 236)
+ %tmp54 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 240)
+ %tmp55 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 244)
+ %tmp56 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 248)
+ %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 252)
+ %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 256)
+ %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 260)
+ %tmp60 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 264)
+ %tmp61 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 268)
+ %tmp62 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 272)
+ %tmp63 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 276)
+ %tmp64 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 280)
+ %tmp65 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 284)
+ %tmp66 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 288)
+ %tmp67 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 292)
+ %tmp68 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 464)
+ %tmp69 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 468)
+ %tmp70 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 472)
+ %tmp71 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 496)
+ %tmp72 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 500)
+ %tmp73 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 504)
+ %tmp74 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 512)
+ %tmp75 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 516)
+ %tmp76 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 524)
+ %tmp77 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 532)
+ %tmp78 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 536)
+ %tmp79 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 540)
+ %tmp80 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 544)
+ %tmp81 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 548)
+ %tmp82 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 552)
+ %tmp83 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 556)
+ %tmp84 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 560)
+ %tmp85 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 564)
+ %tmp86 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 568)
+ %tmp87 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 572)
+ %tmp88 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 576)
+ %tmp89 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 580)
+ %tmp90 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 584)
+ %tmp91 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 588)
+ %tmp92 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 592)
+ %tmp93 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 596)
+ %tmp94 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 600)
+ %tmp95 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 604)
+ %tmp96 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 608)
+ %tmp97 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 612)
+ %tmp98 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 616)
+ %tmp99 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 624)
+ %tmp100 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 628)
+ %tmp101 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 632)
+ %tmp102 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 636)
+ %tmp103 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 640)
+ %tmp104 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 644)
+ %tmp105 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 648)
+ %tmp106 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 652)
+ %tmp107 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 656)
+ %tmp108 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 660)
+ %tmp109 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 664)
+ %tmp110 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 668)
+ %tmp111 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 672)
+ %tmp112 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 676)
+ %tmp113 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 680)
+ %tmp114 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 684)
+ %tmp115 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 688)
+ %tmp116 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 692)
+ %tmp117 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 696)
+ %tmp118 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 700)
+ %tmp119 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 704)
+ %tmp120 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 708)
+ %tmp121 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 712)
+ %tmp122 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 716)
+ %tmp123 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 864)
+ %tmp124 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 868)
+ %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0
+ %tmp126 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp125, !tbaa !0
+ %tmp127 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0
+ %tmp128 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp127, !tbaa !0
+ %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1
+ %tmp130 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp129, !tbaa !0
+ %tmp131 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1
+ %tmp132 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp131, !tbaa !0
+ %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2
+ %tmp134 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp133, !tbaa !0
+ %tmp135 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2
+ %tmp136 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp135, !tbaa !0
+ %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3
+ %tmp138 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp137, !tbaa !0
+ %tmp139 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3
+ %tmp140 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp139, !tbaa !0
+ %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4
+ %tmp142 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp141, !tbaa !0
+ %tmp143 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4
+ %tmp144 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp143, !tbaa !0
+ %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5
+ %tmp146 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp145, !tbaa !0
+ %tmp147 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5
+ %tmp148 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp147, !tbaa !0
+ %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6
+ %tmp150 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp149, !tbaa !0
+ %tmp151 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6
+ %tmp152 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp151, !tbaa !0
+ %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7
+ %tmp154 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp153, !tbaa !0
+ %tmp155 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7
+ %tmp156 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp155, !tbaa !0
+ %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 8
+ %tmp158 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp157, !tbaa !0
+ %tmp159 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 8
+ %tmp160 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp159, !tbaa !0
+ %tmp161 = fcmp ugt float %arg17, 0.000000e+00
+ %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00
+ %tmp163 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp164 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp165 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp166 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg4, <2 x i32> %arg6)
+ %tmp167 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp168 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp169 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp170 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %arg4, <2 x i32> %arg6)
+ %tmp171 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp172 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp173 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp174 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %arg4, <2 x i32> %arg6)
+ %tmp175 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp176 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp177 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp178 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %arg4, <2 x i32> %arg6)
+ %tmp179 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp180 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp181 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp182 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg4, <2 x i32> %arg6)
+ %tmp183 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp184 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp185 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp186 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %arg4, <2 x i32> %arg6)
+ %tmp187 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %arg4, <2 x i32> %arg6)
+ %tmp188 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %arg4, <2 x i32> %arg6)
+ %tmp189 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %arg4, <2 x i32> %arg6)
+ %tmp190 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %arg4, <2 x i32> %arg6)
+ %tmp191 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %tmp192 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %tmp193 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %tmp194 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %arg4, <2 x i32> %arg6)
+ %tmp195 = fmul float %arg14, %tmp123
+ %tmp196 = fadd float %tmp195, %tmp124
+ %tmp197 = call float @llvm.AMDGPU.clamp.f32(float %tmp162, float 0.000000e+00, float 1.000000e+00)
+ %tmp198 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %tmp199 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %tmp200 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %tmp201 = bitcast float %tmp197 to i32
+ %tmp202 = icmp ne i32 %tmp201, 0
+ %. = select i1 %tmp202, float -1.000000e+00, float 1.000000e+00
+ %tmp203 = fsub float -0.000000e+00, %tmp163
+ %tmp204 = fadd float %tmp43, %tmp203
+ %tmp205 = fsub float -0.000000e+00, %tmp164
+ %tmp206 = fadd float %tmp44, %tmp205
+ %tmp207 = fsub float -0.000000e+00, %tmp165
+ %tmp208 = fadd float %tmp45, %tmp207
+ %tmp209 = fmul float %tmp204, %tmp204
+ %tmp210 = fmul float %tmp206, %tmp206
+ %tmp211 = fadd float %tmp210, %tmp209
+ %tmp212 = fmul float %tmp208, %tmp208
+ %tmp213 = fadd float %tmp211, %tmp212
+ %tmp214 = call float @llvm.amdgcn.rsq.f32(float %tmp213)
+ %tmp215 = fmul float %tmp204, %tmp214
+ %tmp216 = fmul float %tmp206, %tmp214
+ %tmp217 = fmul float %tmp208, %tmp214
+ %tmp218 = fmul float %., %tmp53
+ %tmp219 = fmul float %arg13, %tmp46
+ %tmp220 = fmul float %tmp196, %tmp47
+ %tmp221 = bitcast float %tmp173 to i32
+ %tmp222 = bitcast float %tmp174 to i32
+ %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0
+ %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1
+ %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32>
+ %tmp225 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp224, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp226 = extractelement <4 x float> %tmp225, i32 0
+ %tmp227 = extractelement <4 x float> %tmp225, i32 1
+ %tmp228 = extractelement <4 x float> %tmp225, i32 2
+ %tmp229 = extractelement <4 x float> %tmp225, i32 3
+ %tmp230 = fmul float %tmp226, 0x4012611180000000
+ %tmp231 = fmul float %tmp227, 0x4012611180000000
+ %tmp232 = fmul float %tmp228, 0x4012611180000000
+ %one.sub.a.i = fsub float 1.000000e+00, %tmp26
+ %result.i = fadd float %tmp230, %one.sub.a.i
+ %one.sub.a.i43 = fsub float 1.000000e+00, %tmp26
+ %result.i44 = fadd float %tmp231, %one.sub.a.i43
+ %one.sub.a.i41 = fsub float 1.000000e+00, %tmp26
+ %result.i42 = fadd float %tmp232, %one.sub.a.i41
+ %tmp233 = fmul float %tmp215, %tmp183
+ %tmp234 = fmul float %tmp216, %tmp184
+ %tmp235 = fadd float %tmp234, %tmp233
+ %tmp236 = fmul float %tmp217, %tmp185
+ %tmp237 = fadd float %tmp235, %tmp236
+ %tmp238 = fmul float %tmp215, %tmp186
+ %tmp239 = fmul float %tmp216, %tmp187
+ %tmp240 = fadd float %tmp239, %tmp238
+ %tmp241 = fmul float %tmp217, %tmp188
+ %tmp242 = fadd float %tmp240, %tmp241
+ %tmp243 = fmul float %tmp215, %tmp189
+ %tmp244 = fmul float %tmp216, %tmp190
+ %tmp245 = fadd float %tmp244, %tmp243
+ %tmp246 = fmul float %tmp217, %tmp191
+ %tmp247 = fadd float %tmp245, %tmp246
+ %tmp248 = call float @llvm.AMDGPU.clamp.f32(float %tmp247, float 0.000000e+00, float 1.000000e+00)
+ %tmp249 = fmul float %tmp213, 0x3F5A36E2E0000000
+ %tmp250 = call float @llvm.AMDGPU.clamp.f32(float %tmp249, float 0.000000e+00, float 1.000000e+00)
+ %tmp251 = fsub float -0.000000e+00, %tmp250
+ %tmp252 = fadd float 1.000000e+00, %tmp251
+ %tmp253 = call float @llvm.pow.f32(float %tmp248, float 2.500000e-01)
+ %tmp254 = fmul float %tmp38, %tmp253
+ %tmp255 = fmul float %tmp237, %tmp254
+ %tmp256 = fmul float %tmp242, %tmp254
+ %tmp257 = fmul float %tmp255, %tmp229
+ %tmp258 = fmul float %tmp256, %tmp229
+ %tmp259 = fadd float %tmp248, 0x3EE4F8B580000000
+ %tmp260 = fsub float -0.000000e+00, %tmp248
+ %tmp261 = fadd float 1.000000e+00, %tmp260
+ %tmp262 = fmul float 1.200000e+01, %tmp261
+ %tmp263 = fadd float %tmp262, 4.000000e+00
+ %tmp264 = fsub float -0.000000e+00, %tmp263
+ %tmp265 = fmul float %tmp264, %tmp259
+ %tmp266 = fsub float -0.000000e+00, %tmp263
+ %tmp267 = fmul float %tmp266, %tmp259
+ %tmp268 = fsub float -0.000000e+00, %tmp263
+ %tmp269 = fmul float %tmp268, %tmp259
+ %tmp270 = fdiv float 1.000000e+00, %tmp265
+ %tmp271 = fdiv float 1.000000e+00, %tmp267
+ %tmp272 = fdiv float 1.000000e+00, %tmp269
+ %tmp273 = fmul float %tmp257, %tmp270
+ %tmp274 = fmul float %tmp258, %tmp271
+ %tmp275 = fmul float %tmp259, %tmp272
br label %LOOP
LOOP: ; preds = %LOOP, %main_body
- %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ]
- %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ]
- %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ]
- %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ]
- %280 = bitcast float %temp168.0 to i32
- %281 = bitcast float %temp169.0 to i32
- %282 = insertelement <4 x i32> undef, i32 %280, i32 0
- %283 = insertelement <4 x i32> %282, i32 %281, i32 1
- %284 = insertelement <4 x i32> %283, i32 0, i32 2
- %285 = insertelement <4 x i32> %284, i32 undef, i32 3
- %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2)
- %287 = extractelement <4 x float> %286, i32 3
- %288 = fadd float %temp168.0, %277
- %289 = fadd float %temp169.0, %278
- %290 = fadd float %temp170.0, %279
- %291 = fsub float -0.000000e+00, %287
- %292 = fadd float %290, %291
- %293 = fcmp oge float 0.000000e+00, %292
- %294 = sext i1 %293 to i32
- %295 = bitcast i32 %294 to float
- %296 = bitcast float %295 to i32
- %297 = icmp ne i32 %296, 0
- br i1 %297, label %IF189, label %LOOP
+ %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp288, %LOOP ]
+ %temp168.0 = phi float [ %tmp175, %main_body ], [ %tmp284, %LOOP ]
+ %temp169.0 = phi float [ %tmp176, %main_body ], [ %tmp285, %LOOP ]
+ %temp170.0 = phi float [ %tmp252, %main_body ], [ %tmp286, %LOOP ]
+ %tmp276 = bitcast float %temp168.0 to i32
+ %tmp277 = bitcast float %temp169.0 to i32
+ %tmp278 = insertelement <4 x i32> undef, i32 %tmp276, i32 0
+ %tmp279 = insertelement <4 x i32> %tmp278, i32 %tmp277, i32 1
+ %tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2
+ %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3
+ %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32>
+ %tmp282 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp281, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp283 = extractelement <4 x float> %tmp282, i32 3
+ %tmp284 = fadd float %temp168.0, %tmp273
+ %tmp285 = fadd float %temp169.0, %tmp274
+ %tmp286 = fadd float %temp170.0, %tmp275
+ %tmp287 = fsub float -0.000000e+00, %tmp283
+ %tmp288 = fadd float %tmp286, %tmp287
+ %tmp289 = fcmp oge float 0.000000e+00, %tmp288
+ %tmp290 = sext i1 %tmp289 to i32
+ %tmp291 = bitcast i32 %tmp290 to float
+ %tmp292 = bitcast float %tmp291 to i32
+ %tmp293 = icmp ne i32 %tmp292, 0
+ br i1 %tmp293, label %IF189, label %LOOP
IF189: ; preds = %LOOP
- %298 = extractelement <4 x float> %286, i32 0
- %299 = extractelement <4 x float> %286, i32 1
- %300 = extractelement <4 x float> %286, i32 2
- %301 = fsub float -0.000000e+00, %292
- %302 = fadd float %temp144.0, %301
- %303 = fdiv float 1.000000e+00, %302
- %304 = fmul float %292, %303
- %305 = fadd float %304, -1.000000e+00
- %306 = fmul float %305, %277
- %307 = fadd float %306, %288
- %308 = fmul float %305, %278
- %309 = fadd float %308, %289
- %310 = fsub float -0.000000e+00, %176
- %311 = fadd float %307, %310
- %312 = fsub float -0.000000e+00, %177
- %313 = fadd float %309, %312
- %314 = fadd float %176, %311
- %315 = fadd float %177, %313
- %316 = fmul float %311, %67
- %317 = fmul float %313, %68
- %318 = fmul float %316, %55
- %319 = fmul float %316, %56
- %320 = fmul float %317, %57
- %321 = fadd float %320, %318
- %322 = fmul float %317, %58
- %323 = fadd float %322, %319
- %324 = fadd float %178, %321
- %325 = fadd float %179, %323
- %326 = fmul float %316, %59
- %327 = fmul float %316, %60
- %328 = fmul float %316, %61
- %329 = fmul float %316, %62
- %330 = fmul float %317, %63
- %331 = fadd float %330, %326
- %332 = fmul float %317, %64
- %333 = fadd float %332, %327
- %334 = fmul float %317, %65
- %335 = fadd float %334, %328
- %336 = fmul float %317, %66
- %337 = fadd float %336, %329
- %338 = fadd float %168, %331
- %339 = fadd float %169, %333
- %340 = fadd float %170, %335
- %341 = fadd float %171, %337
- %342 = bitcast float %338 to i32
- %343 = bitcast float %339 to i32
- %344 = insertelement <2 x i32> undef, i32 %342, i32 0
- %345 = insertelement <2 x i32> %344, i32 %343, i32 1
- %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2)
- %347 = extractelement <4 x float> %346, i32 0
- %348 = extractelement <4 x float> %346, i32 1
- %349 = extractelement <4 x float> %346, i32 2
- %350 = extractelement <4 x float> %346, i32 3
- %351 = fmul float %347, %23
- %352 = fmul float %348, %24
- %353 = fmul float %349, %25
- %354 = fmul float %350, %26
- %355 = fmul float %351, %180
- %356 = fmul float %352, %181
- %357 = fmul float %353, %182
- %358 = fmul float %354, %183
- %359 = fsub float -0.000000e+00, %350
- %360 = fadd float 1.000000e+00, %359
- %361 = fmul float %360, %49
- %362 = call float @llvm.AMDGPU.lrp(float %361, float %347, float %355)
- %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356)
- %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357)
- %365 = bitcast float %340 to i32
- %366 = bitcast float %341 to i32
- %367 = insertelement <2 x i32> undef, i32 %365, i32 0
- %368 = insertelement <2 x i32> %367, i32 %366, i32 1
- %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2)
- %370 = extractelement <4 x float> %369, i32 2
- %371 = fmul float %362, %234
- %372 = fmul float %363, %235
- %373 = fmul float %364, %236
- %374 = fmul float %358, %230
- %375 = bitcast float %314 to i32
- %376 = bitcast float %315 to i32
- %377 = insertelement <2 x i32> undef, i32 %375, i32 0
- %378 = insertelement <2 x i32> %377, i32 %376, i32 1
- %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2)
- %380 = extractelement <4 x float> %379, i32 0
- %381 = extractelement <4 x float> %379, i32 1
- %382 = extractelement <4 x float> %379, i32 2
- %383 = extractelement <4 x float> %379, i32 3
- %384 = fcmp olt float 0.000000e+00, %382
- %385 = sext i1 %384 to i32
- %386 = bitcast i32 %385 to float
- %387 = bitcast float %386 to i32
- %388 = icmp ne i32 %387, 0
- %.224 = select i1 %388, float %381, float %380
- %.225 = select i1 %388, float %383, float %381
- %389 = bitcast float %324 to i32
- %390 = bitcast float %325 to i32
- %391 = insertelement <2 x i32> undef, i32 %389, i32 0
- %392 = insertelement <2 x i32> %391, i32 %390, i32 1
- %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2)
- %394 = extractelement <4 x float> %393, i32 0
- %395 = extractelement <4 x float> %393, i32 1
- %396 = extractelement <4 x float> %393, i32 2
- %397 = extractelement <4 x float> %393, i32 3
- %398 = fcmp olt float 0.000000e+00, %396
- %399 = sext i1 %398 to i32
- %400 = bitcast i32 %399 to float
- %401 = bitcast float %400 to i32
- %402 = icmp ne i32 %401, 0
- %temp112.1 = select i1 %402, float %395, float %394
- %temp113.1 = select i1 %402, float %397, float %395
- %403 = fmul float %.224, 2.000000e+00
- %404 = fadd float %403, -1.000000e+00
- %405 = fmul float %.225, 2.000000e+00
- %406 = fadd float %405, -1.000000e+00
- %407 = fmul float %temp112.1, 2.000000e+00
- %408 = fadd float %407, -1.000000e+00
- %409 = fmul float %temp113.1, 2.000000e+00
- %410 = fadd float %409, -1.000000e+00
- %411 = fsub float -0.000000e+00, %404
- %412 = fmul float %411, %35
- %413 = fsub float -0.000000e+00, %406
- %414 = fmul float %413, %35
- %415 = fsub float -0.000000e+00, %408
- %416 = fmul float %415, %36
- %417 = fsub float -0.000000e+00, %410
- %418 = fmul float %417, %36
- %419 = fmul float %416, %370
- %420 = fmul float %418, %370
- %421 = call float @fabs(float %412)
- %422 = call float @fabs(float %414)
- %423 = fsub float -0.000000e+00, %421
- %424 = fadd float 1.000000e+00, %423
- %425 = fsub float -0.000000e+00, %422
- %426 = fadd float 1.000000e+00, %425
- %427 = fmul float %424, %419
- %428 = fadd float %427, %412
- %429 = fmul float %426, %420
- %430 = fadd float %429, %414
- %431 = fmul float %428, %428
- %432 = fmul float %430, %430
- %433 = fadd float %431, %432
- %434 = fsub float -0.000000e+00, %433
- %435 = fadd float 0x3FF00068E0000000, %434
- %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00)
- %437 = call float @llvm.AMDGPU.rsq.f32(float %436)
- %438 = fmul float %437, %436
- %439 = fsub float -0.000000e+00, %436
- %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00)
- %441 = fmul float %184, %428
- %442 = fmul float %185, %428
- %443 = fmul float %186, %428
- %444 = fmul float %187, %430
- %445 = fadd float %444, %441
- %446 = fmul float %188, %430
- %447 = fadd float %446, %442
- %448 = fmul float %189, %430
- %449 = fadd float %448, %443
- %450 = fmul float %190, %440
- %451 = fadd float %450, %445
- %452 = fmul float %191, %440
- %453 = fadd float %452, %447
- %454 = fmul float %192, %440
- %455 = fadd float %454, %449
- %456 = fmul float %451, %451
- %457 = fmul float %453, %453
- %458 = fadd float %457, %456
- %459 = fmul float %455, %455
- %460 = fadd float %458, %459
- %461 = call float @llvm.AMDGPU.rsq.f32(float %460)
- %462 = fmul float %451, %461
- %463 = fmul float %453, %461
- %464 = fmul float %455, %461
- %465 = fcmp olt float 0.000000e+00, %219
- %466 = sext i1 %465 to i32
- %467 = bitcast i32 %466 to float
- %468 = bitcast float %467 to i32
- %469 = icmp ne i32 %468, 0
- br i1 %469, label %IF198, label %ENDIF197
+ %tmp294 = extractelement <4 x float> %tmp282, i32 0
+ %tmp295 = extractelement <4 x float> %tmp282, i32 1
+ %tmp296 = extractelement <4 x float> %tmp282, i32 2
+ %tmp297 = fsub float -0.000000e+00, %tmp288
+ %tmp298 = fadd float %temp144.0, %tmp297
+ %tmp299 = fdiv float 1.000000e+00, %tmp298
+ %tmp300 = fmul float %tmp288, %tmp299
+ %tmp301 = fadd float %tmp300, -1.000000e+00
+ %tmp302 = fmul float %tmp301, %tmp273
+ %tmp303 = fadd float %tmp302, %tmp284
+ %tmp304 = fmul float %tmp301, %tmp274
+ %tmp305 = fadd float %tmp304, %tmp285
+ %tmp306 = fsub float -0.000000e+00, %tmp175
+ %tmp307 = fadd float %tmp303, %tmp306
+ %tmp308 = fsub float -0.000000e+00, %tmp176
+ %tmp309 = fadd float %tmp305, %tmp308
+ %tmp310 = fadd float %tmp175, %tmp307
+ %tmp311 = fadd float %tmp176, %tmp309
+ %tmp312 = fmul float %tmp307, %tmp66
+ %tmp313 = fmul float %tmp309, %tmp67
+ %tmp314 = fmul float %tmp312, %tmp54
+ %tmp315 = fmul float %tmp312, %tmp55
+ %tmp316 = fmul float %tmp313, %tmp56
+ %tmp317 = fadd float %tmp316, %tmp314
+ %tmp318 = fmul float %tmp313, %tmp57
+ %tmp319 = fadd float %tmp318, %tmp315
+ %tmp320 = fadd float %tmp177, %tmp317
+ %tmp321 = fadd float %tmp178, %tmp319
+ %tmp322 = fmul float %tmp312, %tmp58
+ %tmp323 = fmul float %tmp312, %tmp59
+ %tmp324 = fmul float %tmp312, %tmp60
+ %tmp325 = fmul float %tmp312, %tmp61
+ %tmp326 = fmul float %tmp313, %tmp62
+ %tmp327 = fadd float %tmp326, %tmp322
+ %tmp328 = fmul float %tmp313, %tmp63
+ %tmp329 = fadd float %tmp328, %tmp323
+ %tmp330 = fmul float %tmp313, %tmp64
+ %tmp331 = fadd float %tmp330, %tmp324
+ %tmp332 = fmul float %tmp313, %tmp65
+ %tmp333 = fadd float %tmp332, %tmp325
+ %tmp334 = fadd float %tmp167, %tmp327
+ %tmp335 = fadd float %tmp168, %tmp329
+ %tmp336 = fadd float %tmp169, %tmp331
+ %tmp337 = fadd float %tmp170, %tmp333
+ %tmp338 = bitcast float %tmp334 to i32
+ %tmp339 = bitcast float %tmp335 to i32
+ %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0
+ %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1
+ %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32>
+ %tmp342 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp341, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp343 = extractelement <4 x float> %tmp342, i32 0
+ %tmp344 = extractelement <4 x float> %tmp342, i32 1
+ %tmp345 = extractelement <4 x float> %tmp342, i32 2
+ %tmp346 = extractelement <4 x float> %tmp342, i32 3
+ %tmp347 = fmul float %tmp343, %tmp22
+ %tmp348 = fmul float %tmp344, %tmp23
+ %tmp349 = fmul float %tmp345, %tmp24
+ %tmp350 = fmul float %tmp346, %tmp25
+ %tmp351 = fmul float %tmp347, %tmp179
+ %tmp352 = fmul float %tmp348, %tmp180
+ %tmp353 = fmul float %tmp349, %tmp181
+ %tmp354 = fmul float %tmp350, %tmp182
+ %tmp355 = fsub float -0.000000e+00, %tmp346
+ %tmp356 = fadd float 1.000000e+00, %tmp355
+ %tmp357 = fmul float %tmp356, %tmp48
+ %one.sub.a.i37 = fsub float 1.000000e+00, %tmp357
+ %one.sub.ac.i38 = fmul float %one.sub.a.i37, %tmp351
+ %mul.i39 = fmul float %tmp343, %tmp351
+ %result.i40 = fadd float %mul.i39, %one.sub.ac.i38
+ %one.sub.a.i33 = fsub float 1.000000e+00, %tmp357
+ %one.sub.ac.i34 = fmul float %one.sub.a.i33, %tmp352
+ %mul.i35 = fmul float %tmp344, %tmp352
+ %result.i36 = fadd float %mul.i35, %one.sub.ac.i34
+ %one.sub.a.i29 = fsub float 1.000000e+00, %tmp357
+ %one.sub.ac.i30 = fmul float %one.sub.a.i29, %tmp353
+ %mul.i31 = fmul float %tmp345, %tmp353
+ %result.i32 = fadd float %mul.i31, %one.sub.ac.i30
+ %tmp358 = bitcast float %tmp336 to i32
+ %tmp359 = bitcast float %tmp337 to i32
+ %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0
+ %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1
+ %tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32>
+ %tmp362 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp361, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp363 = extractelement <4 x float> %tmp362, i32 2
+ %tmp364 = fmul float %result.i40, %result.i
+ %tmp365 = fmul float %result.i36, %result.i44
+ %tmp366 = fmul float %result.i32, %result.i42
+ %tmp367 = fmul float %tmp354, %tmp229
+ %tmp368 = bitcast float %tmp310 to i32
+ %tmp369 = bitcast float %tmp311 to i32
+ %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0
+ %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1
+ %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32>
+ %tmp372 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp371, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp373 = extractelement <4 x float> %tmp372, i32 0
+ %tmp374 = extractelement <4 x float> %tmp372, i32 1
+ %tmp375 = extractelement <4 x float> %tmp372, i32 2
+ %tmp376 = extractelement <4 x float> %tmp372, i32 3
+ %tmp377 = fcmp olt float 0.000000e+00, %tmp375
+ %tmp378 = sext i1 %tmp377 to i32
+ %tmp379 = bitcast i32 %tmp378 to float
+ %tmp380 = bitcast float %tmp379 to i32
+ %tmp381 = icmp ne i32 %tmp380, 0
+ %.224 = select i1 %tmp381, float %tmp374, float %tmp373
+ %.225 = select i1 %tmp381, float %tmp376, float %tmp374
+ %tmp382 = bitcast float %tmp320 to i32
+ %tmp383 = bitcast float %tmp321 to i32
+ %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0
+ %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1
+ %tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32>
+ %tmp386 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp385, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp387 = extractelement <4 x float> %tmp386, i32 0
+ %tmp388 = extractelement <4 x float> %tmp386, i32 1
+ %tmp389 = extractelement <4 x float> %tmp386, i32 2
+ %tmp390 = extractelement <4 x float> %tmp386, i32 3
+ %tmp391 = fcmp olt float 0.000000e+00, %tmp389
+ %tmp392 = sext i1 %tmp391 to i32
+ %tmp393 = bitcast i32 %tmp392 to float
+ %tmp394 = bitcast float %tmp393 to i32
+ %tmp395 = icmp ne i32 %tmp394, 0
+ %temp112.1 = select i1 %tmp395, float %tmp388, float %tmp387
+ %temp113.1 = select i1 %tmp395, float %tmp390, float %tmp388
+ %tmp396 = fmul float %.224, 2.000000e+00
+ %tmp397 = fadd float %tmp396, -1.000000e+00
+ %tmp398 = fmul float %.225, 2.000000e+00
+ %tmp399 = fadd float %tmp398, -1.000000e+00
+ %tmp400 = fmul float %temp112.1, 2.000000e+00
+ %tmp401 = fadd float %tmp400, -1.000000e+00
+ %tmp402 = fmul float %temp113.1, 2.000000e+00
+ %tmp403 = fadd float %tmp402, -1.000000e+00
+ %tmp404 = fsub float -0.000000e+00, %tmp397
+ %tmp405 = fmul float %tmp404, %tmp34
+ %tmp406 = fsub float -0.000000e+00, %tmp399
+ %tmp407 = fmul float %tmp406, %tmp34
+ %tmp408 = fsub float -0.000000e+00, %tmp401
+ %tmp409 = fmul float %tmp408, %tmp35
+ %tmp410 = fsub float -0.000000e+00, %tmp403
+ %tmp411 = fmul float %tmp410, %tmp35
+ %tmp412 = fmul float %tmp409, %tmp363
+ %tmp413 = fmul float %tmp411, %tmp363
+ %tmp414 = call float @fabs(float %tmp405)
+ %tmp415 = call float @fabs(float %tmp407)
+ %tmp416 = fsub float -0.000000e+00, %tmp414
+ %tmp417 = fadd float 1.000000e+00, %tmp416
+ %tmp418 = fsub float -0.000000e+00, %tmp415
+ %tmp419 = fadd float 1.000000e+00, %tmp418
+ %tmp420 = fmul float %tmp417, %tmp412
+ %tmp421 = fadd float %tmp420, %tmp405
+ %tmp422 = fmul float %tmp419, %tmp413
+ %tmp423 = fadd float %tmp422, %tmp407
+ %tmp424 = fmul float %tmp421, %tmp421
+ %tmp425 = fmul float %tmp423, %tmp423
+ %tmp426 = fadd float %tmp424, %tmp425
+ %tmp427 = fsub float -0.000000e+00, %tmp426
+ %tmp428 = fadd float 0x3FF00068E0000000, %tmp427
+ %tmp429 = call float @llvm.AMDGPU.clamp.f32(float %tmp428, float 0.000000e+00, float 1.000000e+00)
+ %tmp430 = call float @llvm.amdgcn.rsq.f32(float %tmp429)
+ %tmp431 = fmul float %tmp430, %tmp429
+ %tmp432 = fsub float -0.000000e+00, %tmp429
+ %cmp = fcmp ogt float 0.000000e+00, %tmp432
+ %tmp433 = select i1 %cmp, float %tmp431, float 0.000000e+00
+ %tmp434 = fmul float %tmp183, %tmp421
+ %tmp435 = fmul float %tmp184, %tmp421
+ %tmp436 = fmul float %tmp185, %tmp421
+ %tmp437 = fmul float %tmp186, %tmp423
+ %tmp438 = fadd float %tmp437, %tmp434
+ %tmp439 = fmul float %tmp187, %tmp423
+ %tmp440 = fadd float %tmp439, %tmp435
+ %tmp441 = fmul float %tmp188, %tmp423
+ %tmp442 = fadd float %tmp441, %tmp436
+ %tmp443 = fmul float %tmp189, %tmp433
+ %tmp444 = fadd float %tmp443, %tmp438
+ %tmp445 = fmul float %tmp190, %tmp433
+ %tmp446 = fadd float %tmp445, %tmp440
+ %tmp447 = fmul float %tmp191, %tmp433
+ %tmp448 = fadd float %tmp447, %tmp442
+ %tmp449 = fmul float %tmp444, %tmp444
+ %tmp450 = fmul float %tmp446, %tmp446
+ %tmp451 = fadd float %tmp450, %tmp449
+ %tmp452 = fmul float %tmp448, %tmp448
+ %tmp453 = fadd float %tmp451, %tmp452
+ %tmp454 = call float @llvm.amdgcn.rsq.f32(float %tmp453)
+ %tmp455 = fmul float %tmp444, %tmp454
+ %tmp456 = fmul float %tmp446, %tmp454
+ %tmp457 = fmul float %tmp448, %tmp454
+ %tmp458 = fcmp olt float 0.000000e+00, %tmp218
+ %tmp459 = sext i1 %tmp458 to i32
+ %tmp460 = bitcast i32 %tmp459 to float
+ %tmp461 = bitcast float %tmp460 to i32
+ %tmp462 = icmp ne i32 %tmp461, 0
+ br i1 %tmp462, label %IF198, label %ENDIF197
IF198: ; preds = %IF189
- %470 = fsub float -0.000000e+00, %462
- %471 = fsub float -0.000000e+00, %463
- %472 = fsub float -0.000000e+00, %464
+ %tmp463 = fsub float -0.000000e+00, %tmp455
+ %tmp464 = fsub float -0.000000e+00, %tmp456
+ %tmp465 = fsub float -0.000000e+00, %tmp457
br label %ENDIF197
-ENDIF197: ; preds = %IF189, %IF198
- %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ]
- %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ]
- %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ]
- %473 = bitcast float %220 to i32
- %474 = bitcast float %221 to i32
- %475 = insertelement <2 x i32> undef, i32 %473, i32 0
- %476 = insertelement <2 x i32> %475, i32 %474, i32 1
- %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2)
- %478 = extractelement <4 x float> %477, i32 0
- %479 = extractelement <4 x float> %477, i32 1
- %480 = extractelement <4 x float> %477, i32 2
- %481 = extractelement <4 x float> %477, i32 3
- %482 = fmul float %478, %40
- %483 = fadd float %482, %41
- %484 = fmul float %479, %40
- %485 = fadd float %484, %41
- %486 = fmul float %480, %40
- %487 = fadd float %486, %41
- %488 = fmul float %481, %42
- %489 = fadd float %488, %43
- %490 = bitcast float %172 to i32
- %491 = bitcast float %173 to i32
- %492 = insertelement <2 x i32> undef, i32 %490, i32 0
- %493 = insertelement <2 x i32> %492, i32 %491, i32 1
- %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2)
- %495 = extractelement <4 x float> %494, i32 0
- %496 = extractelement <4 x float> %494, i32 1
- %497 = extractelement <4 x float> %494, i32 2
- %498 = extractelement <4 x float> %494, i32 3
- %499 = fmul float %498, 3.200000e+01
- %500 = fadd float %499, -1.600000e+01
- %501 = call float @llvm.AMDIL.exp.(float %500)
- %502 = fmul float %495, %501
- %503 = fmul float %496, %501
- %504 = fmul float %497, %501
- %505 = fmul float %28, %502
- %506 = fadd float %505, %193
- %507 = fmul float %29, %503
- %508 = fadd float %507, %194
- %509 = fmul float %30, %504
- %510 = fadd float %509, %195
- %511 = fmul float %506, %489
- %512 = fmul float %508, %489
- %513 = fmul float %510, %489
- %514 = fmul float %489, 5.000000e-01
- %515 = fadd float %514, 5.000000e-01
- %516 = fmul float %483, %515
- %517 = fadd float %516, %511
- %518 = fmul float %485, %515
- %519 = fadd float %518, %512
- %520 = fmul float %487, %515
- %521 = fadd float %520, %513
- %522 = fmul float %517, %371
- %523 = fmul float %519, %372
- %524 = fmul float %521, %373
- %525 = fmul float %428, 0x3FDB272440000000
- %526 = fmul float %430, 0xBFDB272440000000
- %527 = fadd float %526, %525
- %528 = fmul float %440, 0x3FE99999A0000000
- %529 = fadd float %527, %528
- %530 = fmul float %529, 5.000000e-01
- %531 = fadd float %530, 0x3FE3333340000000
- %532 = fmul float %531, %531
- %533 = fmul float %522, %532
- %534 = fmul float %523, %532
- %535 = fmul float %524, %532
- %536 = fsub float -0.000000e+00, %72
- %537 = fsub float -0.000000e+00, %73
- %538 = fsub float -0.000000e+00, %74
- %539 = fmul float %temp12.0, %536
- %540 = fmul float %temp13.0, %537
- %541 = fadd float %540, %539
- %542 = fmul float %temp14.0, %538
- %543 = fadd float %541, %542
- %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00)
- %545 = fmul float %371, %544
- %546 = fmul float %372, %544
- %547 = fmul float %373, %544
- %548 = fmul float %545, %69
- %549 = fmul float %546, %70
- %550 = fmul float %547, %71
- %551 = fsub float -0.000000e+00, %164
- %552 = fadd float %97, %551
- %553 = fsub float -0.000000e+00, %165
- %554 = fadd float %98, %553
- %555 = fsub float -0.000000e+00, %166
- %556 = fadd float %99, %555
- %557 = fmul float %552, %552
- %558 = fmul float %554, %554
- %559 = fadd float %558, %557
- %560 = fmul float %556, %556
- %561 = fadd float %559, %560
- %562 = call float @llvm.AMDGPU.rsq.f32(float %561)
- %563 = fmul float %562, %561
- %564 = fsub float -0.000000e+00, %561
- %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00)
- %566 = fsub float -0.000000e+00, %84
- %567 = fadd float %565, %566
- %568 = fsub float -0.000000e+00, %83
- %569 = fadd float %565, %568
- %570 = fsub float -0.000000e+00, %82
- %571 = fadd float %565, %570
- %572 = fsub float -0.000000e+00, %84
- %573 = fadd float %83, %572
- %574 = fsub float -0.000000e+00, %83
- %575 = fadd float %82, %574
- %576 = fsub float -0.000000e+00, %82
- %577 = fadd float %81, %576
- %578 = fdiv float 1.000000e+00, %573
- %579 = fdiv float 1.000000e+00, %575
- %580 = fdiv float 1.000000e+00, %577
- %581 = fmul float %567, %578
- %582 = fmul float %569, %579
- %583 = fmul float %571, %580
- %584 = fcmp olt float %565, %83
- %585 = sext i1 %584 to i32
- %586 = bitcast i32 %585 to float
- %587 = bitcast float %586 to i32
- %588 = icmp ne i32 %587, 0
- br i1 %588, label %ENDIF200, label %ELSE202
+ENDIF197: ; preds = %IF198, %IF189
+ %temp14.0 = phi float [ %tmp465, %IF198 ], [ %tmp457, %IF189 ]
+ %temp13.0 = phi float [ %tmp464, %IF198 ], [ %tmp456, %IF189 ]
+ %temp12.0 = phi float [ %tmp463, %IF198 ], [ %tmp455, %IF189 ]
+ %tmp466 = bitcast float %tmp219 to i32
+ %tmp467 = bitcast float %tmp220 to i32
+ %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0
+ %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1
+ %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32>
+ %tmp470 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp469, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp471 = extractelement <4 x float> %tmp470, i32 0
+ %tmp472 = extractelement <4 x float> %tmp470, i32 1
+ %tmp473 = extractelement <4 x float> %tmp470, i32 2
+ %tmp474 = extractelement <4 x float> %tmp470, i32 3
+ %tmp475 = fmul float %tmp471, %tmp39
+ %tmp476 = fadd float %tmp475, %tmp40
+ %tmp477 = fmul float %tmp472, %tmp39
+ %tmp478 = fadd float %tmp477, %tmp40
+ %tmp479 = fmul float %tmp473, %tmp39
+ %tmp480 = fadd float %tmp479, %tmp40
+ %tmp481 = fmul float %tmp474, %tmp41
+ %tmp482 = fadd float %tmp481, %tmp42
+ %tmp483 = bitcast float %tmp171 to i32
+ %tmp484 = bitcast float %tmp172 to i32
+ %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0
+ %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1
+ %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32>
+ %tmp487 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp486, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp488 = extractelement <4 x float> %tmp487, i32 0
+ %tmp489 = extractelement <4 x float> %tmp487, i32 1
+ %tmp490 = extractelement <4 x float> %tmp487, i32 2
+ %tmp491 = extractelement <4 x float> %tmp487, i32 3
+ %tmp492 = fmul float %tmp491, 3.200000e+01
+ %tmp493 = fadd float %tmp492, -1.600000e+01
+ %tmp494 = call float @llvm.exp2.f32(float %tmp493)
+ %tmp495 = fmul float %tmp488, %tmp494
+ %tmp496 = fmul float %tmp489, %tmp494
+ %tmp497 = fmul float %tmp490, %tmp494
+ %tmp498 = fmul float %tmp27, %tmp495
+ %tmp499 = fadd float %tmp498, %tmp192
+ %tmp500 = fmul float %tmp28, %tmp496
+ %tmp501 = fadd float %tmp500, %tmp193
+ %tmp502 = fmul float %tmp29, %tmp497
+ %tmp503 = fadd float %tmp502, %tmp194
+ %tmp504 = fmul float %tmp499, %tmp482
+ %tmp505 = fmul float %tmp501, %tmp482
+ %tmp506 = fmul float %tmp503, %tmp482
+ %tmp507 = fmul float %tmp482, 5.000000e-01
+ %tmp508 = fadd float %tmp507, 5.000000e-01
+ %tmp509 = fmul float %tmp476, %tmp508
+ %tmp510 = fadd float %tmp509, %tmp504
+ %tmp511 = fmul float %tmp478, %tmp508
+ %tmp512 = fadd float %tmp511, %tmp505
+ %tmp513 = fmul float %tmp480, %tmp508
+ %tmp514 = fadd float %tmp513, %tmp506
+ %tmp515 = fmul float %tmp510, %tmp364
+ %tmp516 = fmul float %tmp512, %tmp365
+ %tmp517 = fmul float %tmp514, %tmp366
+ %tmp518 = fmul float %tmp421, 0x3FDB272440000000
+ %tmp519 = fmul float %tmp423, 0xBFDB272440000000
+ %tmp520 = fadd float %tmp519, %tmp518
+ %tmp521 = fmul float %tmp433, 0x3FE99999A0000000
+ %tmp522 = fadd float %tmp520, %tmp521
+ %tmp523 = fmul float %tmp522, 5.000000e-01
+ %tmp524 = fadd float %tmp523, 0x3FE3333340000000
+ %tmp525 = fmul float %tmp524, %tmp524
+ %tmp526 = fmul float %tmp515, %tmp525
+ %tmp527 = fmul float %tmp516, %tmp525
+ %tmp528 = fmul float %tmp517, %tmp525
+ %tmp529 = fsub float -0.000000e+00, %tmp71
+ %tmp530 = fsub float -0.000000e+00, %tmp72
+ %tmp531 = fsub float -0.000000e+00, %tmp73
+ %tmp532 = fmul float %temp12.0, %tmp529
+ %tmp533 = fmul float %temp13.0, %tmp530
+ %tmp534 = fadd float %tmp533, %tmp532
+ %tmp535 = fmul float %temp14.0, %tmp531
+ %tmp536 = fadd float %tmp534, %tmp535
+ %tmp537 = call float @llvm.AMDGPU.clamp.f32(float %tmp536, float 0.000000e+00, float 1.000000e+00)
+ %tmp538 = fmul float %tmp364, %tmp537
+ %tmp539 = fmul float %tmp365, %tmp537
+ %tmp540 = fmul float %tmp366, %tmp537
+ %tmp541 = fmul float %tmp538, %tmp68
+ %tmp542 = fmul float %tmp539, %tmp69
+ %tmp543 = fmul float %tmp540, %tmp70
+ %tmp544 = fsub float -0.000000e+00, %tmp163
+ %tmp545 = fadd float %tmp96, %tmp544
+ %tmp546 = fsub float -0.000000e+00, %tmp164
+ %tmp547 = fadd float %tmp97, %tmp546
+ %tmp548 = fsub float -0.000000e+00, %tmp165
+ %tmp549 = fadd float %tmp98, %tmp548
+ %tmp550 = fmul float %tmp545, %tmp545
+ %tmp551 = fmul float %tmp547, %tmp547
+ %tmp552 = fadd float %tmp551, %tmp550
+ %tmp553 = fmul float %tmp549, %tmp549
+ %tmp554 = fadd float %tmp552, %tmp553
+ %tmp555 = call float @llvm.amdgcn.rsq.f32(float %tmp554)
+ %tmp556 = fmul float %tmp555, %tmp554
+ %tmp557 = fsub float -0.000000e+00, %tmp554
+ %cmp1 = fcmp ogt float %tmp557, 0.000000e+00
+ %tmp558 = select i1 %cmp1, float %tmp556, float 0.000000e+00
+ %tmp559 = fsub float -0.000000e+00, %tmp83
+ %tmp560 = fadd float %tmp558, %tmp559
+ %tmp561 = fsub float -0.000000e+00, %tmp82
+ %tmp562 = fadd float %tmp558, %tmp561
+ %tmp563 = fsub float -0.000000e+00, %tmp81
+ %tmp564 = fadd float %tmp558, %tmp563
+ %tmp565 = fsub float -0.000000e+00, %tmp83
+ %tmp566 = fadd float %tmp82, %tmp565
+ %tmp567 = fsub float -0.000000e+00, %tmp82
+ %tmp568 = fadd float %tmp81, %tmp567
+ %tmp569 = fsub float -0.000000e+00, %tmp81
+ %tmp570 = fadd float %tmp80, %tmp569
+ %tmp571 = fdiv float 1.000000e+00, %tmp566
+ %tmp572 = fdiv float 1.000000e+00, %tmp568
+ %tmp573 = fdiv float 1.000000e+00, %tmp570
+ %tmp574 = fmul float %tmp560, %tmp571
+ %tmp575 = fmul float %tmp562, %tmp572
+ %tmp576 = fmul float %tmp564, %tmp573
+ %tmp577 = fcmp olt float %tmp558, %tmp82
+ %tmp578 = sext i1 %tmp577 to i32
+ %tmp579 = bitcast i32 %tmp578 to float
+ %tmp580 = bitcast float %tmp579 to i32
+ %tmp581 = icmp ne i32 %tmp580, 0
+ br i1 %tmp581, label %ENDIF200, label %ELSE202
ELSE202: ; preds = %ENDIF197
- %589 = fcmp olt float %565, %82
- %590 = sext i1 %589 to i32
- %591 = bitcast i32 %590 to float
- %592 = bitcast float %591 to i32
- %593 = icmp ne i32 %592, 0
- br i1 %593, label %ENDIF200, label %ELSE205
+ %tmp582 = fcmp olt float %tmp558, %tmp81
+ %tmp583 = sext i1 %tmp582 to i32
+ %tmp584 = bitcast i32 %tmp583 to float
+ %tmp585 = bitcast float %tmp584 to i32
+ %tmp586 = icmp ne i32 %tmp585, 0
+ br i1 %tmp586, label %ENDIF200, label %ELSE205
ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197
- %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ]
- %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ]
- %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ]
- %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ]
- %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ]
- %594 = fcmp olt float %565, %83
- %595 = sext i1 %594 to i32
- %596 = bitcast i32 %595 to float
- %597 = bitcast float %596 to i32
- %598 = icmp ne i32 %597, 0
- br i1 %598, label %ENDIF209, label %ELSE211
+ %temp80.0 = phi float [ %tmp574, %ENDIF197 ], [ %.226, %ELSE205 ], [ %tmp575, %ELSE202 ]
+ %temp88.0 = phi float [ %tmp121, %ENDIF197 ], [ %.227, %ELSE205 ], [ %tmp119, %ELSE202 ]
+ %temp89.0 = phi float [ %tmp122, %ENDIF197 ], [ %.228, %ELSE205 ], [ %tmp120, %ELSE202 ]
+ %temp90.0 = phi float [ %tmp119, %ENDIF197 ], [ %tmp115, %ELSE205 ], [ %tmp117, %ELSE202 ]
+ %temp91.0 = phi float [ %tmp120, %ENDIF197 ], [ %tmp116, %ELSE205 ], [ %tmp118, %ELSE202 ]
+ %tmp587 = fcmp olt float %tmp558, %tmp82
+ %tmp588 = sext i1 %tmp587 to i32
+ %tmp589 = bitcast i32 %tmp588 to float
+ %tmp590 = bitcast float %tmp589 to i32
+ %tmp591 = icmp ne i32 %tmp590, 0
+ br i1 %tmp591, label %ENDIF209, label %ELSE211
ELSE205: ; preds = %ELSE202
- %599 = fcmp olt float %565, %81
- %600 = sext i1 %599 to i32
- %601 = bitcast i32 %600 to float
- %602 = bitcast float %601 to i32
- %603 = icmp ne i32 %602, 0
- %.226 = select i1 %603, float %583, float 1.000000e+00
- %.227 = select i1 %603, float %118, float %116
- %.228 = select i1 %603, float %119, float %117
+ %tmp592 = fcmp olt float %tmp558, %tmp80
+ %tmp593 = sext i1 %tmp592 to i32
+ %tmp594 = bitcast i32 %tmp593 to float
+ %tmp595 = bitcast float %tmp594 to i32
+ %tmp596 = icmp ne i32 %tmp595, 0
+ %.226 = select i1 %tmp596, float %tmp576, float 1.000000e+00
+ %.227 = select i1 %tmp596, float %tmp117, float %tmp115
+ %.228 = select i1 %tmp596, float %tmp118, float %tmp116
br label %ENDIF200
ELSE211: ; preds = %ENDIF200
- %604 = fcmp olt float %565, %82
- %605 = sext i1 %604 to i32
- %606 = bitcast i32 %605 to float
- %607 = bitcast float %606 to i32
- %608 = icmp ne i32 %607, 0
- br i1 %608, label %ENDIF209, label %ELSE214
+ %tmp597 = fcmp olt float %tmp558, %tmp81
+ %tmp598 = sext i1 %tmp597 to i32
+ %tmp599 = bitcast i32 %tmp598 to float
+ %tmp600 = bitcast float %tmp599 to i32
+ %tmp601 = icmp ne i32 %tmp600, 0
+ br i1 %tmp601, label %ENDIF209, label %ELSE214
ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200
- %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ]
- %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ]
- %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ]
- %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ]
- %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ]
- %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ]
- %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ]
- %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ]
- %609 = fmul float %164, %85
- %610 = fmul float %165, %86
- %611 = fadd float %609, %610
- %612 = fmul float %166, %87
- %613 = fadd float %611, %612
- %614 = fmul float %167, %88
- %615 = fadd float %613, %614
- %616 = fmul float %164, %89
- %617 = fmul float %165, %90
- %618 = fadd float %616, %617
- %619 = fmul float %166, %91
- %620 = fadd float %618, %619
- %621 = fmul float %167, %92
- %622 = fadd float %620, %621
- %623 = fmul float %164, %93
- %624 = fmul float %165, %94
- %625 = fadd float %623, %624
- %626 = fmul float %166, %95
- %627 = fadd float %625, %626
- %628 = fmul float %167, %96
- %629 = fadd float %627, %628
- %630 = fsub float -0.000000e+00, %78
- %631 = fadd float 1.000000e+00, %630
- %632 = call float @fabs(float %615)
- %633 = call float @fabs(float %622)
- %634 = fcmp oge float %631, %632
- %635 = sext i1 %634 to i32
- %636 = bitcast i32 %635 to float
- %637 = bitcast float %636 to i32
- %638 = and i32 %637, 1065353216
- %639 = bitcast i32 %638 to float
- %640 = fcmp oge float %631, %633
- %641 = sext i1 %640 to i32
- %642 = bitcast i32 %641 to float
- %643 = bitcast float %642 to i32
- %644 = and i32 %643, 1065353216
- %645 = bitcast i32 %644 to float
- %646 = fmul float %639, %645
- %647 = fmul float %629, %646
- %648 = fmul float %615, %temp68.0
- %649 = fadd float %648, %temp70.0
- %650 = fmul float %622, %temp69.0
- %651 = fadd float %650, %temp71.0
- %652 = fmul float %615, %temp52.0
- %653 = fadd float %652, %temp54.0
- %654 = fmul float %622, %temp53.0
- %655 = fadd float %654, %temp55.0
- %656 = fadd float %temp80.0, -1.000000e+00
- %657 = fmul float %656, %77
- %658 = fadd float %657, 1.000000e+00
- %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00)
- %660 = bitcast float %649 to i32
- %661 = bitcast float %651 to i32
- %662 = bitcast float 0.000000e+00 to i32
- %663 = insertelement <4 x i32> undef, i32 %660, i32 0
- %664 = insertelement <4 x i32> %663, i32 %661, i32 1
- %665 = insertelement <4 x i32> %664, i32 %662, i32 2
- %666 = insertelement <4 x i32> %665, i32 undef, i32 3
- %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2)
- %668 = extractelement <4 x float> %667, i32 0
- %669 = extractelement <4 x float> %667, i32 1
- %670 = bitcast float %653 to i32
- %671 = bitcast float %655 to i32
- %672 = bitcast float 0.000000e+00 to i32
- %673 = insertelement <4 x i32> undef, i32 %670, i32 0
- %674 = insertelement <4 x i32> %673, i32 %671, i32 1
- %675 = insertelement <4 x i32> %674, i32 %672, i32 2
- %676 = insertelement <4 x i32> %675, i32 undef, i32 3
- %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2)
- %678 = extractelement <4 x float> %677, i32 0
- %679 = extractelement <4 x float> %677, i32 1
- %680 = fsub float -0.000000e+00, %669
- %681 = fadd float 1.000000e+00, %680
- %682 = fsub float -0.000000e+00, %679
- %683 = fadd float 1.000000e+00, %682
- %684 = fmul float %681, 2.500000e-01
- %685 = fmul float %683, 2.500000e-01
- %686 = fsub float -0.000000e+00, %684
- %687 = fadd float %668, %686
- %688 = fsub float -0.000000e+00, %685
- %689 = fadd float %678, %688
- %690 = fmul float %647, %temp88.0
- %691 = fadd float %690, %temp89.0
- %692 = fmul float %647, %temp90.0
- %693 = fadd float %692, %temp91.0
- %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00)
- %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00)
- %696 = fsub float -0.000000e+00, %694
- %697 = fadd float %668, %696
- %698 = fsub float -0.000000e+00, %695
- %699 = fadd float %678, %698
- %700 = fmul float %668, %668
- %701 = fmul float %678, %678
- %702 = fsub float -0.000000e+00, %700
- %703 = fadd float %687, %702
- %704 = fsub float -0.000000e+00, %701
- %705 = fadd float %689, %704
- %706 = fcmp uge float %703, %75
- %707 = select i1 %706, float %703, float %75
- %708 = fcmp uge float %705, %75
- %709 = select i1 %708, float %705, float %75
- %710 = fmul float %697, %697
- %711 = fadd float %710, %707
- %712 = fmul float %699, %699
- %713 = fadd float %712, %709
- %714 = fdiv float 1.000000e+00, %711
- %715 = fdiv float 1.000000e+00, %713
- %716 = fmul float %707, %714
- %717 = fmul float %709, %715
- %718 = fcmp oge float %697, 0.000000e+00
- %719 = sext i1 %718 to i32
- %720 = bitcast i32 %719 to float
- %721 = bitcast float %720 to i32
- %722 = icmp ne i32 %721, 0
- %.229 = select i1 %722, float 1.000000e+00, float %716
- %723 = fcmp oge float %699, 0.000000e+00
- %724 = sext i1 %723 to i32
- %725 = bitcast i32 %724 to float
- %726 = bitcast float %725 to i32
- %727 = icmp ne i32 %726, 0
- %temp28.0 = select i1 %727, float 1.000000e+00, float %717
- %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229)
- %729 = call float @llvm.pow.f32(float %728, float %76)
- %730 = fmul float %729, %79
- %731 = fadd float %730, %80
- %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00)
- %733 = fmul float %732, %732
- %734 = fmul float 2.000000e+00, %732
- %735 = fsub float -0.000000e+00, %734
- %736 = fadd float 3.000000e+00, %735
- %737 = fmul float %733, %736
- %738 = fmul float %548, %737
- %739 = fmul float %549, %737
- %740 = fmul float %550, %737
- %741 = fmul float %738, %515
- %742 = fadd float %741, %533
- %743 = fmul float %739, %515
- %744 = fadd float %743, %534
- %745 = fmul float %740, %515
- %746 = fadd float %745, %535
- %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00)
- %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00)
- %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00)
- %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00)
- %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00)
- %752 = fmul float %748, %751
- %753 = fmul float %749, %751
- %754 = fmul float %750, %751
- %755 = fmul float %742, %752
- %756 = fmul float %744, %753
- %757 = fmul float %746, %754
- %758 = fmul float %temp12.0, %216
- %759 = fmul float %temp13.0, %217
- %760 = fadd float %759, %758
- %761 = fmul float %temp14.0, %218
- %762 = fadd float %760, %761
- %763 = call float @fabs(float %762)
- %764 = fmul float %763, %763
- %765 = fmul float %764, %50
- %766 = fadd float %765, %51
- %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00)
- %768 = fsub float -0.000000e+00, %767
- %769 = fadd float 1.000000e+00, %768
- %770 = fmul float %33, %769
- %771 = fmul float %33, %769
- %772 = fmul float %33, %769
- %773 = fmul float %34, %769
- %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755)
- %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756)
- %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757)
- %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374)
- %778 = fcmp uge float %774, 0x3E6FFFFE60000000
- %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000
- %780 = fcmp uge float %775, 0x3E6FFFFE60000000
- %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000
- %782 = fcmp uge float %776, 0x3E6FFFFE60000000
- %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000
- %784 = fcmp uge float %779, 6.550400e+04
- %785 = select i1 %784, float 6.550400e+04, float %779
- %786 = fcmp uge float %781, 6.550400e+04
- %787 = select i1 %786, float 6.550400e+04, float %781
- %788 = fcmp uge float %783, 6.550400e+04
- %789 = select i1 %788, float 6.550400e+04, float %783
- %790 = fmul float %777, %52
- %791 = fadd float %790, %53
- %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00)
- %793 = call i32 @llvm.SI.packf16(float %785, float %787)
- %794 = bitcast i32 %793 to float
- %795 = call i32 @llvm.SI.packf16(float %789, float %792)
- %796 = bitcast i32 %795 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796)
+ %temp52.0 = phi float [ %tmp107, %ENDIF200 ], [ %tmp99, %ELSE214 ], [ %tmp103, %ELSE211 ]
+ %temp53.0 = phi float [ %tmp108, %ENDIF200 ], [ %tmp100, %ELSE214 ], [ %tmp104, %ELSE211 ]
+ %temp54.0 = phi float [ %tmp109, %ENDIF200 ], [ %tmp101, %ELSE214 ], [ %tmp105, %ELSE211 ]
+ %temp55.0 = phi float [ %tmp110, %ENDIF200 ], [ %tmp102, %ELSE214 ], [ %tmp106, %ELSE211 ]
+ %temp68.0 = phi float [ %tmp111, %ENDIF200 ], [ %.230, %ELSE214 ], [ %tmp107, %ELSE211 ]
+ %temp69.0 = phi float [ %tmp112, %ENDIF200 ], [ %.231, %ELSE214 ], [ %tmp108, %ELSE211 ]
+ %temp70.0 = phi float [ %tmp113, %ENDIF200 ], [ %.232, %ELSE214 ], [ %tmp109, %ELSE211 ]
+ %temp71.0 = phi float [ %tmp114, %ENDIF200 ], [ %.233, %ELSE214 ], [ %tmp110, %ELSE211 ]
+ %tmp602 = fmul float %tmp163, %tmp84
+ %tmp603 = fmul float %tmp164, %tmp85
+ %tmp604 = fadd float %tmp602, %tmp603
+ %tmp605 = fmul float %tmp165, %tmp86
+ %tmp606 = fadd float %tmp604, %tmp605
+ %tmp607 = fmul float %tmp166, %tmp87
+ %tmp608 = fadd float %tmp606, %tmp607
+ %tmp609 = fmul float %tmp163, %tmp88
+ %tmp610 = fmul float %tmp164, %tmp89
+ %tmp611 = fadd float %tmp609, %tmp610
+ %tmp612 = fmul float %tmp165, %tmp90
+ %tmp613 = fadd float %tmp611, %tmp612
+ %tmp614 = fmul float %tmp166, %tmp91
+ %tmp615 = fadd float %tmp613, %tmp614
+ %tmp616 = fmul float %tmp163, %tmp92
+ %tmp617 = fmul float %tmp164, %tmp93
+ %tmp618 = fadd float %tmp616, %tmp617
+ %tmp619 = fmul float %tmp165, %tmp94
+ %tmp620 = fadd float %tmp618, %tmp619
+ %tmp621 = fmul float %tmp166, %tmp95
+ %tmp622 = fadd float %tmp620, %tmp621
+ %tmp623 = fsub float -0.000000e+00, %tmp77
+ %tmp624 = fadd float 1.000000e+00, %tmp623
+ %tmp625 = call float @fabs(float %tmp608)
+ %tmp626 = call float @fabs(float %tmp615)
+ %tmp627 = fcmp oge float %tmp624, %tmp625
+ %tmp628 = sext i1 %tmp627 to i32
+ %tmp629 = bitcast i32 %tmp628 to float
+ %tmp630 = bitcast float %tmp629 to i32
+ %tmp631 = and i32 %tmp630, 1065353216
+ %tmp632 = bitcast i32 %tmp631 to float
+ %tmp633 = fcmp oge float %tmp624, %tmp626
+ %tmp634 = sext i1 %tmp633 to i32
+ %tmp635 = bitcast i32 %tmp634 to float
+ %tmp636 = bitcast float %tmp635 to i32
+ %tmp637 = and i32 %tmp636, 1065353216
+ %tmp638 = bitcast i32 %tmp637 to float
+ %tmp639 = fmul float %tmp632, %tmp638
+ %tmp640 = fmul float %tmp622, %tmp639
+ %tmp641 = fmul float %tmp608, %temp68.0
+ %tmp642 = fadd float %tmp641, %temp70.0
+ %tmp643 = fmul float %tmp615, %temp69.0
+ %tmp644 = fadd float %tmp643, %temp71.0
+ %tmp645 = fmul float %tmp608, %temp52.0
+ %tmp646 = fadd float %tmp645, %temp54.0
+ %tmp647 = fmul float %tmp615, %temp53.0
+ %tmp648 = fadd float %tmp647, %temp55.0
+ %tmp649 = fadd float %temp80.0, -1.000000e+00
+ %tmp650 = fmul float %tmp649, %tmp76
+ %tmp651 = fadd float %tmp650, 1.000000e+00
+ %tmp652 = call float @llvm.AMDGPU.clamp.f32(float %tmp651, float 0.000000e+00, float 1.000000e+00)
+ %tmp653 = bitcast float %tmp642 to i32
+ %tmp654 = bitcast float %tmp644 to i32
+ %tmp655 = bitcast float 0.000000e+00 to i32
+ %tmp656 = insertelement <4 x i32> undef, i32 %tmp653, i32 0
+ %tmp657 = insertelement <4 x i32> %tmp656, i32 %tmp654, i32 1
+ %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2
+ %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3
+ %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32>
+ %tmp660 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp659, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp661 = extractelement <4 x float> %tmp660, i32 0
+ %tmp662 = extractelement <4 x float> %tmp660, i32 1
+ %tmp663 = bitcast float %tmp646 to i32
+ %tmp664 = bitcast float %tmp648 to i32
+ %tmp665 = bitcast float 0.000000e+00 to i32
+ %tmp666 = insertelement <4 x i32> undef, i32 %tmp663, i32 0
+ %tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1
+ %tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2
+ %tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3
+ %tmp670 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp669, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp671 = extractelement <4 x float> %tmp670, i32 0
+ %tmp672 = extractelement <4 x float> %tmp670, i32 1
+ %tmp673 = fsub float -0.000000e+00, %tmp662
+ %tmp674 = fadd float 1.000000e+00, %tmp673
+ %tmp675 = fsub float -0.000000e+00, %tmp672
+ %tmp676 = fadd float 1.000000e+00, %tmp675
+ %tmp677 = fmul float %tmp674, 2.500000e-01
+ %tmp678 = fmul float %tmp676, 2.500000e-01
+ %tmp679 = fsub float -0.000000e+00, %tmp677
+ %tmp680 = fadd float %tmp661, %tmp679
+ %tmp681 = fsub float -0.000000e+00, %tmp678
+ %tmp682 = fadd float %tmp671, %tmp681
+ %tmp683 = fmul float %tmp640, %temp88.0
+ %tmp684 = fadd float %tmp683, %temp89.0
+ %tmp685 = fmul float %tmp640, %temp90.0
+ %tmp686 = fadd float %tmp685, %temp91.0
+ %tmp687 = call float @llvm.AMDGPU.clamp.f32(float %tmp684, float 0.000000e+00, float 1.000000e+00)
+ %tmp688 = call float @llvm.AMDGPU.clamp.f32(float %tmp686, float 0.000000e+00, float 1.000000e+00)
+ %tmp689 = fsub float -0.000000e+00, %tmp687
+ %tmp690 = fadd float %tmp661, %tmp689
+ %tmp691 = fsub float -0.000000e+00, %tmp688
+ %tmp692 = fadd float %tmp671, %tmp691
+ %tmp693 = fmul float %tmp661, %tmp661
+ %tmp694 = fmul float %tmp671, %tmp671
+ %tmp695 = fsub float -0.000000e+00, %tmp693
+ %tmp696 = fadd float %tmp680, %tmp695
+ %tmp697 = fsub float -0.000000e+00, %tmp694
+ %tmp698 = fadd float %tmp682, %tmp697
+ %tmp699 = fcmp uge float %tmp696, %tmp74
+ %tmp700 = select i1 %tmp699, float %tmp696, float %tmp74
+ %tmp701 = fcmp uge float %tmp698, %tmp74
+ %tmp702 = select i1 %tmp701, float %tmp698, float %tmp74
+ %tmp703 = fmul float %tmp690, %tmp690
+ %tmp704 = fadd float %tmp703, %tmp700
+ %tmp705 = fmul float %tmp692, %tmp692
+ %tmp706 = fadd float %tmp705, %tmp702
+ %tmp707 = fdiv float 1.000000e+00, %tmp704
+ %tmp708 = fdiv float 1.000000e+00, %tmp706
+ %tmp709 = fmul float %tmp700, %tmp707
+ %tmp710 = fmul float %tmp702, %tmp708
+ %tmp711 = fcmp oge float %tmp690, 0.000000e+00
+ %tmp712 = sext i1 %tmp711 to i32
+ %tmp713 = bitcast i32 %tmp712 to float
+ %tmp714 = bitcast float %tmp713 to i32
+ %tmp715 = icmp ne i32 %tmp714, 0
+ %.229 = select i1 %tmp715, float 1.000000e+00, float %tmp709
+ %tmp716 = fcmp oge float %tmp692, 0.000000e+00
+ %tmp717 = sext i1 %tmp716 to i32
+ %tmp718 = bitcast i32 %tmp717 to float
+ %tmp719 = bitcast float %tmp718 to i32
+ %tmp720 = icmp ne i32 %tmp719, 0
+ %temp28.0 = select i1 %tmp720, float 1.000000e+00, float %tmp710
+ %one.sub.a.i25 = fsub float 1.000000e+00, %tmp652
+ %one.sub.ac.i26 = fmul float %one.sub.a.i25, %.229
+ %mul.i27 = fmul float %temp28.0, %.229
+ %result.i28 = fadd float %mul.i27, %one.sub.ac.i26
+ %tmp721 = call float @llvm.pow.f32(float %result.i28, float %tmp75)
+ %tmp722 = fmul float %tmp721, %tmp78
+ %tmp723 = fadd float %tmp722, %tmp79
+ %tmp724 = call float @llvm.AMDGPU.clamp.f32(float %tmp723, float 0.000000e+00, float 1.000000e+00)
+ %tmp725 = fmul float %tmp724, %tmp724
+ %tmp726 = fmul float 2.000000e+00, %tmp724
+ %tmp727 = fsub float -0.000000e+00, %tmp726
+ %tmp728 = fadd float 3.000000e+00, %tmp727
+ %tmp729 = fmul float %tmp725, %tmp728
+ %tmp730 = fmul float %tmp541, %tmp729
+ %tmp731 = fmul float %tmp542, %tmp729
+ %tmp732 = fmul float %tmp543, %tmp729
+ %tmp733 = fmul float %tmp730, %tmp508
+ %tmp734 = fadd float %tmp733, %tmp526
+ %tmp735 = fmul float %tmp731, %tmp508
+ %tmp736 = fadd float %tmp735, %tmp527
+ %tmp737 = fmul float %tmp732, %tmp508
+ %tmp738 = fadd float %tmp737, %tmp528
+ %one.sub.a.i23 = fsub float 1.000000e+00, %tmp229
+ %result.i24 = fadd float %tmp283, %one.sub.a.i23
+ %one.sub.a.i21 = fsub float 1.000000e+00, %tmp36
+ %result.i22 = fadd float %tmp294, %one.sub.a.i21
+ %one.sub.a.i19 = fsub float 1.000000e+00, %tmp36
+ %result.i20 = fadd float %tmp295, %one.sub.a.i19
+ %one.sub.a.i17 = fsub float 1.000000e+00, %tmp36
+ %result.i18 = fadd float %tmp296, %one.sub.a.i17
+ %one.sub.a.i15 = fsub float 1.000000e+00, %tmp37
+ %result.i16 = fadd float %result.i24, %one.sub.a.i15
+ %tmp739 = fmul float %result.i22, %result.i16
+ %tmp740 = fmul float %result.i20, %result.i16
+ %tmp741 = fmul float %result.i18, %result.i16
+ %tmp742 = fmul float %tmp734, %tmp739
+ %tmp743 = fmul float %tmp736, %tmp740
+ %tmp744 = fmul float %tmp738, %tmp741
+ %tmp745 = fmul float %temp12.0, %tmp215
+ %tmp746 = fmul float %temp13.0, %tmp216
+ %tmp747 = fadd float %tmp746, %tmp745
+ %tmp748 = fmul float %temp14.0, %tmp217
+ %tmp749 = fadd float %tmp747, %tmp748
+ %tmp750 = call float @fabs(float %tmp749)
+ %tmp751 = fmul float %tmp750, %tmp750
+ %tmp752 = fmul float %tmp751, %tmp49
+ %tmp753 = fadd float %tmp752, %tmp50
+ %tmp754 = call float @llvm.AMDGPU.clamp.f32(float %tmp753, float 0.000000e+00, float 1.000000e+00)
+ %tmp755 = fsub float -0.000000e+00, %tmp754
+ %tmp756 = fadd float 1.000000e+00, %tmp755
+ %tmp757 = fmul float %tmp32, %tmp756
+ %tmp758 = fmul float %tmp32, %tmp756
+ %tmp759 = fmul float %tmp32, %tmp756
+ %tmp760 = fmul float %tmp33, %tmp756
+ %one.sub.a.i11 = fsub float 1.000000e+00, %tmp757
+ %one.sub.ac.i12 = fmul float %one.sub.a.i11, %tmp742
+ %mul.i13 = fmul float %tmp30, %tmp742
+ %result.i14 = fadd float %mul.i13, %one.sub.ac.i12
+ %one.sub.a.i7 = fsub float 1.000000e+00, %tmp758
+ %one.sub.ac.i8 = fmul float %one.sub.a.i7, %tmp743
+ %mul.i9 = fmul float %tmp30, %tmp743
+ %result.i10 = fadd float %mul.i9, %one.sub.ac.i8
+ %one.sub.a.i3 = fsub float 1.000000e+00, %tmp759
+ %one.sub.ac.i4 = fmul float %one.sub.a.i3, %tmp744
+ %mul.i5 = fmul float %tmp30, %tmp744
+ %result.i6 = fadd float %mul.i5, %one.sub.ac.i4
+ %one.sub.a.i1 = fsub float 1.000000e+00, %tmp760
+ %one.sub.ac.i = fmul float %one.sub.a.i1, %tmp367
+ %mul.i = fmul float %tmp31, %tmp367
+ %result.i2 = fadd float %mul.i, %one.sub.ac.i
+ %tmp761 = fcmp uge float %result.i14, 0x3E6FFFFE60000000
+ %tmp762 = select i1 %tmp761, float %result.i14, float 0x3E6FFFFE60000000
+ %tmp763 = fcmp uge float %result.i10, 0x3E6FFFFE60000000
+ %tmp764 = select i1 %tmp763, float %result.i10, float 0x3E6FFFFE60000000
+ %tmp765 = fcmp uge float %result.i6, 0x3E6FFFFE60000000
+ %tmp766 = select i1 %tmp765, float %result.i6, float 0x3E6FFFFE60000000
+ %tmp767 = fcmp uge float %tmp762, 6.550400e+04
+ %tmp768 = select i1 %tmp767, float 6.550400e+04, float %tmp762
+ %tmp769 = fcmp uge float %tmp764, 6.550400e+04
+ %tmp770 = select i1 %tmp769, float 6.550400e+04, float %tmp764
+ %tmp771 = fcmp uge float %tmp766, 6.550400e+04
+ %tmp772 = select i1 %tmp771, float 6.550400e+04, float %tmp766
+ %tmp773 = fmul float %result.i2, %tmp51
+ %tmp774 = fadd float %tmp773, %tmp52
+ %tmp775 = call float @llvm.AMDGPU.clamp.f32(float %tmp774, float 0.000000e+00, float 1.000000e+00)
+ %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770)
+ %tmp777 = bitcast i32 %tmp776 to float
+ %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %tmp775)
+ %tmp779 = bitcast i32 %tmp778 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp777, float %tmp779, float %tmp777, float %tmp779)
ret void
ELSE214: ; preds = %ELSE211
- %797 = fcmp olt float %565, %81
- %798 = sext i1 %797 to i32
- %799 = bitcast i32 %798 to float
- %800 = bitcast float %799 to i32
- %801 = icmp ne i32 %800, 0
- %.230 = select i1 %801, float %104, float %100
- %.231 = select i1 %801, float %105, float %101
- %.232 = select i1 %801, float %106, float %102
- %.233 = select i1 %801, float %107, float %103
+ %tmp780 = fcmp olt float %tmp558, %tmp80
+ %tmp781 = sext i1 %tmp780 to i32
+ %tmp782 = bitcast i32 %tmp781 to float
+ %tmp783 = bitcast float %tmp782 to i32
+ %tmp784 = icmp ne i32 %tmp783, 0
+ %.230 = select i1 %tmp784, float %tmp103, float %tmp99
+ %.231 = select i1 %tmp784, float %tmp104, float %tmp100
+ %.232 = select i1 %tmp784, float %tmp105, float %tmp101
+ %.233 = select i1 %tmp784, float %tmp106, float %tmp102
br label %ENDIF209
}
; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #2
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.lrp(float, float, float) #2
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
+
+
+declare float @llvm.exp2.f32(float) #2
; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare float @llvm.SI.load.const(<16 x i8>, i32) #2
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #2
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+
+; Function Attrs: nounwind readonly
+declare float @ceil(float) #3
+
+; Function Attrs: nounwind readnone
+declare float @llvm.amdgcn.rsq.f32(float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
; Function Attrs: readnone
-declare float @llvm.AMDGPU.cndlt(float, float, float) #2
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
; Function Attrs: readnone
-declare float @llvm.AMDIL.exp.(float) #2
+declare float @fabs(float) #1
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
+
+
+; Function Attrs: nounwind readnone
+declare float @llvm.pow.f32(float, float) #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #1 = { readnone }
+attributes #2 = { nounwind readnone }
attributes #3 = { nounwind readonly }
-attributes #4 = { readonly }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll
index 4b2d8ec6bf0a..30aa2d550f65 100644
--- a/test/CodeGen/AMDGPU/si-spill-cf.ll
+++ b/test/CodeGen/AMDGPU/si-spill-cf.ll
@@ -3,10 +3,10 @@
; If this occurs it is likely due to reordering and the restore was
; originally supposed to happen before SI_END_CF.
+
; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
; SI-NOT: v_readlane_b32 [[SAVED]]
-
-define void @main() #0 {
+define amdgpu_ps void @main() #0 {
main_body:
%0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
%1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
@@ -80,184 +80,198 @@ main_body:
LOOP: ; preds = %ENDIF2795, %main_body
%temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
%temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
- %67 = icmp sgt i32 undef, 4
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %67 = icmp sgt i32 %tid, 4
br i1 %67, label %ENDLOOP, label %ENDIF
ENDLOOP: ; preds = %ELSE2566, %LOOP
- %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00)
+ %one.sub.a.i = fsub float 1.000000e+00, %0
+ %one.sub.ac.i = fmul float %one.sub.a.i, undef
+ %result.i = fadd float fmul (float undef, float undef), %one.sub.ac.i
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %result.i, float undef, float 1.000000e+00)
ret void
ENDIF: ; preds = %LOOP
- %69 = fsub float %2, undef
- %70 = fsub float %3, undef
- %71 = fsub float %4, undef
- %72 = fmul float %69, 0.000000e+00
+ %68 = fsub float %2, undef
+ %69 = fsub float %3, undef
+ %70 = fsub float %4, undef
+ %71 = fmul float %68, 0.000000e+00
+ %72 = fmul float %69, undef
%73 = fmul float %70, undef
- %74 = fmul float %71, undef
- %75 = fsub float %6, undef
- %76 = fsub float %7, undef
- %77 = fmul float %75, undef
- %78 = fmul float %76, 0.000000e+00
- %79 = call float @llvm.minnum.f32(float %74, float %78)
- %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00)
- %81 = call float @llvm.maxnum.f32(float %73, float %77)
- %82 = call float @llvm.maxnum.f32(float undef, float %79)
- %83 = call float @llvm.minnum.f32(float %80, float %81)
- %84 = call float @llvm.minnum.f32(float %83, float undef)
- %85 = fsub float %14, undef
- %86 = fsub float %15, undef
- %87 = fsub float %16, undef
+ %74 = fsub float %6, undef
+ %75 = fsub float %7, undef
+ %76 = fmul float %74, undef
+ %77 = fmul float %75, 0.000000e+00
+ %78 = call float @llvm.minnum.f32(float %73, float %77)
+ %79 = call float @llvm.maxnum.f32(float %71, float 0.000000e+00)
+ %80 = call float @llvm.maxnum.f32(float %72, float %76)
+ %81 = call float @llvm.maxnum.f32(float undef, float %78)
+ %82 = call float @llvm.minnum.f32(float %79, float %80)
+ %83 = call float @llvm.minnum.f32(float %82, float undef)
+ %84 = fsub float %14, undef
+ %85 = fsub float %15, undef
+ %86 = fsub float %16, undef
+ %87 = fmul float %84, undef
%88 = fmul float %85, undef
%89 = fmul float %86, undef
- %90 = fmul float %87, undef
- %91 = fsub float %17, undef
- %92 = fsub float %18, undef
- %93 = fsub float %19, undef
- %94 = fmul float %91, 0.000000e+00
+ %90 = fsub float %17, undef
+ %91 = fsub float %18, undef
+ %92 = fsub float %19, undef
+ %93 = fmul float %90, 0.000000e+00
+ %94 = fmul float %91, undef
%95 = fmul float %92, undef
- %96 = fmul float %93, undef
- %97 = call float @llvm.minnum.f32(float %89, float %95)
- %98 = call float @llvm.maxnum.f32(float %88, float %94)
- %99 = call float @llvm.maxnum.f32(float %90, float %96)
- %100 = call float @llvm.maxnum.f32(float undef, float %97)
- %101 = call float @llvm.maxnum.f32(float %100, float undef)
- %102 = call float @llvm.minnum.f32(float %98, float undef)
- %103 = call float @llvm.minnum.f32(float %102, float %99)
- %104 = fsub float %30, undef
- %105 = fsub float %31, undef
+ %96 = call float @llvm.minnum.f32(float %88, float %94)
+ %97 = call float @llvm.maxnum.f32(float %87, float %93)
+ %98 = call float @llvm.maxnum.f32(float %89, float %95)
+ %99 = call float @llvm.maxnum.f32(float undef, float %96)
+ %100 = call float @llvm.maxnum.f32(float %99, float undef)
+ %101 = call float @llvm.minnum.f32(float %97, float undef)
+ %102 = call float @llvm.minnum.f32(float %101, float %98)
+ %103 = fsub float %30, undef
+ %104 = fsub float %31, undef
+ %105 = fmul float %103, 0.000000e+00
%106 = fmul float %104, 0.000000e+00
- %107 = fmul float %105, 0.000000e+00
- %108 = call float @llvm.minnum.f32(float undef, float %106)
+ %107 = call float @llvm.minnum.f32(float undef, float %105)
+ %108 = call float @llvm.maxnum.f32(float undef, float %106)
%109 = call float @llvm.maxnum.f32(float undef, float %107)
- %110 = call float @llvm.maxnum.f32(float undef, float %108)
- %111 = call float @llvm.maxnum.f32(float %110, float undef)
- %112 = call float @llvm.minnum.f32(float undef, float %109)
- %113 = fsub float %32, undef
- %114 = fsub float %33, undef
- %115 = fsub float %34, undef
- %116 = fmul float %113, 0.000000e+00
+ %110 = call float @llvm.maxnum.f32(float %109, float undef)
+ %111 = call float @llvm.minnum.f32(float undef, float %108)
+ %112 = fsub float %32, undef
+ %113 = fsub float %33, undef
+ %114 = fsub float %34, undef
+ %115 = fmul float %112, 0.000000e+00
+ %116 = fmul float %113, undef
%117 = fmul float %114, undef
- %118 = fmul float %115, undef
- %119 = fsub float %35, undef
- %120 = fsub float %36, undef
- %121 = fsub float %37, undef
+ %118 = fsub float %35, undef
+ %119 = fsub float %36, undef
+ %120 = fsub float %37, undef
+ %121 = fmul float %118, undef
%122 = fmul float %119, undef
%123 = fmul float %120, undef
- %124 = fmul float %121, undef
+ %124 = call float @llvm.minnum.f32(float %115, float %121)
%125 = call float @llvm.minnum.f32(float %116, float %122)
%126 = call float @llvm.minnum.f32(float %117, float %123)
- %127 = call float @llvm.minnum.f32(float %118, float %124)
- %128 = call float @llvm.maxnum.f32(float %125, float %126)
- %129 = call float @llvm.maxnum.f32(float %128, float %127)
- %130 = fsub float %38, undef
- %131 = fsub float %39, undef
- %132 = fsub float %40, undef
- %133 = fmul float %130, 0.000000e+00
+ %127 = call float @llvm.maxnum.f32(float %124, float %125)
+ %128 = call float @llvm.maxnum.f32(float %127, float %126)
+ %129 = fsub float %38, undef
+ %130 = fsub float %39, undef
+ %131 = fsub float %40, undef
+ %132 = fmul float %129, 0.000000e+00
+ %133 = fmul float %130, undef
%134 = fmul float %131, undef
- %135 = fmul float %132, undef
- %136 = fsub float %41, undef
- %137 = fsub float %42, undef
- %138 = fsub float %43, undef
+ %135 = fsub float %41, undef
+ %136 = fsub float %42, undef
+ %137 = fsub float %43, undef
+ %138 = fmul float %135, undef
%139 = fmul float %136, undef
%140 = fmul float %137, undef
- %141 = fmul float %138, undef
+ %141 = call float @llvm.minnum.f32(float %132, float %138)
%142 = call float @llvm.minnum.f32(float %133, float %139)
%143 = call float @llvm.minnum.f32(float %134, float %140)
- %144 = call float @llvm.minnum.f32(float %135, float %141)
- %145 = call float @llvm.maxnum.f32(float %142, float %143)
- %146 = call float @llvm.maxnum.f32(float %145, float %144)
- %147 = fsub float %44, undef
- %148 = fsub float %45, undef
- %149 = fsub float %46, undef
+ %144 = call float @llvm.maxnum.f32(float %141, float %142)
+ %145 = call float @llvm.maxnum.f32(float %144, float %143)
+ %146 = fsub float %44, undef
+ %147 = fsub float %45, undef
+ %148 = fsub float %46, undef
+ %149 = fmul float %146, 0.000000e+00
%150 = fmul float %147, 0.000000e+00
- %151 = fmul float %148, 0.000000e+00
- %152 = fmul float %149, undef
- %153 = fsub float %47, undef
- %154 = fsub float %48, undef
- %155 = fsub float %49, undef
- %156 = fmul float %153, undef
- %157 = fmul float %154, 0.000000e+00
- %158 = fmul float %155, undef
+ %151 = fmul float %148, undef
+ %152 = fsub float %47, undef
+ %153 = fsub float %48, undef
+ %154 = fsub float %49, undef
+ %155 = fmul float %152, undef
+ %156 = fmul float %153, 0.000000e+00
+ %157 = fmul float %154, undef
+ %158 = call float @llvm.minnum.f32(float %149, float %155)
%159 = call float @llvm.minnum.f32(float %150, float %156)
%160 = call float @llvm.minnum.f32(float %151, float %157)
- %161 = call float @llvm.minnum.f32(float %152, float %158)
- %162 = call float @llvm.maxnum.f32(float %159, float %160)
- %163 = call float @llvm.maxnum.f32(float %162, float %161)
- %164 = fsub float %50, undef
- %165 = fsub float %51, undef
- %166 = fsub float %52, undef
- %167 = fmul float %164, undef
+ %161 = call float @llvm.maxnum.f32(float %158, float %159)
+ %162 = call float @llvm.maxnum.f32(float %161, float %160)
+ %163 = fsub float %50, undef
+ %164 = fsub float %51, undef
+ %165 = fsub float %52, undef
+ %166 = fmul float %163, undef
+ %167 = fmul float %164, 0.000000e+00
%168 = fmul float %165, 0.000000e+00
- %169 = fmul float %166, 0.000000e+00
- %170 = fsub float %53, undef
- %171 = fsub float %54, undef
- %172 = fsub float %55, undef
- %173 = fdiv float 1.000000e+00, %temp18.0
+ %169 = fsub float %53, undef
+ %170 = fsub float %54, undef
+ %171 = fsub float %55, undef
+ %172 = fdiv float 1.000000e+00, %temp18.0
+ %173 = fmul float %169, undef
%174 = fmul float %170, undef
- %175 = fmul float %171, undef
- %176 = fmul float %172, %173
+ %175 = fmul float %171, %172
+ %176 = call float @llvm.minnum.f32(float %166, float %173)
%177 = call float @llvm.minnum.f32(float %167, float %174)
%178 = call float @llvm.minnum.f32(float %168, float %175)
- %179 = call float @llvm.minnum.f32(float %169, float %176)
- %180 = call float @llvm.maxnum.f32(float %177, float %178)
- %181 = call float @llvm.maxnum.f32(float %180, float %179)
- %182 = fsub float %62, undef
- %183 = fsub float %63, undef
- %184 = fsub float %64, undef
- %185 = fmul float %182, 0.000000e+00
+ %179 = call float @llvm.maxnum.f32(float %176, float %177)
+ %180 = call float @llvm.maxnum.f32(float %179, float %178)
+ %181 = fsub float %62, undef
+ %182 = fsub float %63, undef
+ %183 = fsub float %64, undef
+ %184 = fmul float %181, 0.000000e+00
+ %185 = fmul float %182, undef
%186 = fmul float %183, undef
- %187 = fmul float %184, undef
- %188 = fsub float %65, undef
- %189 = fsub float %66, undef
+ %187 = fsub float %65, undef
+ %188 = fsub float %66, undef
+ %189 = fmul float %187, undef
%190 = fmul float %188, undef
- %191 = fmul float %189, undef
+ %191 = call float @llvm.maxnum.f32(float %184, float %189)
%192 = call float @llvm.maxnum.f32(float %185, float %190)
- %193 = call float @llvm.maxnum.f32(float %186, float %191)
- %194 = call float @llvm.maxnum.f32(float %187, float undef)
- %195 = call float @llvm.minnum.f32(float %192, float %193)
- %196 = call float @llvm.minnum.f32(float %195, float %194)
- %.temp292.7 = select i1 undef, float %163, float undef
- %temp292.9 = select i1 false, float %181, float %.temp292.7
+ %193 = call float @llvm.maxnum.f32(float %186, float undef)
+ %194 = call float @llvm.minnum.f32(float %191, float %192)
+ %195 = call float @llvm.minnum.f32(float %194, float %193)
+ %.temp292.7 = select i1 undef, float %162, float undef
+ %temp292.9 = select i1 false, float %180, float %.temp292.7
%.temp292.9 = select i1 undef, float undef, float %temp292.9
- %197 = fcmp ogt float undef, 0.000000e+00
- %198 = fcmp olt float undef, %196
- %199 = and i1 %197, %198
- %200 = fcmp olt float undef, %.temp292.9
- %201 = and i1 %199, %200
- %temp292.11 = select i1 %201, float undef, float %.temp292.9
- br i1 undef, label %IF2565, label %ELSE2566
+ %196 = fcmp ogt float undef, 0.000000e+00
+ %197 = fcmp olt float undef, %195
+ %198 = and i1 %196, %197
+ %199 = fcmp olt float undef, %.temp292.9
+ %200 = and i1 %198, %199
+ %temp292.11 = select i1 %200, float undef, float %.temp292.9
+ %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %cmp0 = icmp eq i32 %tid0, 0
+ br i1 %cmp0, label %IF2565, label %ELSE2566
IF2565: ; preds = %ENDIF
- br i1 false, label %ENDIF2582, label %ELSE2584
+ %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %cmp1 = icmp eq i32 %tid1, 0
+ br i1 %cmp1, label %ENDIF2582, label %ELSE2584
ELSE2566: ; preds = %ENDIF
- %202 = fcmp oeq float %temp292.11, 1.000000e+04
- br i1 %202, label %ENDLOOP, label %ELSE2593
+ %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %tidf = bitcast i32 %tid2 to float
+ %201 = fcmp oeq float %temp292.11, %tidf
+ br i1 %201, label %ENDLOOP, label %ELSE2593
ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588
%temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
- %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ]
- %203 = fsub float %5, undef
- %204 = fmul float %203, undef
- %205 = call float @llvm.maxnum.f32(float undef, float %204)
+ %temp18.1 = phi float [ %218, %ENDIF2588 ], [ undef, %ENDIF2594 ]
+ %202 = fsub float %5, undef
+ %203 = fmul float %202, undef
+ %204 = call float @llvm.maxnum.f32(float undef, float %203)
+ %205 = call float @llvm.minnum.f32(float %204, float undef)
%206 = call float @llvm.minnum.f32(float %205, float undef)
- %207 = call float @llvm.minnum.f32(float %206, float undef)
- %208 = fcmp ogt float undef, 0.000000e+00
- %209 = fcmp olt float undef, 1.000000e+00
- %210 = and i1 %208, %209
- %211 = fcmp olt float undef, %207
- %212 = and i1 %210, %211
- br i1 %212, label %ENDIF2795, label %ELSE2797
+ %207 = fcmp ogt float undef, 0.000000e+00
+ %208 = fcmp olt float undef, 1.000000e+00
+ %209 = and i1 %207, %208
+ %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %tidf3 = bitcast i32 %tid3 to float
+ %210 = fcmp olt float %tidf3, %206
+ %211 = and i1 %209, %210
+ br i1 %211, label %ENDIF2795, label %ELSE2797
ELSE2584: ; preds = %IF2565
br label %ENDIF2582
ENDIF2582: ; preds = %ELSE2584, %IF2565
- %213 = fadd float %1, undef
- %214 = fadd float 0.000000e+00, %213
- %215 = call float @llvm.AMDIL.fraction.(float %214)
- br i1 undef, label %IF2589, label %ELSE2590
+ %212 = fadd float %1, undef
+ %213 = fadd float 0.000000e+00, %212
+ %floor = call float @llvm.floor.f32(float %213)
+ %214 = fsub float %213, %floor
+ %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+ %cmp4 = icmp eq i32 %tid4, 0
+ br i1 %cmp4, label %IF2589, label %ELSE2590
IF2589: ; preds = %ENDIF2582
br label %ENDIF2588
@@ -266,61 +280,61 @@ ELSE2590: ; preds = %ENDIF2582
br label %ENDIF2588
ENDIF2588: ; preds = %ELSE2590, %IF2589
- %216 = fsub float 1.000000e+00, %215
- %217 = call float @llvm.sqrt.f32(float %216)
- %218 = fmul float %217, undef
- %219 = fadd float %218, undef
+ %215 = fsub float 1.000000e+00, %214
+ %216 = call float @llvm.sqrt.f32(float %215)
+ %217 = fmul float %216, undef
+ %218 = fadd float %217, undef
br label %ENDIF2564
ELSE2593: ; preds = %ELSE2566
- %220 = fcmp oeq float %temp292.11, %82
- %221 = fcmp olt float %82, %84
- %222 = and i1 %220, %221
- br i1 %222, label %ENDIF2594, label %ELSE2596
+ %219 = fcmp oeq float %temp292.11, %81
+ %220 = fcmp olt float %81, %83
+ %221 = and i1 %219, %220
+ br i1 %221, label %ENDIF2594, label %ELSE2596
ELSE2596: ; preds = %ELSE2593
- %223 = fcmp oeq float %temp292.11, %101
- %224 = fcmp olt float %101, %103
- %225 = and i1 %223, %224
- br i1 %225, label %ENDIF2594, label %ELSE2632
+ %222 = fcmp oeq float %temp292.11, %100
+ %223 = fcmp olt float %100, %102
+ %224 = and i1 %222, %223
+ br i1 %224, label %ENDIF2594, label %ELSE2632
ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
%temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
- %226 = fmul float %temp894.2, undef
+ %225 = fmul float %temp894.2, undef
br label %ENDIF2564
ELSE2632: ; preds = %ELSE2596
br i1 undef, label %ENDIF2594, label %ELSE2650
ELSE2650: ; preds = %ELSE2632
- %227 = fcmp oeq float %temp292.11, %111
- %228 = fcmp olt float %111, %112
- %229 = and i1 %227, %228
- br i1 %229, label %IF2667, label %ELSE2668
+ %226 = fcmp oeq float %temp292.11, %110
+ %227 = fcmp olt float %110, %111
+ %228 = and i1 %226, %227
+ br i1 %228, label %IF2667, label %ELSE2668
IF2667: ; preds = %ELSE2650
br i1 undef, label %ENDIF2594, label %ELSE2671
ELSE2668: ; preds = %ELSE2650
- %230 = fcmp oeq float %temp292.11, %129
- %231 = fcmp olt float %129, undef
- %232 = and i1 %230, %231
- br i1 %232, label %ENDIF2594, label %ELSE2686
+ %229 = fcmp oeq float %temp292.11, %128
+ %230 = fcmp olt float %128, undef
+ %231 = and i1 %229, %230
+ br i1 %231, label %ENDIF2594, label %ELSE2686
ELSE2671: ; preds = %IF2667
br label %ENDIF2594
ELSE2686: ; preds = %ELSE2668
- %233 = fcmp oeq float %temp292.11, %146
- %234 = fcmp olt float %146, undef
- %235 = and i1 %233, %234
- br i1 %235, label %ENDIF2594, label %ELSE2704
+ %232 = fcmp oeq float %temp292.11, %145
+ %233 = fcmp olt float %145, undef
+ %234 = and i1 %232, %233
+ br i1 %234, label %ENDIF2594, label %ELSE2704
ELSE2704: ; preds = %ELSE2686
- %236 = fcmp oeq float %temp292.11, %181
- %237 = fcmp olt float %181, undef
- %238 = and i1 %236, %237
- br i1 %238, label %ENDIF2594, label %ELSE2740
+ %235 = fcmp oeq float %temp292.11, %180
+ %236 = fcmp olt float %180, undef
+ %237 = and i1 %235, %236
+ br i1 %237, label %ENDIF2594, label %ELSE2740
ELSE2740: ; preds = %ELSE2704
br i1 undef, label %IF2757, label %ELSE2758
@@ -335,8 +349,8 @@ ELSE2761: ; preds = %IF2757
br label %ENDIF2594
IF2775: ; preds = %ELSE2758
- %239 = fcmp olt float undef, undef
- br i1 %239, label %ENDIF2594, label %ELSE2779
+ %238 = fcmp olt float undef, undef
+ br i1 %238, label %ENDIF2594, label %ELSE2779
ELSE2779: ; preds = %IF2775
br i1 undef, label %ENDIF2594, label %ELSE2782
@@ -345,39 +359,39 @@ ELSE2782: ; preds = %ELSE2779
br i1 undef, label %ENDIF2594, label %ELSE2785
ELSE2785: ; preds = %ELSE2782
- %240 = fcmp olt float undef, 0.000000e+00
- br i1 %240, label %ENDIF2594, label %ELSE2788
+ %239 = fcmp olt float undef, 0.000000e+00
+ br i1 %239, label %ENDIF2594, label %ELSE2788
ELSE2788: ; preds = %ELSE2785
- %241 = fcmp olt float 0.000000e+00, undef
- %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00
+ %240 = fcmp olt float 0.000000e+00, undef
+ %.2848 = select i1 %240, float -1.000000e+00, float 1.000000e+00
br label %ENDIF2594
ELSE2797: ; preds = %ENDIF2564
- %242 = fsub float %8, undef
- %243 = fsub float %9, undef
- %244 = fsub float %10, undef
+ %241 = fsub float %8, undef
+ %242 = fsub float %9, undef
+ %243 = fsub float %10, undef
+ %244 = fmul float %241, undef
%245 = fmul float %242, undef
%246 = fmul float %243, undef
- %247 = fmul float %244, undef
- %248 = fsub float %11, undef
- %249 = fsub float %12, undef
- %250 = fsub float %13, undef
+ %247 = fsub float %11, undef
+ %248 = fsub float %12, undef
+ %249 = fsub float %13, undef
+ %250 = fmul float %247, undef
%251 = fmul float %248, undef
%252 = fmul float %249, undef
- %253 = fmul float %250, undef
+ %253 = call float @llvm.minnum.f32(float %244, float %250)
%254 = call float @llvm.minnum.f32(float %245, float %251)
- %255 = call float @llvm.minnum.f32(float %246, float %252)
- %256 = call float @llvm.maxnum.f32(float %247, float %253)
- %257 = call float @llvm.maxnum.f32(float %254, float %255)
- %258 = call float @llvm.maxnum.f32(float %257, float undef)
- %259 = call float @llvm.minnum.f32(float undef, float %256)
- %260 = fcmp ogt float %258, 0.000000e+00
- %261 = fcmp olt float %258, 1.000000e+00
- %262 = and i1 %260, %261
- %263 = fcmp olt float %258, %259
- %264 = and i1 %262, %263
- br i1 %264, label %ENDIF2795, label %ELSE2800
+ %255 = call float @llvm.maxnum.f32(float %246, float %252)
+ %256 = call float @llvm.maxnum.f32(float %253, float %254)
+ %257 = call float @llvm.maxnum.f32(float %256, float undef)
+ %258 = call float @llvm.minnum.f32(float undef, float %255)
+ %259 = fcmp ogt float %257, 0.000000e+00
+ %260 = fcmp olt float %257, 1.000000e+00
+ %261 = and i1 %259, %260
+ %262 = fcmp olt float %257, %258
+ %263 = and i1 %261, %262
+ br i1 %263, label %ENDIF2795, label %ELSE2800
ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
br label %LOOP
@@ -386,53 +400,53 @@ ELSE2800: ; preds = %ELSE2797
br i1 undef, label %ENDIF2795, label %ELSE2803
ELSE2803: ; preds = %ELSE2800
- %265 = fsub float %20, undef
- %266 = fsub float %21, undef
- %267 = fsub float %22, undef
+ %264 = fsub float %20, undef
+ %265 = fsub float %21, undef
+ %266 = fsub float %22, undef
+ %267 = fmul float %264, undef
%268 = fmul float %265, undef
- %269 = fmul float %266, undef
- %270 = fmul float %267, 0.000000e+00
- %271 = fsub float %23, undef
- %272 = fsub float %24, undef
- %273 = fsub float %25, undef
+ %269 = fmul float %266, 0.000000e+00
+ %270 = fsub float %23, undef
+ %271 = fsub float %24, undef
+ %272 = fsub float %25, undef
+ %273 = fmul float %270, undef
%274 = fmul float %271, undef
%275 = fmul float %272, undef
- %276 = fmul float %273, undef
- %277 = call float @llvm.minnum.f32(float %268, float %274)
+ %276 = call float @llvm.minnum.f32(float %267, float %273)
+ %277 = call float @llvm.maxnum.f32(float %268, float %274)
%278 = call float @llvm.maxnum.f32(float %269, float %275)
- %279 = call float @llvm.maxnum.f32(float %270, float %276)
- %280 = call float @llvm.maxnum.f32(float %277, float undef)
- %281 = call float @llvm.maxnum.f32(float %280, float undef)
- %282 = call float @llvm.minnum.f32(float undef, float %278)
- %283 = call float @llvm.minnum.f32(float %282, float %279)
- %284 = fcmp ogt float %281, 0.000000e+00
- %285 = fcmp olt float %281, 1.000000e+00
- %286 = and i1 %284, %285
- %287 = fcmp olt float %281, %283
- %288 = and i1 %286, %287
- br i1 %288, label %ENDIF2795, label %ELSE2806
+ %279 = call float @llvm.maxnum.f32(float %276, float undef)
+ %280 = call float @llvm.maxnum.f32(float %279, float undef)
+ %281 = call float @llvm.minnum.f32(float undef, float %277)
+ %282 = call float @llvm.minnum.f32(float %281, float %278)
+ %283 = fcmp ogt float %280, 0.000000e+00
+ %284 = fcmp olt float %280, 1.000000e+00
+ %285 = and i1 %283, %284
+ %286 = fcmp olt float %280, %282
+ %287 = and i1 %285, %286
+ br i1 %287, label %ENDIF2795, label %ELSE2806
ELSE2806: ; preds = %ELSE2803
- %289 = fsub float %26, undef
- %290 = fsub float %27, undef
- %291 = fsub float %28, undef
- %292 = fmul float %289, undef
- %293 = fmul float %290, 0.000000e+00
- %294 = fmul float %291, undef
- %295 = fsub float %29, undef
- %296 = fmul float %295, undef
- %297 = call float @llvm.minnum.f32(float %292, float %296)
- %298 = call float @llvm.minnum.f32(float %293, float undef)
- %299 = call float @llvm.maxnum.f32(float %294, float undef)
- %300 = call float @llvm.maxnum.f32(float %297, float %298)
- %301 = call float @llvm.maxnum.f32(float %300, float undef)
- %302 = call float @llvm.minnum.f32(float undef, float %299)
- %303 = fcmp ogt float %301, 0.000000e+00
- %304 = fcmp olt float %301, 1.000000e+00
- %305 = and i1 %303, %304
- %306 = fcmp olt float %301, %302
- %307 = and i1 %305, %306
- br i1 %307, label %ENDIF2795, label %ELSE2809
+ %288 = fsub float %26, undef
+ %289 = fsub float %27, undef
+ %290 = fsub float %28, undef
+ %291 = fmul float %288, undef
+ %292 = fmul float %289, 0.000000e+00
+ %293 = fmul float %290, undef
+ %294 = fsub float %29, undef
+ %295 = fmul float %294, undef
+ %296 = call float @llvm.minnum.f32(float %291, float %295)
+ %297 = call float @llvm.minnum.f32(float %292, float undef)
+ %298 = call float @llvm.maxnum.f32(float %293, float undef)
+ %299 = call float @llvm.maxnum.f32(float %296, float %297)
+ %300 = call float @llvm.maxnum.f32(float %299, float undef)
+ %301 = call float @llvm.minnum.f32(float undef, float %298)
+ %302 = fcmp ogt float %300, 0.000000e+00
+ %303 = fcmp olt float %300, 1.000000e+00
+ %304 = and i1 %302, %303
+ %305 = fcmp olt float %300, %301
+ %306 = and i1 %304, %305
+ br i1 %306, label %ENDIF2795, label %ELSE2809
ELSE2809: ; preds = %ELSE2806
br i1 undef, label %ENDIF2795, label %ELSE2812
@@ -447,40 +461,42 @@ ELSE2818: ; preds = %ELSE2815
br i1 undef, label %ENDIF2795, label %ELSE2821
ELSE2821: ; preds = %ELSE2818
- %308 = fsub float %56, undef
- %309 = fsub float %57, undef
- %310 = fsub float %58, undef
- %311 = fmul float %308, undef
- %312 = fmul float %309, 0.000000e+00
- %313 = fmul float %310, undef
- %314 = fsub float %59, undef
- %315 = fsub float %60, undef
- %316 = fsub float %61, undef
+ %307 = fsub float %56, undef
+ %308 = fsub float %57, undef
+ %309 = fsub float %58, undef
+ %310 = fmul float %307, undef
+ %311 = fmul float %308, 0.000000e+00
+ %312 = fmul float %309, undef
+ %313 = fsub float %59, undef
+ %314 = fsub float %60, undef
+ %315 = fsub float %61, undef
+ %316 = fmul float %313, undef
%317 = fmul float %314, undef
%318 = fmul float %315, undef
- %319 = fmul float %316, undef
+ %319 = call float @llvm.maxnum.f32(float %310, float %316)
%320 = call float @llvm.maxnum.f32(float %311, float %317)
%321 = call float @llvm.maxnum.f32(float %312, float %318)
- %322 = call float @llvm.maxnum.f32(float %313, float %319)
- %323 = call float @llvm.minnum.f32(float %320, float %321)
- %324 = call float @llvm.minnum.f32(float %323, float %322)
- %325 = fcmp ogt float undef, 0.000000e+00
- %326 = fcmp olt float undef, 1.000000e+00
- %327 = and i1 %325, %326
- %328 = fcmp olt float undef, %324
- %329 = and i1 %327, %328
- br i1 %329, label %ENDIF2795, label %ELSE2824
+ %322 = call float @llvm.minnum.f32(float %319, float %320)
+ %323 = call float @llvm.minnum.f32(float %322, float %321)
+ %324 = fcmp ogt float undef, 0.000000e+00
+ %325 = fcmp olt float undef, 1.000000e+00
+ %326 = and i1 %324, %325
+ %327 = fcmp olt float undef, %323
+ %328 = and i1 %326, %327
+ br i1 %328, label %ENDIF2795, label %ELSE2824
ELSE2824: ; preds = %ELSE2821
%.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
br label %ENDIF2795
}
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-; Function Attrs: readnone
-declare float @llvm.AMDIL.fraction.(float) #2
+; Function Attrs: nounwind readnone
+declare float @llvm.floor.f32(float) #1
; Function Attrs: nounwind readnone
declare float @llvm.sqrt.f32(float) #1
@@ -491,11 +507,7 @@ declare float @llvm.minnum.f32(float, float) #1
; Function Attrs: nounwind readnone
declare float @llvm.maxnum.f32(float, float) #1
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.lrp(float, float, float) #2
-
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
diff --git a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
new file mode 100644
index 000000000000..5171406469ab
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck %s
+
+; Make sure this doesn't crash.
+; CHECK: {{^}}test:
+; Make sure we are handling hazards correctly.
+; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
+; CHECK-NEXT: s_nop 4
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
+; CHECK: s_endpgm
+define void @test(i32 addrspace(1)* %out, i32 %in) {
+ call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
+ call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
+ call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
+ call void asm sideeffect "", "~{SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}" ()
+ call void asm sideeffect "", "~{SGPR32_SGPR33_SGPR34_SGPR35_SGPR36_SGPR37_SGPR38_SGPR39}" ()
+ call void asm sideeffect "", "~{SGPR40_SGPR41_SGPR42_SGPR43_SGPR44_SGPR45_SGPR46_SGPR47}" ()
+ call void asm sideeffect "", "~{SGPR48_SGPR49_SGPR50_SGPR51_SGPR52_SGPR53_SGPR54_SGPR55}" ()
+ call void asm sideeffect "", "~{SGPR56_SGPR57_SGPR58_SGPR59_SGPR60_SGPR61_SGPR62_SGPR63}" ()
+ call void asm sideeffect "", "~{SGPR64_SGPR65_SGPR66_SGPR67_SGPR68_SGPR69_SGPR70_SGPR71}" ()
+ call void asm sideeffect "", "~{SGPR72_SGPR73_SGPR74_SGPR75_SGPR76_SGPR77_SGPR78_SGPR79}" ()
+ call void asm sideeffect "", "~{SGPR80_SGPR81_SGPR82_SGPR83_SGPR84_SGPR85_SGPR86_SGPR87}" ()
+ call void asm sideeffect "", "~{SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95}" ()
+ call void asm sideeffect "", "~{VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}" ()
+ call void asm sideeffect "", "~{VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}" ()
+ call void asm sideeffect "", "~{VGPR16_VGPR17_VGPR18_VGPR19_VGPR20_VGPR21_VGPR22_VGPR23}" ()
+ call void asm sideeffect "", "~{VGPR24_VGPR25_VGPR26_VGPR27_VGPR28_VGPR29_VGPR30_VGPR31}" ()
+ call void asm sideeffect "", "~{VGPR32_VGPR33_VGPR34_VGPR35_VGPR36_VGPR37_VGPR38_VGPR39}" ()
+ call void asm sideeffect "", "~{VGPR40_VGPR41_VGPR42_VGPR43_VGPR44_VGPR45_VGPR46_VGPR47}" ()
+ call void asm sideeffect "", "~{VGPR48_VGPR49_VGPR50_VGPR51_VGPR52_VGPR53_VGPR54_VGPR55}" ()
+ call void asm sideeffect "", "~{VGPR56_VGPR57_VGPR58_VGPR59_VGPR60_VGPR61_VGPR62_VGPR63}" ()
+ call void asm sideeffect "", "~{VGPR64_VGPR65_VGPR66_VGPR67_VGPR68_VGPR69_VGPR70_VGPR71}" ()
+ call void asm sideeffect "", "~{VGPR72_VGPR73_VGPR74_VGPR75_VGPR76_VGPR77_VGPR78_VGPR79}" ()
+ call void asm sideeffect "", "~{VGPR80_VGPR81_VGPR82_VGPR83_VGPR84_VGPR85_VGPR86_VGPR87}" ()
+ call void asm sideeffect "", "~{VGPR88_VGPR89_VGPR90_VGPR91_VGPR92_VGPR93_VGPR94_VGPR95}" ()
+ call void asm sideeffect "", "~{VGPR96_VGPR97_VGPR98_VGPR99_VGPR100_VGPR101_VGPR102_VGPR103}" ()
+ call void asm sideeffect "", "~{VGPR104_VGPR105_VGPR106_VGPR107_VGPR108_VGPR109_VGPR110_VGPR111}" ()
+ call void asm sideeffect "", "~{VGPR112_VGPR113_VGPR114_VGPR115_VGPR116_VGPR117_VGPR118_VGPR119}" ()
+ call void asm sideeffect "", "~{VGPR120_VGPR121_VGPR122_VGPR123_VGPR124_VGPR125_VGPR126_VGPR127}" ()
+ call void asm sideeffect "", "~{VGPR128_VGPR129_VGPR130_VGPR131_VGPR132_VGPR133_VGPR134_VGPR135}" ()
+ call void asm sideeffect "", "~{VGPR136_VGPR137_VGPR138_VGPR139_VGPR140_VGPR141_VGPR142_VGPR143}" ()
+ call void asm sideeffect "", "~{VGPR144_VGPR145_VGPR146_VGPR147_VGPR148_VGPR149_VGPR150_VGPR151}" ()
+ call void asm sideeffect "", "~{VGPR152_VGPR153_VGPR154_VGPR155_VGPR156_VGPR157_VGPR158_VGPR159}" ()
+ call void asm sideeffect "", "~{VGPR160_VGPR161_VGPR162_VGPR163_VGPR164_VGPR165_VGPR166_VGPR167}" ()
+ call void asm sideeffect "", "~{VGPR168_VGPR169_VGPR170_VGPR171_VGPR172_VGPR173_VGPR174_VGPR175}" ()
+ call void asm sideeffect "", "~{VGPR176_VGPR177_VGPR178_VGPR179_VGPR180_VGPR181_VGPR182_VGPR183}" ()
+ call void asm sideeffect "", "~{VGPR184_VGPR185_VGPR186_VGPR187_VGPR188_VGPR189_VGPR190_VGPR191}" ()
+ call void asm sideeffect "", "~{VGPR192_VGPR193_VGPR194_VGPR195_VGPR196_VGPR197_VGPR198_VGPR199}" ()
+ call void asm sideeffect "", "~{VGPR200_VGPR201_VGPR202_VGPR203_VGPR204_VGPR205_VGPR206_VGPR207}" ()
+ call void asm sideeffect "", "~{VGPR208_VGPR209_VGPR210_VGPR211_VGPR212_VGPR213_VGPR214_VGPR215}" ()
+ call void asm sideeffect "", "~{VGPR216_VGPR217_VGPR218_VGPR219_VGPR220_VGPR221_VGPR222_VGPR223}" ()
+ call void asm sideeffect "", "~{VGPR224_VGPR225_VGPR226_VGPR227_VGPR228_VGPR229_VGPR230_VGPR231}" ()
+ call void asm sideeffect "", "~{VGPR232_VGPR233_VGPR234_VGPR235_VGPR236_VGPR237_VGPR238_VGPR239}" ()
+ call void asm sideeffect "", "~{VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247}" ()
+ call void asm sideeffect "", "~{VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255}" ()
+
+ store i32 %in, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index bc766dbcac67..0e9618523e32 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -2,7 +2,7 @@
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-declare void @llvm.AMDGPU.barrier.local() #2
+declare void @llvm.amdgcn.s.barrier() #1
@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
@@ -10,14 +10,13 @@ declare void @llvm.AMDGPU.barrier.local() #2
@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
; FUNC-LABEL: @reorder_local_load_global_store_local_load
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
; CI: buffer_store_dword
define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
store i32 99, i32 addrspace(1)* %gptr, align 4
@@ -32,12 +31,12 @@ define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out,
; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
; CI: buffer_store_dword
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
store volatile i32 99, i32 addrspace(1)* %gptr, align 4
@@ -51,17 +50,17 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
; CI: buffer_store_dword
define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
store i32 99, i32 addrspace(1)* %gptr, align 4
- call void @llvm.AMDGPU.barrier.local() #2
+ call void @llvm.amdgcn.s.barrier() #1
%tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
%add = add nsw i32 %tmp1, %tmp2
@@ -70,19 +69,18 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
ret void
}
-; Technically we could reorder these, but just comparing the
-; instruction type of the load is insufficient.
-
-; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_store_dword
-; CI: buffer_load_dword
+; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
+; CI-DAG: buffer_store_dword
+; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
; CI: buffer_store_dword
-define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
%ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3
%tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
store i32 99, i32 addrspace(1)* %gptr, align 4
@@ -95,15 +93,17 @@ define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1
}
; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
; CI: ds_write_b32
; CI: buffer_store_dword
define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
%ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3
%tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
store i32 99, i32 addrspace(3)* %lptr, align 4
@@ -142,7 +142,7 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32
; CI: buffer_store_dword
define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
- %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3
%tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
store i32 99, i32 addrspace(3)* %lptr, align 4
@@ -155,17 +155,15 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
}
; FUNC-LABEL: @reorder_local_offsets
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
; CI: buffer_store_dword
; CI: s_endpgm
define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
- %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101
+ %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102
store i32 123, i32 addrspace(3)* %ptr1, align 4
%tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
@@ -181,18 +179,17 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
}
; FUNC-LABEL: @reorder_global_offsets
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
; CI: s_endpgm
define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
%ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
- %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101
+ %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102
store i32 123, i32 addrspace(1)* %ptr1, align 4
%tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
@@ -211,7 +208,7 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
; XCI: TBUFFER_STORE_FORMAT
; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
-; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 {
+; define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 {
; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -232,6 +229,5 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
; ret void
; }
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { nounwind convergent }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind convergent }
diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll
index bd427dd3ed46..c7d85a0340cc 100644
--- a/test/CodeGen/AMDGPU/si-vector-hang.ll
+++ b/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -90,7 +90,7 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
index 06bee114c23a..30e6bd1e78f2 100644
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; SI-LABEL: {{^}}s_sext_i1_to_i32:
-; SI: v_cndmask_b32_e64
-; SI: s_endpgm
+; GCN-LABEL: {{^}}s_sext_i1_to_i32:
+; GCN: v_cndmask_b32_e64
+; GCN: s_endpgm
define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i32
@@ -11,9 +11,9 @@ define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
ret void
}
-; SI-LABEL: {{^}}test_s_sext_i32_to_i64:
-; SI: s_ashr_i32
-; SI: s_endpg
+; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
+; GCN: s_ashr_i32
+; GCN: s_endpgm
define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
entry:
%mul = mul i32 %a, %b
@@ -23,11 +23,11 @@ entry:
ret void
}
-; SI-LABEL: {{^}}s_sext_i1_to_i64:
-; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
-; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
-; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
-; SI: s_endpgm
+; GCN-LABEL: {{^}}s_sext_i1_to_i64:
+; GCN: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
+; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
+; GCN: s_endpgm
define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i64
@@ -35,18 +35,18 @@ define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
ret void
}
-; SI-LABEL: {{^}}s_sext_i32_to_i64:
-; SI: s_ashr_i32
-; SI: s_endpgm
+; GCN-LABEL: {{^}}s_sext_i32_to_i64:
+; GCN: s_ashr_i32
+; GCN: s_endpgm
define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
%sext = sext i32 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}v_sext_i32_to_i64:
-; SI: v_ashr
-; SI: s_endpgm
+; GCN-LABEL: {{^}}v_sext_i32_to_i64:
+; GCN: v_ashr
+; GCN: s_endpgm
define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%sext = sext i32 %val to i64
@@ -54,10 +54,112 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no
ret void
}
-; SI-LABEL: {{^}}s_sext_i16_to_i64:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}s_sext_i16_to_i64:
+; GCN: s_endpgm
define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
%sext = sext i16 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
+
+; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]
+; GCN-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
+; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010
+; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24
+
+; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]]
+; GCN-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
+; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]]
+; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]]
+
+; GCN-DAG: buffer_store_dword [[VEXT0]]
+; GCN-DAG: buffer_store_dword [[VEXT1]]
+; GCN-DAG: buffer_store_dword [[VEXT2]]
+; GCN-DAG: buffer_store_dword [[VEXT3]]
+
+; GCN: s_endpgm
+define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %cast = bitcast i32 %a to <4 x i8>
+ %ext = sext <4 x i8> %cast to <4 x i32>
+ %elt0 = extractelement <4 x i32> %ext, i32 0
+ %elt1 = extractelement <4 x i32> %ext, i32 1
+ %elt2 = extractelement <4 x i32> %ext, i32 2
+ %elt3 = extractelement <4 x i32> %ext, i32 3
+ store volatile i32 %elt0, i32 addrspace(1)* %out
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ store volatile i32 %elt2, i32 addrspace(1)* %out
+ store volatile i32 %elt3, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
+; GCN-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
+; GCN-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
+; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]]
+
+; GCN: buffer_store_dword [[EXT0]]
+; GCN: buffer_store_dword [[EXT1]]
+; GCN: buffer_store_dword [[EXT2]]
+; GCN: buffer_store_dword [[EXT3]]
+define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %a = load i32, i32 addrspace(1)* %in
+ %cast = bitcast i32 %a to <4 x i8>
+ %ext = sext <4 x i8> %cast to <4 x i32>
+ %elt0 = extractelement <4 x i32> %ext, i32 0
+ %elt1 = extractelement <4 x i32> %ext, i32 1
+ %elt2 = extractelement <4 x i32> %ext, i32 2
+ %elt3 = extractelement <4 x i32> %ext, i32 3
+ store volatile i32 %elt0, i32 addrspace(1)* %out
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ store volatile i32 %elt2, i32 addrspace(1)* %out
+ store volatile i32 %elt3, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: s_bfe_i64
+; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
+; GCN-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
+; GCN-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN-DAG: s_sext_i32_i16
+; GCN-DAG: s_sext_i32_i16
+; GCN: s_endpgm
+define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
+ %cast = bitcast i64 %a to <4 x i16>
+ %ext = sext <4 x i16> %cast to <4 x i32>
+ %elt0 = extractelement <4 x i32> %ext, i32 0
+ %elt1 = extractelement <4 x i32> %ext, i32 1
+ %elt2 = extractelement <4 x i32> %ext, i32 2
+ %elt3 = extractelement <4 x i32> %ext, i32 3
+ store volatile i32 %elt0, i32 addrspace(1)* %out
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ store volatile i32 %elt2, i32 addrspace(1)* %out
+ store volatile i32 %elt3, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
+; SI-DAG: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 48
+; VI-DAG: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 48, v{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: s_endpgm
+define void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %a = load i64, i64 addrspace(1)* %in
+ %cast = bitcast i64 %a to <4 x i16>
+ %ext = sext <4 x i16> %cast to <4 x i32>
+ %elt0 = extractelement <4 x i32> %ext, i32 0
+ %elt1 = extractelement <4 x i32> %ext, i32 1
+ %elt2 = extractelement <4 x i32> %ext, i32 2
+ %elt3 = extractelement <4 x i32> %ext, i32 3
+ store volatile i32 %elt0, i32 addrspace(1)* %out
+ store volatile i32 %elt1, i32 addrspace(1)* %out
+ store volatile i32 %elt2, i32 addrspace(1)* %out
+ store volatile i32 %elt3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index a94ccc32e61c..7c58f2d906d4 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: {{^}}sint_to_fp_i32_to_f64
; SI: v_cvt_f64_i32_e32
@@ -10,14 +10,14 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
ret void
}
-; FIXME: select on 0, 0
-; SI-LABEL: {{^}}sint_to_fp_i1_f64:
-; SI: v_cmp_eq_i32_e64 vcc,
; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
; uses an SGPR (implicit vcc).
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
-; SI: buffer_store_dwordx2
+
+; SI-LABEL: {{^}}sint_to_fp_i1_f64:
+; SI-DAG: v_cmp_eq_i32_e64 vcc,
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; SI: s_endpgm
define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
%cmp = icmp eq i32 %in, 0
@@ -52,7 +52,7 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %gep, align 8
%result = sitofp i64 %val to double
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index 138b93b16d8d..16eae1899ec0 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
@@ -13,8 +13,7 @@ define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
; FUNC-LABEL: {{^}}v_sint_to_fp_i64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2
-; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 63
-; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\]}}, 63, {{v\[[0-9]+:[0-9]+\]}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
; GCN: v_xor_b32
; GCN: v_ffbh_u32
@@ -26,10 +25,10 @@ define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
; GCN-DAG: v_cmp_lt_u64
; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 [[SIGN_SEL:v[0-9]+]],
-; GCN: {{buffer|flat}}_store_dword [[SIGN_SEL]]
+; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
+; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]]
define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %in.gep
@@ -47,7 +46,7 @@ define void @s_sint_to_fp_v2i64(<2 x float> addrspace(1)* %out, <2 x i64> %in) #
; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64:
define void @v_sint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
%out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
%value = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep
@@ -56,7 +55,7 @@ define void @v_sint_to_fp_v4i64(<4 x float> addrspace(1)* %out, <4 x i64> addrsp
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll
index 851085c9535d..75ffdd2cc85a 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -103,7 +103,7 @@ define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
; SI: v_cmp_eq_i32
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
-; SI: {{buffer|flat}}_store_dword [[RESULT]],
+; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
; SI: s_endpgm
define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
new file mode 100644
index 000000000000..10187f6125d6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -0,0 +1,390 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
+ call void @llvm.AMDGPU.kill(float 0.0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
+ call void @llvm.AMDGPU.kill(float -0.0)
+ ret void
+}
+
+; FIXME: Ideally only one would be emitted
+; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: ; BB#2:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
+ call void @llvm.AMDGPU.kill(float -0.0)
+ call void @llvm.AMDGPU.kill(float -1.0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_var:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
+ call void @llvm.AMDGPU.kill(float %x)
+ ret void
+}
+
+; FIXME: Ideally only one would be emitted
+; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#2:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
+ call void @llvm.AMDGPU.kill(float %x)
+ call void @llvm.AMDGPU.kill(float %x)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v1
+; CHECK-NEXT: ; BB#2:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
+ call void @llvm.AMDGPU.kill(float %x)
+ call void @llvm.AMDGPU.kill(float %y)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: ; BB#2:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
+ call void @llvm.AMDGPU.kill(float %x)
+ %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={VGPR7}"()
+ call void @llvm.AMDGPU.kill(float %y)
+ ret void
+}
+
+; FIXME: why does the skip depend on the asm length in the same block?
+
+; CHECK-LABEL: {{^}}test_kill_control_flow:
+; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#1:
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: ; BB#3:
+; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
+; CHECK-NEXT: s_endpgm
+
+; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
+entry:
+ %cmp = icmp eq i32 %arg, 0
+ br i1 %cmp, label %bb, label %exit
+
+bb:
+ %var = call float asm sideeffect "
+ v_mov_b32_e64 v7, -1
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64", "={VGPR7}"()
+ call void @llvm.AMDGPU.kill(float %var)
+ br label %exit
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
+; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#1: ; %bb
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: ;;#ASMEND
+; CHECK: v_mov_b32_e64 v8, -1
+; CHECK: ;;#ASMEND
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#4:
+; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
+; CHECK-NEXT: s_endpgm
+
+; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
+; CHECK: buffer_store_dword v8
+; CHECK: v_mov_b32_e64 v9, -2
+
+; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}:
+; CHECK: buffer_store_dword v9
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
+entry:
+ %cmp = icmp eq i32 %arg, 0
+ br i1 %cmp, label %bb, label %exit
+
+bb:
+ %var = call float asm sideeffect "
+ v_mov_b32_e64 v7, -1
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64", "={VGPR7}"()
+ %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={VGPR8}"()
+ call void @llvm.AMDGPU.kill(float %var)
+ store volatile float %live.across, float addrspace(1)* undef
+ %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={VGPR9}"()
+ br label %exit
+
+exit:
+ %phi = phi float [ 0.0, %entry ], [ %live.out, %bb ]
+ store float %phi, float addrspace(1)* undef
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_divergent_loop:
+; CHECK: v_cmp_eq_i32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
+; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: ; mask branch [[EXIT]]
+
+; CHECK: [[LOOP_BB:BB[0-9]+_[0-9]+]]:
+
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+
+; CHECK-NEXT: ; BB#3:
+; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
+; CHECK: v_cmp_eq_i32_e32 vcc, 0, [[LOAD]]
+; CHECK-NEXT: s_and_b64 vcc, exec, vcc
+; CHECK-NEXT: s_cbranch_vccnz [[LOOP_BB]]
+
+; CHECK-NEXT: {{^}}[[EXIT]]:
+; CHECK: s_or_b64 exec, exec, [[SAVEEXEC]]
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
+entry:
+ %cmp = icmp eq i32 %arg, 0
+ br i1 %cmp, label %bb, label %exit
+
+bb:
+ %var = call float asm sideeffect "
+ v_mov_b32_e64 v7, -1
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64
+ v_nop_e64", "={VGPR7}"()
+ call void @llvm.AMDGPU.kill(float %var)
+ %vgpr = load volatile i32, i32 addrspace(1)* undef
+ %loop.cond = icmp eq i32 %vgpr, 0
+ br i1 %loop.cond, label %bb, label %exit
+
+exit:
+ store volatile i32 8, i32 addrspace(1)* undef
+ ret void
+}
+
+; bug 28550
+; CHECK-LABEL: {{^}}phi_use_def_before_kill:
+; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
+; CHECK: v_cmpx_le_f32_e32 vcc, 0,
+; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: exp
+; CHECK-NEXT: s_endpgm
+
+; CHECK: [[KILLBB:BB[0-9]+_[0-9]+]]:
+; CHECK: s_and_b64 vcc, exec,
+; CHECK-NEXT: s_cbranch_vccz [[PHIBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[PHIBB]]:
+; CHECK: v_cmp_eq_f32_e32 vcc, 0, [[PHIREG]]
+; CHECK: s_and_b64 vcc, exec, vcc
+; CHECK: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: ; BB#3: ; %bb10
+; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9
+; CHECK: buffer_store_dword
+
+; CHECK: [[ENDBB]]:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @phi_use_def_before_kill() #0 {
+bb:
+ %tmp = fadd float undef, 1.000000e+00
+ %tmp1 = fcmp olt float 0.000000e+00, %tmp
+ %tmp2 = select i1 %tmp1, float -1.000000e+00, float 0.000000e+00
+ call void @llvm.AMDGPU.kill(float %tmp2)
+ br i1 undef, label %phibb, label %bb8
+
+phibb:
+ %tmp5 = phi float [ %tmp2, %bb ], [ 4.0, %bb8 ]
+ %tmp6 = fcmp oeq float %tmp5, 0.000000e+00
+ br i1 %tmp6, label %bb10, label %end
+
+bb8:
+ store volatile i32 8, i32 addrspace(1)* undef
+ br label %phibb
+
+bb10:
+ store volatile i32 9, i32 addrspace(1)* undef
+ br label %end
+
+end:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}no_skip_no_successors:
+; CHECK: v_cmp_nle_f32
+; CHECK: s_and_b64 vcc, exec,
+; CHECK: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]]
+
+; CHECK: ; BB#3: ; %bb6
+; CHECK: s_mov_b64 exec, 0
+
+; CHECK: [[SKIPKILL]]:
+; CHECK: v_cmp_nge_f32
+; CHECK: s_and_b64 vcc, exec, vcc
+; CHECK: s_cbranch_vccz [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[UNREACHABLE]]:
+; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
+define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
+bb:
+ %tmp = fcmp ult float %arg1, 0.000000e+00
+ %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
+ br i1 %tmp, label %bb6, label %bb3
+
+bb3: ; preds = %bb
+ br i1 %tmp2, label %bb5, label %bb4
+
+bb4: ; preds = %bb3
+ br i1 true, label %bb5, label %bb7
+
+bb5: ; preds = %bb4, %bb3
+ unreachable
+
+bb6: ; preds = %bb
+ call void @llvm.AMDGPU.kill(float -1.000000e+00)
+ unreachable
+
+bb7: ; preds = %bb4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}if_after_kill_block:
+; CHECK: ; BB#0:
+; CHECK: s_and_saveexec_b64
+; CHECK: s_xor_b64
+; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: v_cmpx_le_f32_e32 vcc, 0,
+; CHECK: [[BB4]]:
+; CHECK: s_or_b64 exec, exec
+; CHECK: image_sample_c
+
+; CHECK: v_cmp_neq_f32_e32 vcc, 0,
+; CHECK: s_and_b64 exec, exec,
+; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
+; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
+; CHECK-NOT: branch
+
+; CHECK: ; BB#3: ; %bb8
+; CHECK: buffer_store_dword
+
+; CHECK: [[END]]:
+; CHECK: s_or_b64 exec, exec
+; CHECK: s_endpgm
+define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x i32> %arg2) #0 {
+bb:
+ %tmp = fcmp ult float %arg1, 0.000000e+00
+ br i1 %tmp, label %bb3, label %bb4
+
+bb3: ; preds = %bb
+ call void @llvm.AMDGPU.kill(float %arg)
+ br label %bb4
+
+bb4: ; preds = %bb3, %bb
+ %tmp5 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp6 = extractelement <4 x float> %tmp5, i32 0
+ %tmp7 = fcmp une float %tmp6, 0.000000e+00
+ br i1 %tmp7, label %bb8, label %bb9
+
+bb8: ; preds = %bb9, %bb4
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+bb9: ; preds = %bb4
+ ret void
+}
+
+declare void @llvm.AMDGPU.kill(float) #0
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll
new file mode 100644
index 000000000000..9b977fc54630
--- /dev/null
+++ b/test/CodeGen/AMDGPU/smed3.ll
@@ -0,0 +1,449 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
+; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp slt i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32:
+; GCN: v_max_i32
+; GCN: v_min_i32
+define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp slt i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store volatile i32 %i0, i32 addrspace(1)* %outgep
+ store volatile i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32:
+; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i32 %a, 17
+ %i0 = select i1 %icmp0, i32 %a, i32 17
+
+ %icmp1 = icmp slt i32 %i0, 12
+ %i1 = select i1 %icmp1, i32 %i0, i32 12
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32:
+; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp slt i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64:
+; GCN: v_cmp_lt_i64
+; GCN: v_cmp_gt_i64
+define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i64 %a, 12
+ %i0 = select i1 %icmp0, i64 %a, i64 12
+
+ %icmp1 = icmp slt i64 %i0, 17
+ %i1 = select i1 %icmp1, i64 %i0, i64 17
+
+ store i64 %i1, i64 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
+; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i16 %a, 12
+ %i0 = select i1 %icmp0, i16 %a, i16 12
+
+ %icmp1 = icmp slt i16 %i0, 17
+ %i1 = select i1 %icmp1, i16 %i0, i16 17
+
+ store i16 %i1, i16 addrspace(1)* %outgep
+ ret void
+}
+
+
+define internal i32 @smin(i32 %x, i32 %y) #2 {
+ %cmp = icmp slt i32 %x, %y
+ %sel = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %sel
+}
+
+define internal i32 @smax(i32 %x, i32 %y) #2 {
+ %cmp = icmp sgt i32 %x, %y
+ %sel = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %sel
+}
+
+define internal i16 @smin16(i16 %x, i16 %y) #2 {
+ %cmp = icmp slt i16 %x, %y
+ %sel = select i1 %cmp, i16 %x, i16 %y
+ ret i16 %sel
+}
+
+define internal i16 @smax16(i16 %x, i16 %y) #2 {
+ %cmp = icmp sgt i16 %x, %y
+ %sel = select i1 %cmp, i16 %x, i16 %y
+ ret i16 %sel
+}
+
+define internal i8 @smin8(i8 %x, i8 %y) #2 {
+ %cmp = icmp slt i8 %x, %y
+ %sel = select i1 %cmp, i8 %x, i8 %y
+ ret i8 %sel
+}
+
+define internal i8 @smax8(i8 %x, i8 %y) #2 {
+ %cmp = icmp sgt i8 %x, %y
+ %sel = select i1 %cmp, i8 %x, i8 %y
+ ret i8 %sel
+}
+
+; 16 combinations
+
+; 0: max(min(x, y), min(max(x, y), z))
+; 1: max(min(x, y), min(max(y, x), z))
+; 2: max(min(x, y), min(z, max(x, y)))
+; 3: max(min(x, y), min(z, max(y, x)))
+; 4: max(min(y, x), min(max(x, y), z))
+; 5: max(min(y, x), min(max(y, x), z))
+; 6: max(min(y, x), min(z, max(x, y)))
+; 7: max(min(y, x), min(z, max(y, x)))
+;
+; + commute outermost max
+
+
+; FIXME: In these cases we probably should have used scalar operations
+; instead.
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_12:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %y, i32 %x)
+ %tmp1 = call i32 @smax(i32 %y, i32 %x)
+ %tmp2 = call i32 @smin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @smax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0:
+; GCN: s_sext_i32_i16
+; GCN: s_sext_i32_i16
+; GCN: s_sext_i32_i16
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+bb:
+ %tmp0 = call i16 @smin16(i16 %x, i16 %y)
+ %tmp1 = call i16 @smax16(i16 %x, i16 %y)
+ %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
+ %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
+ store i16 %tmp3, i16 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i8_pat_0:
+; GCN: s_sext_i32_i8
+; GCN: s_sext_i32_i8
+; GCN: s_sext_i32_i8
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+bb:
+ %tmp0 = call i8 @smin8(i8 %x, i8 %y)
+ %tmp1 = call i8 @smax8(i8 %x, i8 %y)
+ %tmp2 = call i8 @smin8(i8 %tmp1, i8 %z)
+ %tmp3 = call i8 @smax8(i8 %tmp0, i8 %tmp2)
+ store i8 %tmp3, i8 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_0:
+; GCN-NOT: v_med3_i32
+define void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp0, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_1:
+; GCN-NOT: v_med3_i32
+define void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp1, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_2:
+; GCN-NOT: v_med3_i32
+define void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp2, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result:
+; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @smin(i32 %x, i32 %y)
+ %tmp1 = call i32 @smax(i32 %x, i32 %y)
+ %tmp2 = call i32 @smin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @smax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone alwaysinline }
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
index e646605f7da1..560d5597baa9 100644
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -1,9 +1,12 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}s_abs_i32:
; GCN: s_abs_i32
; GCN: s_add_i32
+
+; EG: MAX_INT
define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
%neg = sub i32 0, %val
%cond = icmp sgt i32 %val, %neg
@@ -17,6 +20,8 @@ define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]]
; GCN: v_add_i32
+
+; EG: MAX_INT
define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
%val = load i32, i32 addrspace(1)* %src, align 4
%neg = sub i32 0, %val
@@ -32,6 +37,9 @@ define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind
; GCN: s_abs_i32
; GCN: s_add_i32
; GCN: s_add_i32
+
+; EG: MAX_INT
+; EG: MAX_INT
define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
%z0 = insertelement <2 x i32> undef, i32 0, i32 0
%z1 = insertelement <2 x i32> %z0, i32 0, i32 1
@@ -46,14 +54,17 @@ define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind
}
; FUNC-LABEL: {{^}}v_abs_v2i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
; GCN: v_add_i32
; GCN: v_add_i32
+
+; EG: MAX_INT
+; EG: MAX_INT
define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
%z0 = insertelement <2 x i32> undef, i32 0, i32 0
%z1 = insertelement <2 x i32> %z0, i32 0, i32 1
@@ -79,6 +90,11 @@ define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
+
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
%z0 = insertelement <4 x i32> undef, i32 0, i32 0
%z1 = insertelement <4 x i32> %z0, i32 0, i32 1
@@ -97,20 +113,25 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind
}
; FUNC-LABEL: {{^}}v_abs_v4i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
; GCN: v_add_i32
; GCN: v_add_i32
; GCN: v_add_i32
; GCN: v_add_i32
+
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
+; EG: MAX_INT
define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
%z0 = insertelement <4 x i32> undef, i32 0, i32 0
%z1 = insertelement <4 x i32> %z0, i32 0, i32 1
@@ -128,3 +149,76 @@ define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
store <4 x i32> %res2, <4 x i32> addrspace(1)* %out, align 4
ret void
}
+
+; FUNC-LABEL: {{^}}s_min_max_i32:
+; GCN: s_load_dword [[VAL0:s[0-9]+]]
+; GCN: s_load_dword [[VAL1:s[0-9]+]]
+
+; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
+; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
+define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
+ %cond0 = icmp sgt i32 %val0, %val1
+ %sel0 = select i1 %cond0, i32 %val0, i32 %val1
+ %sel1 = select i1 %cond0, i32 %val1, i32 %val0
+
+ store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
+ store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_min_max_i32:
+; GCN: buffer_load_dword [[VAL0:v[0-9]+]]
+; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
+
+; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
+; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
+define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+ %val0 = load volatile i32, i32 addrspace(1)* %ptr0
+ %val1 = load volatile i32, i32 addrspace(1)* %ptr1
+
+ %cond0 = icmp sgt i32 %val0, %val1
+ %sel0 = select i1 %cond0, i32 %val0, i32 %val1
+ %sel1 = select i1 %cond0, i32 %val1, i32 %val0
+
+ store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
+ store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_min_max_v4i32:
+; GCN-DAG: s_min_i32
+; GCN-DAG: s_min_i32
+; GCN-DAG: s_min_i32
+; GCN-DAG: s_min_i32
+; GCN-DAG: s_max_i32
+; GCN-DAG: s_max_i32
+; GCN-DAG: s_max_i32
+; GCN-DAG: s_max_i32
+define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
+ %cond0 = icmp sgt <4 x i32> %val0, %val1
+ %sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1
+ %sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0
+
+ store volatile <4 x i32> %sel0, <4 x i32> addrspace(1)* %out0, align 4
+ store volatile <4 x i32> %sel1, <4 x i32> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_min_max_i32_user:
+; GCN: v_cmp_gt_i32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
+define void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+ %val0 = load volatile i32, i32 addrspace(1)* %ptr0
+ %val1 = load volatile i32, i32 addrspace(1)* %ptr1
+
+ %cond0 = icmp sgt i32 %val0, %val1
+ %sel0 = select i1 %cond0, i32 %val0, i32 %val1
+ %sel1 = select i1 %cond0, i32 %val1, i32 %val0
+
+ store volatile i32 %sel0, i32 addrspace(1)* %out0, align 4
+ store volatile i32 %sel1, i32 addrspace(1)* %out1, align 4
+ store volatile i1 %cond0, i1 addrspace(1)* undef
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
new file mode 100644
index 000000000000..ddac8a006c86
--- /dev/null
+++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOVCCZ-BUG %s
+
+; GCN-FUNC: {{^}}vccz_workaround:
+; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0
+; GCN: v_cmp_neq_f32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: s_and_b64 vcc, exec, [[MASK]]
+; GCN: s_waitcnt lgkmcnt(0)
+; VCCZ-BUG: s_mov_b64 vcc, vcc
+; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc
+; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
+; GCN: buffer_store_dword
+; GCN: [[EXIT]]:
+; GCN: s_endpgm
+define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
+entry:
+ %cnd = fcmp oeq float 0.0, %cond
+ %sgpr = load volatile i32, i32 addrspace(2)* %in
+ br i1 %cnd, label %if, label %endif
+
+if:
+ store i32 %sgpr, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; GCN-FUNC: {{^}}vccz_noworkaround:
+; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
+; GCN: buffer_store_dword
+; GCN: [[EXIT]]:
+; GCN: s_endpgm
+define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+ %vgpr = load volatile float, float addrspace(1)* %in
+ %cnd = fcmp oeq float 0.0, %vgpr
+ br i1 %cnd, label %if, label %endif
+
+if:
+ store float %vgpr, float addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 1d6bb9ece8c6..476da9486dff 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -88,7 +88,7 @@ entry:
; GCN-LABEL: {{^}}smrd_load_const0:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -102,7 +102,7 @@ main_body:
; GCN-LABEL: {{^}}smrd_load_const1:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -118,7 +118,7 @@ main_body:
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -133,7 +133,7 @@ main_body:
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -148,7 +148,7 @@ main_body:
; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
-define void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
main_body:
%20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
%21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -158,9 +158,8 @@ main_body:
}
; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare float @llvm.SI.load.const(<16 x i8>, i32) #0
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
index c91a44cf60e5..cc4b6bcbfb51 100644
--- a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -6,7 +6,8 @@
; TONGA-LABEL: test
define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
entry:
- %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%aptr = getelementptr <256 x i32>, <256 x i32> addrspace(1)* %in, i32 %tid
%a = load <256 x i32>, <256 x i32> addrspace(1)* %aptr
call void asm sideeffect "", "~{memory}" ()
@@ -21,4 +22,7 @@ entry:
ret void
}
-declare i32 @llvm.SI.tid() nounwind readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 4a12ed545b81..9b3dfab2be6a 100644
--- a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -1,5 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 < %s | FileCheck %s
+; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 < %s | FileCheck %s
+ ;
+; There is something about Tonga that causes this test to spend a lot of time
+; in the default register allocator.
+
; When the offset of VGPR spills into scratch space gets too large, an additional SGPR
; is used to calculate the scratch load/store address. Make sure that this
@@ -7,10 +11,10 @@
; Just test that it compiles successfully.
; CHECK-LABEL: test
-define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in,
- <96 x i32> addrspace(1)* %sdata_out, <96 x i32> %sdata_in) {
+define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
entry:
- %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%aptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
%a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr
@@ -24,10 +28,13 @@ entry:
call void asm sideeffect "", "~{VGPR164},~{VGPR168},~{VGPR172},~{VGPR176},~{VGPR180},~{VGPR184},~{VGPR188},~{VGPR192}" ()
call void asm sideeffect "", "~{VGPR196},~{VGPR200},~{VGPR204},~{VGPR208},~{VGPR212},~{VGPR216},~{VGPR220},~{VGPR224}" ()
- %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
+ %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %out, i32 %tid
store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr
ret void
}
-declare i32 @llvm.SI.tid() nounwind readnone
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index 9e181bc14d9d..d4e2dc814050 100644
--- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-declare i32 @llvm.r600.read.tidig.x() readnone
+declare i32 @llvm.amdgcn.workitem.id.x() readnone
; This is broken because the low half of the 64-bit add remains on the
; SALU, but the upper half does not. The addc expects the carry bit
@@ -62,7 +62,7 @@ define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i6
; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}}
; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc
define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%load = load i32, i32 addrspace(1)* %gep
%vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
diff --git a/test/CodeGen/AMDGPU/split-smrd.ll b/test/CodeGen/AMDGPU/split-smrd.ll
new file mode 100644
index 000000000000..237a62c1360a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/split-smrd.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; FIXME: Move this to sgpr-copy.ll when this is fixed on VI.
+; Make sure that when we split an smrd instruction in order to move it to
+; the VALU, we are also moving its users to the VALU.
+; CHECK-LABEL: {{^}}split_smrd_add_worklist:
+; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
+
+define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+bb:
+ %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+ %tmp1 = bitcast float %tmp to i32
+ br i1 undef, label %bb2, label %bb3
+
+bb2: ; preds = %bb
+ unreachable
+
+bb3: ; preds = %bb
+ %tmp4 = bitcast float %tmp to i32
+ %tmp5 = add i32 %tmp4, 4
+ %tmp6 = sext i32 %tmp5 to i64
+ %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6
+ %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
+ %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp10 = extractelement <4 x float> %tmp9, i32 0
+ %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
+ %tmp13 = bitcast i32 %tmp12 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp13, float undef, float undef)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare i32 @llvm.SI.packf16(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
+!2 = !{!1, !1, i64 0}
diff --git a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
index 4c82ed6affc2..484150bc25fd 100644
--- a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s
+; XFAIL: *
@sPrivateStorage = external addrspace(3) global [256 x [8 x <4 x i64>]]
@@ -34,14 +35,14 @@ define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly
entry:
%tmp = tail call i32 @llvm.r600.read.local.size.y()
%tmp1 = tail call i32 @llvm.r600.read.local.size.z()
- %tmp2 = tail call i32 @llvm.r600.read.tidig.x()
- %tmp3 = tail call i32 @llvm.r600.read.tidig.y()
- %tmp4 = tail call i32 @llvm.r600.read.tidig.z()
+ %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp3 = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %tmp4 = tail call i32 @llvm.amdgcn.workitem.id.z()
%tmp6 = mul i32 %tmp2, %tmp
%tmp10 = add i32 %tmp3, %tmp6
%tmp11 = mul i32 %tmp10, %tmp1
%tmp9 = add i32 %tmp11, %tmp4
- %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1
+ %x.i.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%x.i.12.i = tail call i32 @llvm.r600.read.local.size.x() #1
%mul.26.i = mul i32 %x.i.12.i, %x.i.i
%add.i = add i32 %tmp2, %mul.26.i
@@ -80,13 +81,13 @@ entry:
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.local.size.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.local.size.y() #1
@@ -95,10 +96,10 @@ declare i32 @llvm.r600.read.local.size.y() #1
declare i32 @llvm.r600.read.local.size.z() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() #1
+declare i32 @llvm.amdgcn.workitem.id.y() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.z() #1
+declare i32 @llvm.amdgcn.workitem.id.z() #1
attributes #0 = { norecurse nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index 3b59bbfb18c0..dddfbfd3ed10 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -1,213 +1,258 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-;EG-LABEL: {{^}}ashr_v2i32:
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+declare i32 @llvm.r600.read.tidig.x() #0
-;SI-LABEL: {{^}}ashr_v2i32:
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; FUNC-LABEL: {{^}}ashr_v2i32:
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI-LABEL: {{^}}ashr_v2i32:
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
- %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
- %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
%result = ashr <2 x i32> %a, %b
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_v4i32:
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: {{^}}ashr_v4i32:
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-LABEL: {{^}}ashr_v4i32:
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
-;VI-LABEL: {{^}}ashr_v4i32:
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
- %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
- %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
%result = ashr <4 x i32> %a, %b
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_i64:
-;EG: ASHR
-
-;SI-LABEL: {{^}}ashr_i64:
-;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
-
-;VI-LABEL: {{^}}ashr_i64:
-;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
+; FUNC-LABEL: {{^}}s_ashr_i64:
+; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
-define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
+; EG: ASHR
+define void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
entry:
- %0 = sext i32 %in to i64
- %1 = ashr i64 %0, 8
- store i64 %1, i64 addrspace(1)* %out
+ %in.ext = sext i32 %in to i64
+ %ashr = ashr i64 %in.ext, 8
+ store i64 %ashr, i64 addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_i64_2:
-;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
-;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
-;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}}
-;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
-;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
-;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-
-;SI-LABEL: {{^}}ashr_i64_2:
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
-;VI-LABEL: {{^}}ashr_i64_2:
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: {{^}}ashr_i64_2:
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
+; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
+; EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}}
+; EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+; EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
entry:
%b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
- %a = load i64, i64 addrspace(1) * %in
- %b = load i64, i64 addrspace(1) * %b_ptr
+ %a = load i64, i64 addrspace(1)* %in
+ %b = load i64, i64 addrspace(1)* %b_ptr
%result = ashr i64 %a, %b
store i64 %result, i64 addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_v2i64:
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: ASHR {{.*}}, [[SHA]]
-;EG-DAG: ASHR {{.*}}, [[SHB]]
-;EG-DAG: LSHR {{.*}}, [[SHA]]
-;EG-DAG: LSHR {{.*}}, [[SHB]]
-;EG-DAG: OR_INT
-;EG-DAG: OR_INT
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ASHR
-;EG-DAG: ASHR
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-
-;SI-LABEL: {{^}}ashr_v2i64:
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
-;VI-LABEL: {{^}}ashr_v2i64:
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: {{^}}ashr_v2i64:
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: ASHR {{.*}}, [[SHA]]
+; EG-DAG: ASHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ASHR
+; EG-DAG: ASHR
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
- %a = load <2 x i64>, <2 x i64> addrspace(1) * %in
- %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr
+ %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
+ %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
%result = ashr <2 x i64> %a, %b
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
ret void
}
-;EG-LABEL: {{^}}ashr_v4i64:
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
-;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHC]]
-;EG-DAG: LSHL {{\*? *}}[[COMPSHD]]
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: LSHL {{.*}}, 1
-;EG-DAG: ASHR {{.*}}, [[SHA]]
-;EG-DAG: ASHR {{.*}}, [[SHB]]
-;EG-DAG: ASHR {{.*}}, [[SHC]]
-;EG-DAG: ASHR {{.*}}, [[SHD]]
-;EG-DAG: LSHR {{.*}}, [[SHA]]
-;EG-DAG: LSHR {{.*}}, [[SHB]]
-;EG-DAG: LSHR {{.*}}, [[SHA]]
-;EG-DAG: LSHR {{.*}}, [[SHB]]
-;EG-DAG: OR_INT
-;EG-DAG: OR_INT
-;EG-DAG: OR_INT
-;EG-DAG: OR_INT
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
-;EG-DAG: ASHR
-;EG-DAG: ASHR
-;EG-DAG: ASHR
-;EG-DAG: ASHR
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: ASHR {{.*}}, literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
-;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-;EG-DAG: CNDE_INT
-
-;SI-LABEL: {{^}}ashr_v4i64:
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-
-;VI-LABEL: {{^}}ashr_v4i64:
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; FIXME: Broken on r600
+; XFUNC-LABEL: {{^}}s_ashr_v2i64:
+; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
+; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
+; define void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) {
+; %result = ashr <2 x i64> %a, %b
+; store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}ashr_v4i64:
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHC]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHD]]
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: ASHR {{.*}}, [[SHA]]
+; EG-DAG: ASHR {{.*}}, [[SHB]]
+; EG-DAG: ASHR {{.*}}, [[SHC]]
+; EG-DAG: ASHR {{.*}}, [[SHD]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ASHR
+; EG-DAG: ASHR
+; EG-DAG: ASHR
+; EG-DAG: ASHR
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: ASHR {{.*}}, literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
- %a = load <4 x i64>, <4 x i64> addrspace(1) * %in
- %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr
+ %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
+ %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
%result = ashr <4 x i64> %a, %b
store <4 x i64> %result, <4 x i64> addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}s_ashr_32_i64:
+; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
+; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}}
+; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}}
+define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %result = ashr i64 %a, 32
+ %add = add i64 %result, %b
+ store i64 %add, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_32_i64:
+; SI: buffer_load_dword v[[HI:[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; VI: flat_load_dword v[[HI:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
+; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}}
+define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.in
+ %result = ashr i64 %a, 32
+ store i64 %result, i64 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_ashr_63_i64:
+; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
+; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
+; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
+define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %result = ashr i64 %a, 63
+ %add = add i64 %result, %b
+ store i64 %add, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_63_i64:
+; SI: buffer_load_dword v[[HI:[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; VI: flat_load_dword v[[HI:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
+; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]]
+; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[SHIFT]]:[[COPY]]{{\]}}
+define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep.in
+ %result = ashr i64 %a, 63
+ store i64 %result, i64 addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll
index ba4049f28a6e..57a93ccd2505 100644
--- a/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/test/CodeGen/AMDGPU/store-barrier.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
; This test is for a bug in the machine scheduler where stores without
; an underlying object would be moved across the barrier. In this
@@ -12,16 +12,16 @@
; CHECK: s_barrier
; CHECK: s_endpgm
; Function Attrs: nounwind
-define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) {
+define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
bb:
%tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
%tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
%tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13
- %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2
+ %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 1
%tmp16 = add i32 %tmp13, 1
%tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16
- store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2
- tail call void @llvm.AMDGPU.barrier.local() #2
+ store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 1
+ tail call void @llvm.amdgcn.s.barrier()
%tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4
%tmp26 = sext i32 %tmp25 to i64
%tmp27 = sext i32 %arg4 to i64
@@ -37,6 +37,7 @@ bb:
}
; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
+declare void @llvm.amdgcn.s.barrier() #1
-attributes #2 = { convergent nounwind }
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll
index e0c554ad2c17..b4d7505e0a8a 100644
--- a/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -1,29 +1,128 @@
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; SI-LABEL: {{^}}global_store_v3i64:
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}global_store_v3i64:
+; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
ret void
}
-; SI-LABEL: {{^}}global_store_v3i64_unaligned:
+; GCN-LABEL: {{^}}global_store_v3i64_unaligned:
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
ret void
}
-; SI-LABEL: {{^}}local_store_v3i64:
+; GCN-LABEL: {{^}}local_store_v3i64:
+; GCN: ds_write_b64
+; GCN: ds_write_b64
+; GCN: ds_write_b64
define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
ret void
}
-; SI-LABEL: {{^}}local_store_v3i64_unaligned:
-define void @local_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
- store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
+; GCN-LABEL: {{^}}local_store_v3i64_unaligned:
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
+; GCN-DAG: buffer_store_dwordx2
+; GCN-DAG: buffer_store_dword v
+define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
+ %trunc = trunc <3 x i64> %x to <3 x i32>
+ store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i16:
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_dword v
+define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
+ %trunc = trunc <3 x i64> %x to <3 x i16>
+ store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8:
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_byte v
+define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
+ %trunc = trunc <3 x i64> %x to <3 x i8>
+ store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i1:
+; GCN-DAG: buffer_store_byte v
+; GCN-DAG: buffer_store_byte v
+; GCN-DAG: buffer_store_byte v
+define void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
+ %trunc = trunc <3 x i64> %x to <3 x i1>
+ store <3 x i1> %trunc, <3 x i1> addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/store.ll b/test/CodeGen/AMDGPU/store.ll
index d22f43fa05ef..68c659fdd2ff 100644
--- a/test/CodeGen/AMDGPU/store.ll
+++ b/test/CodeGen/AMDGPU/store.ll
@@ -77,12 +77,31 @@ entry:
ret void
}
+; FUNC-LABEL: {{^}}store_i24:
+; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_short
+define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
+entry:
+ store i24 %in, i24 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_i25:
+; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
+; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
+; SI: buffer_store_dword [[VAND]]
+define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
+entry:
+ store i25 %in, i25 addrspace(1)* %out
+ ret void
+}
+
; FUNC-LABEL: {{^}}store_v2i8:
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_short
define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
@@ -96,8 +115,7 @@ entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD
-; SI: buffer_store_short
-; SI: buffer_store_short
+; SI: buffer_store_dword
define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
@@ -110,10 +128,7 @@ entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dword
define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
@@ -135,17 +150,9 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
}
; FUNC-LABEL: {{^}}store_v4i16:
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG-NOT: MEM_RAT MSKOR
+; MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI-NOT: buffer_store_byte
+; SI: buffer_store_dwordx2
define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i16>
@@ -239,8 +246,7 @@ define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
; CM: LDS_WRITE
-; SI: ds_write_b16
-; SI: ds_write_b16
+; SI: ds_write_b32
define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
entry:
store <2 x i16> %in, <2 x i16> addrspace(3)* %out
@@ -252,10 +258,7 @@ entry:
; CM: LDS_WRITE
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI: ds_write_b32
define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(3)* %out
@@ -287,8 +290,7 @@ entry:
; CM: LDS_WRITE
; CM: LDS_WRITE
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_write2_b64
define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
store <4 x i32> %in, <4 x i32> addrspace(3)* %out
@@ -358,20 +360,13 @@ entry:
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
; When i128 was a legal type this program generated cannot select errors:
; FUNC-LABEL: {{^}}"i128-const-store":
-; FIXME: We should be able to to this with one store instruction
-; EG: STORE_RAW
-; EG: STORE_RAW
-; EG: STORE_RAW
-; EG: STORE_RAW
-; CM: STORE_DWORD
-; CM: STORE_DWORD
-; CM: STORE_DWORD
-; CM: STORE_DWORD
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 1
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X
+
; SI: buffer_store_dwordx4
define void @i128-const-store(i32 addrspace(1)* %out) {
entry:
@@ -384,3 +379,5 @@ entry:
store i32 2, i32 addrspace(1)* %arrayidx6, align 4
ret void
}
+
+attributes #0 = { nounwind }
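
A small standalone sketch (not part of the patch; names are illustrative) of why the byte and short store checks above collapse into single wider stores: a <4 x i8> value is bit-identical to one i32, so storing its bitcast with a single buffer_store_dword writes the same bytes.

; Illustrative sketch only:
define void @store_v4i8_as_dword(<4 x i8> addrspace(1)* %out, <4 x i8> %v) {
  %v.i32   = bitcast <4 x i8> %v to i32
  %out.i32 = bitcast <4 x i8> addrspace(1)* %out to i32 addrspace(1)*
  store i32 %v.i32, i32 addrspace(1)* %out.i32, align 4
  ret void
}

The same argument gives a single short store for <2 x i8> and a dwordx2 store for <4 x i16>.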
diff --git a/test/CodeGen/AMDGPU/structurize.ll b/test/CodeGen/AMDGPU/structurize.ll
index 02e592e9a559..174e64e2cf8b 100644
--- a/test/CodeGen/AMDGPU/structurize.ll
+++ b/test/CodeGen/AMDGPU/structurize.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood -r600-ir-structurize=0 | FileCheck %s
; Test case for a crash in the AMDILCFGStructurizer from a CFG like this:
;
; entry
diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll
index 77432c1f9d2b..db0f50247e38 100644
--- a/test/CodeGen/AMDGPU/structurize1.ll
+++ b/test/CodeGen/AMDGPU/structurize1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -r600-if-convert=0 < %s | FileCheck %s
; This tests for a bug where the AMDILCFGStructurizer was crashing on loops
; like this:
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index 9f9446a4e608..5a026cdf2990 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -58,13 +58,11 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)
; SI: s_sub_u32
; SI: s_subb_u32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: SUB_INT {{[* ]*}}
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
-; EG-NOT: SUB
+; EG-DAG: SUB_INT {{[* ]*}}
define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
%result = sub i64 %a, %b
store i64 %result, i64 addrspace(1)* %out, align 8
@@ -75,13 +73,11 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind
; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: SUB_INT {{[* ]*}}
; EG-DAG: SUBB_UINT
; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
-; EG-NOT: SUB
+; EG-DAG: SUB_INT {{[* ]*}}
define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
@@ -110,13 +106,13 @@ define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(
}
; FUNC-LABEL: {{^}}v_test_sub_v4i64:
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
; SI: v_subb_u32_e32
define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
index c4dae4736cfa..ec2ed78b4954 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
@@ -44,7 +44,7 @@ for.inc.1: ; preds = %do.body.1562.prehea
; SI-LABEL: {{^}}foo:
; SI: s_endpgm
-define void @foo() #0 {
+define amdgpu_ps void @foo() #0 {
bb:
br i1 undef, label %bb2, label %bb1
@@ -67,7 +67,7 @@ bb7: ; preds = %bb6
br label %bb4
bb9: ; preds = %bb2
- %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2)
+ %tmp10 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tmp11 = extractelement <4 x float> %tmp10, i32 1
%tmp12 = extractelement <4 x float> %tmp10, i32 3
br label %bb14
@@ -98,12 +98,12 @@ bb27: ; preds = %bb24
}
; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index ac9bedb2f8b5..4b6f65a77b9a 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -7,23 +7,25 @@ target triple="amdgcn--"
; CHECK: s_load_dword s2, s[0:1], 0x9
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; CHECK: v_mbcnt_lo_u32_b32_e64
+; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; BB0_1:
-; CHECK: s_load_dword s6, s[0:1], 0xa
+; CHECK: s_load_dword s0, s[0:1], 0xa
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
; BB0_2:
; CHECK: s_or_b64 exec, exec, s[2:3]
; CHECK-NEXT: s_mov_b32 s7, 0xf000
; CHECK-NEXT: s_mov_b32 s6, -1
-; CHECK-NEXT: buffer_store_dword v1, s[4:7], 0
+; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0
; CHECK-NEXT: s_endpgm
define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
entry:
%v0 = insertelement <4 x float> undef, float %a0, i32 0
- br i1 undef, label %ift, label %ife
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+ %cnd = icmp eq i32 %tid, 0
+ br i1 %cnd, label %ift, label %ife
ift:
%v1 = insertelement <4 x float> undef, float %a1, i32 0
@@ -35,3 +37,7 @@ ife:
store float %v2, float addrspace(1)* %out, align 4
ret void
}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/swizzle-export.ll b/test/CodeGen/AMDGPU/swizzle-export.ll
index 000ee2faa478..7cf380520d42 100644
--- a/test/CodeGen/AMDGPU/swizzle-export.ll
+++ b/test/CodeGen/AMDGPU/swizzle-export.ll
@@ -6,7 +6,7 @@
;EG: EXPORT T{{[0-9]+}}.XXWX
;EG: EXPORT T{{[0-9]+}}.XXXW
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -68,27 +68,27 @@ main_body:
%57 = insertelement <4 x float> %56, float %1, i32 1
%58 = insertelement <4 x float> %57, float %2, i32 2
%59 = insertelement <4 x float> %58, float %3, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %59, i32 60, i32 1)
%60 = insertelement <4 x float> undef, float %10, i32 0
%61 = insertelement <4 x float> %60, float %13, i32 1
%62 = insertelement <4 x float> %61, float %16, i32 2
%63 = insertelement <4 x float> %62, float %19, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %63, i32 0, i32 2)
%64 = insertelement <4 x float> undef, float %22, i32 0
%65 = insertelement <4 x float> %64, float %25, i32 1
%66 = insertelement <4 x float> %65, float %28, i32 2
%67 = insertelement <4 x float> %66, float %31, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %67, i32 1, i32 2)
%68 = insertelement <4 x float> undef, float %34, i32 0
%69 = insertelement <4 x float> %68, float %37, i32 1
%70 = insertelement <4 x float> %69, float %40, i32 2
%71 = insertelement <4 x float> %70, float %43, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %71, i32 2, i32 2)
%72 = insertelement <4 x float> undef, float %46, i32 0
%73 = insertelement <4 x float> %72, float %49, i32 1
%74 = insertelement <4 x float> %73, float %52, i32 2
%75 = insertelement <4 x float> %74, float %55, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %75, i32 3, i32 2)
ret void
}
@@ -96,7 +96,7 @@ main_body:
; EG: T{{[0-9]+}}.XY__
; EG: T{{[0-9]+}}.ZXY0
-define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_vs void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
main_body:
%0 = extractelement <4 x float> %reg1, i32 0
%1 = extractelement <4 x float> %reg1, i32 1
@@ -111,19 +111,18 @@ main_body:
%10 = extractelement <4 x float> %9, i32 1
%11 = insertelement <4 x float> undef, float %2, i32 0
%12 = insertelement <4 x float> %11, float %3, i32 1
- call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1)
+ call void @llvm.r600.store.swizzle(<4 x float> %12, i32 60, i32 1)
%13 = insertelement <4 x float> undef, float %6, i32 0
%14 = insertelement <4 x float> %13, float %8, i32 1
%15 = insertelement <4 x float> %14, float %10, i32 2
%16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2)
+ call void @llvm.r600.store.swizzle(<4 x float> %16, i32 0, i32 2)
ret void
}
; Function Attrs: nounwind readonly
declare float @llvm.cos.f32(float) #1
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/target-cpu.ll b/test/CodeGen/AMDGPU/target-cpu.ll
new file mode 100644
index 000000000000..c1662acbf2a0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/target-cpu.ll
@@ -0,0 +1,112 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; CI+ intrinsic
+declare void @llvm.amdgcn.s.dcache.inv.vol() #0
+
+; VI+ intrinsic
+declare void @llvm.amdgcn.s.dcache.wb() #0
+
+; CHECK-LABEL: {{^}}target_none:
+; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @target_none() #0 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ ret void
+}
+
+; CHECK-LABEL: {{^}}target_tahiti:
+; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @target_tahiti() #1 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ ret void
+}
+
+; CHECK-LABEL: {{^}}target_bonaire:
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; CHECK: s_dcache_inv_vol
+define void @target_bonaire() #3 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ call void @llvm.amdgcn.s.dcache.inv.vol()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}target_fiji:
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400
+; CHECK: flat_store_dword
+; CHECK: s_dcache_wb{{$}}
+define void @target_fiji() #4 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ call void @llvm.amdgcn.s.dcache.wb()
+ ret void
+}
+
+; CHECK-LABEL: {{^}}promote_alloca_enabled:
+; CHECK: ds_read_b32
+; CHECK: ; LDSByteSize: 5120
+define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}promote_alloca_disabled:
+; CHECK: SCRATCH_RSRC_DWORD0
+; CHECK: SCRATCH_RSRC_DWORD1
+; CHECK: ScratchSize: 24
+define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-cpu"="tahiti" }
+attributes #3 = { nounwind "target-cpu"="bonaire" }
+attributes #4 = { nounwind "target-cpu"="fiji" }
+attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-max-waves-per-eu"="3" }
+attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-max-waves-per-eu"="3" }
diff --git a/test/CodeGen/AMDGPU/tex-clause-antidep.ll b/test/CodeGen/AMDGPU/tex-clause-antidep.ll
index cbb9c50974a4..2420286f766e 100644
--- a/test/CodeGen/AMDGPU/tex-clause-antidep.ll
+++ b/test/CodeGen/AMDGPU/tex-clause-antidep.ll
@@ -3,7 +3,7 @@
;CHECK: TEX
;CHECK-NEXT: ALU
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_vs void @test(<4 x float> inreg %reg0) {
%1 = extractelement <4 x float> %reg0, i32 0
%2 = extractelement <4 x float> %reg0, i32 1
%3 = extractelement <4 x float> %reg0, i32 2
@@ -12,14 +12,12 @@ define void @test(<4 x float> inreg %reg0) #0 {
%6 = insertelement <4 x float> %5, float %2, i32 1
%7 = insertelement <4 x float> %6, float %3, i32 2
%8 = insertelement <4 x float> %7, float %4, i32 3
- %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %9 = call <4 x float> @llvm.r600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %10 = call <4 x float> @llvm.r600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%11 = fadd <4 x float> %9, %10
- call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %11, i32 0, i32 0)
ret void
}
-declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" } \ No newline at end of file
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/texture-input-merge.ll b/test/CodeGen/AMDGPU/texture-input-merge.ll
index 789538af5821..a56a5ca39dcc 100644
--- a/test/CodeGen/AMDGPU/texture-input-merge.ll
+++ b/test/CodeGen/AMDGPU/texture-input-merge.ll
@@ -2,7 +2,7 @@
;CHECK-NOT: MOV
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_vs void @test(<4 x float> inreg %reg0) {
%1 = extractelement <4 x float> %reg0, i32 0
%2 = extractelement <4 x float> %reg0, i32 1
%3 = extractelement <4 x float> %reg0, i32 2
@@ -16,16 +16,14 @@ define void @test(<4 x float> inreg %reg0) #0 {
%11 = insertelement <4 x float> undef, float %7, i32 0
%12 = insertelement <4 x float> %11, float %5, i32 1
%13 = insertelement <4 x float> undef, float %8, i32 0
- %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %14 = call <4 x float> @llvm.r600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %15 = call <4 x float> @llvm.r600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %16 = call <4 x float> @llvm.r600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%17 = fadd <4 x float> %14, %15
%18 = fadd <4 x float> %17, %16
- call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0)
+ call void @llvm.r600.store.swizzle(<4 x float> %18, i32 0, i32 0)
ret void
}
-declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" } \ No newline at end of file
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll
new file mode 100644
index 000000000000..1555cfe39b1e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trap.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+; GCN: warning: <unknown>:0:0: in function trap void (): trap handler not supported
+
+declare void @llvm.trap() #0
+
+; GCN-LABEL: {{^}}trap:
+; GCN: s_endpgm
+; GCN-NEXT: s_endpgm
+define void @trap() {
+ call void @llvm.trap()
+ ret void
+}
+
+attributes #0 = { nounwind noreturn }
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
new file mode 100644
index 000000000000..9e2373c55e35
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %bc = bitcast <2 x i32> %ld to i64
+ %trunc = trunc i64 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+ %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+ %bc = bitcast <3 x i32> %ld to i96
+ %trunc = trunc i96 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %bc = bitcast <4 x i32> %ld to i128
+ %trunc = trunc i128 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+ %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %bc = bitcast <2 x i16> %ld to i32
+ %trunc = trunc i32 %bc to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+ %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %bc = bitcast <4 x i16> %ld to i64
+ %trunc = trunc i64 %bc to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
+; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+ %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %bc = bitcast <2 x i8> %ld to i16
+ %trunc = trunc i16 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+ %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %bc = bitcast <4 x i8> %ld to i32
+ %trunc = trunc i32 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ %bc = bitcast <3 x i8> %ld to i24
+ %trunc = trunc i24 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
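
A minimal sketch (not part of the patch; the function name is made up) of the property these trunc-of-bitcast tests exercise: on a little-endian target such as amdgcn, truncating the bitcast of a vector reads only element 0, which is why the wide loads above can legitimately be narrowed to a single dword or byte.

; Illustrative sketch only:
define i32 @trunc_bitcast_is_elt0(<2 x i32> %v) {
  %bc = bitcast <2 x i32> %v to i64
  %t  = trunc i64 %bc to i32   ; same value as extractelement <2 x i32> %v, i32 0
  ret i32 %t
}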
diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index dac74728b3ce..6d820dbd2692 100644
--- a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -4,8 +4,7 @@
; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
-; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}}
+; SI: v_cmp_eq_i32_e32 vcc, 0, [[TMP]]{{$}}
; SI: v_cndmask_b32_e64
; SI: buffer_store_byte
define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
@@ -23,7 +22,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@@ -46,7 +45,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@@ -58,7 +57,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
@@ -82,7 +81,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
@@ -94,7 +93,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@@ -120,7 +119,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
+; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@@ -159,7 +158,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
; SI: buffer_load_sbyte [[LOAD:v[0-9]+]]
; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}}
; SI-NEXT: v_cndmask_b32_e64
-; SI-NEXT: buffer_store_byte
+; SI: buffer_store_byte
define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
%load = load i8, i8 addrspace(1)* %in
%masked = and i8 %load, 255
diff --git a/test/CodeGen/AMDGPU/trunc-store.ll b/test/CodeGen/AMDGPU/trunc-store.ll
index 4ba815f26690..cf5c00e65b7d 100644
--- a/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/test/CodeGen/AMDGPU/trunc-store.ll
@@ -2,22 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dwordx4
define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
%trunc = trunc <16 x i32> %in to <16 x i8>
store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
@@ -25,22 +10,7 @@ define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x
}
; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8:
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dwordx4
define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
%trunc = trunc <16 x i64> %in to <16 x i8>
store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
index ad52d0f2e238..dbd07fee6bbe 100644
--- a/test/CodeGen/AMDGPU/trunc.ll
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -37,8 +37,8 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
; SI: s_addc_u32
; SI: v_mov_b32_e32
-; SI: v_mov_b32_e32
; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; SI: v_mov_b32_e32
; SI: buffer_store_dword v[[LO_VREG]],
define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
%aa = add i64 %a, 234 ; Prevent shrinking store.
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index 2a09e0b20498..f72c22095e4a 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv_i32:
; EG-NOT: SETGE_INT
@@ -91,3 +91,57 @@ define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}v_udiv_i8:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = udiv i8 %num, %den
+ %result.ext = zext i8 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i16:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+ %num = load i16, i16 addrspace(1) * %in
+ %den = load i16, i16 addrspace(1) * %den_ptr
+ %result = udiv i16 %num, %den
+ %result.ext = zext i16 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i23:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+ %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
+ %num = load i23, i23 addrspace(1) * %in
+ %den = load i23, i23 addrspace(1) * %den_ptr
+ %result = udiv i23 %num, %den
+ %result.ext = zext i23 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i24:
+; SI-NOT: v_rcp_f32
+define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+ %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
+ %num = load i24, i24 addrspace(1) * %in
+ %den = load i24, i24 addrspace(1) * %den_ptr
+ %result = udiv i24 %num, %den
+ %result.ext = zext i24 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll
index f692b7dfdc27..268f3c764d6e 100644
--- a/test/CodeGen/AMDGPU/udivrem.ll
+++ b/test/CodeGen/AMDGPU/udivrem.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
@@ -51,11 +51,11 @@
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI: s_endpgm
-define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
%result0 = udiv i32 %x, %y
- store i32 %result0, i32 addrspace(1)* %out
+ store i32 %result0, i32 addrspace(1)* %out0
%result1 = urem i32 %x, %y
- store i32 %result1, i32 addrspace(1)* %out
+ store i32 %result1, i32 addrspace(1)* %out1
ret void
}
@@ -107,50 +107,54 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; EG-DAG: CNDE_INT
; EG-DAG: CNDE_INT
-; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
-; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[FIRST_RCP_LO]]
+; For SI, we used to have checks for the input and output registers
+; of the instructions, but these are way too fragile. The division for
+; the two vector elements can be intermixed, which makes it impossible to
+; accurately check all the operands.
+; SI-DAG: v_rcp_iflag_f32_e32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_sub_i32_e32
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
-; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], vcc, [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_and_b32_e32
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
-; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[SECOND_RCP_LO]]
+; SI-DAG: v_rcp_iflag_f32_e32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_sub_i32_e32
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
-; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], vcc, [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_and_b32_e32
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_add_i32_e32
+; SI-DAG: v_subrev_i32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll
index 4de881b66f10..147b95560935 100644
--- a/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/test/CodeGen/AMDGPU/udivrem24.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv24_i8:
@@ -40,7 +40,7 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
ret void
}
-; FUNC-LABEL: {{^}}udiv24_i32:
+; FUNC-LABEL: {{^}}udiv23_i32:
; SI: v_cvt_f32_u32
; SI-DAG: v_cvt_f32_u32
; SI-DAG: v_rcp_f32
@@ -50,6 +50,23 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
+define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i23.0 = shl i32 %den, 9
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i23, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
@@ -63,6 +80,40 @@ define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
+; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
+define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i24.0 = shl i32 %den, 8
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i24 = lshr i32 %den.i24.0, 8
+ %result = udiv i32 %num.i23, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
+define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i23.0 = shl i32 %den, 9
+ %num.i24 = lshr i32 %num.i24.0, 8
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i24, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
; FUNC-LABEL: {{^}}udiv25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
@@ -74,11 +125,11 @@ define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
- %num.i24.0 = shl i32 %num, 7
- %den.i24.0 = shl i32 %den, 7
- %num.i24 = lshr i32 %num.i24.0, 7
- %den.i24 = lshr i32 %den.i24.0, 7
- %result = udiv i32 %num.i24, %den.i24
+ %num.i25.0 = shl i32 %num, 7
+ %den.i25.0 = shl i32 %den, 7
+ %num.i25 = lshr i32 %num.i25.0, 7
+ %den.i25 = lshr i32 %den.i25.0, 7
+ %result = udiv i32 %num.i25, %den.i25
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
@@ -162,15 +213,8 @@ define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
}
; FUNC-LABEL: {{^}}urem24_i32:
-; SI: v_cvt_f32_u32
-; SI: v_cvt_f32_u32
-; SI: v_rcp_f32
-; SI: v_cvt_u32_f32
-
-; EG: UINT_TO_FLT
-; EG-DAG: UINT_TO_FLT
-; EG-DAG: RECIP_IEEE
-; EG: FLT_TO_UINT
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
@@ -243,3 +287,41 @@ define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
+
+; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
+; SI-DAG: v_rcp_f32
+; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; EG: RECIP_IEEE
+define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i16.0 = shl i32 %num, 16
+ %den.i23.0 = shl i32 %den, 9
+ %num.i16 = lshr i32 %num.i16.0, 16
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i16, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
+; SI-DAG: v_rcp_f32
+; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; EG: RECIP_IEEE
+define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i16.0 = shl i32 %den, 16
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i16 = lshr i32 %den.i16.0, 16
+ %result = udiv i32 %num.i23, %den.i16
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll
index 9f3069bdf80c..72e6af9a6eea 100644
--- a/test/CodeGen/AMDGPU/udivrem64.ll
+++ b/test/CodeGen/AMDGPU/udivrem64.ll
@@ -1,4 +1,4 @@
-;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
@@ -184,7 +184,7 @@ define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
ret void
}
-;FUNC-LABEL: {{^}}test_udiv2464:
+;FUNC-LABEL: {{^}}test_udiv2364:
;EG: UINT_TO_FLT
;EG: UINT_TO_FLT
;EG: FLT_TO_UINT
@@ -195,15 +195,15 @@ define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;VI-NOT: v_lshrrev_b64
;GCN: v_mad_f32
;GCN: s_endpgm
-define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
- %1 = lshr i64 %x, 40
- %2 = lshr i64 %y, 40
+define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 41
+ %2 = lshr i64 %y, 41
%result = udiv i64 %1, %2
store i64 %result, i64 addrspace(1)* %out
ret void
}
-;FUNC-LABEL: {{^}}test_urem2464:
+;FUNC-LABEL: {{^}}test_urem2364:
;EG: UINT_TO_FLT
;EG: UINT_TO_FLT
;EG: FLT_TO_UINT
@@ -214,9 +214,9 @@ define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
;VI-NOT: v_lshrrev_b64
;GCN: v_mad_f32
;GCN: s_endpgm
-define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
- %1 = lshr i64 %x, 40
- %2 = lshr i64 %y, 40
+define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 41
+ %2 = lshr i64 %y, 41
%result = urem i64 %1, %2
store i64 %result, i64 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 65fe580792a5..b36ce6b8d6ca 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64
; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%val = load i64, i64 addrspace(1)* %gep, align 8
%result = uitofp i64 %val to double
@@ -70,14 +70,14 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
ret void
}
-; FIXME: select on 0, 0
-; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
-; SI: v_cmp_eq_i32_e64 vcc
; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
; uses an SGPR (implicit vcc).
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc
-; SI: buffer_store_dwordx2
+
+; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
+; SI-DAG: v_cmp_eq_i32_e64 vcc
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; SI: s_endpgm
define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
%cmp = icmp eq i32 %in, 0
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 3ab11442d5cc..27c41e41a0e7 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -22,7 +22,7 @@ define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
; GCN-DAG: v_cmp_lt_u64
; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
-; GCN: {{buffer|flat}}_store_dword [[VR]]
+; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll
index a3343d1e2d9c..0c3d54cf0d09 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -103,7 +103,7 @@ define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
; SI: v_cmp_eq_i32
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
-; SI: {{buffer|flat}}_store_dword [[RESULT]],
+; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
; SI: s_endpgm
define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll
new file mode 100644
index 000000000000..a26eb8f9ada9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/umed3.ll
@@ -0,0 +1,484 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
+; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp ult i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32:
+; GCN: v_max_u32
+; GCN: v_min_u32
+define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp ult i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store volatile i32 %i0, i32 addrspace(1)* %outgep
+ store volatile i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32:
+; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i32 %a, 17
+ %i0 = select i1 %icmp0, i32 %a, i32 17
+
+ %icmp1 = icmp ult i32 %i0, 12
+ %i1 = select i1 %icmp1, i32 %i0, i32 12
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
+; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
+; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
+define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0
+
+ %icmp0 = icmp sgt i32 %a, 12
+ %i0 = select i1 %icmp0, i32 %a, i32 12
+
+ %icmp1 = icmp ult i32 %i0, 17
+ %i1 = select i1 %icmp1, i32 %i0, i32 17
+
+ store i32 %i1, i32 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64:
+; GCN: v_cmp_lt_u64
+; GCN: v_cmp_gt_u64
+define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i64 %a, 12
+ %i0 = select i1 %icmp0, i64 %a, i64 12
+
+ %icmp1 = icmp ult i64 %i0, 17
+ %i1 = select i1 %icmp1, i64 %i0, i64 17
+
+ store i64 %i1, i64 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
+; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+
+ %icmp0 = icmp ugt i16 %a, 12
+ %i0 = select i1 %icmp0, i16 %a, i16 12
+
+ %icmp1 = icmp ult i16 %i0, 17
+ %i1 = select i1 %icmp1, i16 %i0, i16 17
+
+ store i16 %i1, i16 addrspace(1)* %outgep
+ ret void
+}
+
+define internal i32 @umin(i32 %x, i32 %y) #2 {
+ %cmp = icmp ult i32 %x, %y
+ %sel = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %sel
+}
+
+define internal i32 @umax(i32 %x, i32 %y) #2 {
+ %cmp = icmp ugt i32 %x, %y
+ %sel = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %sel
+}
+
+define internal i16 @umin16(i16 %x, i16 %y) #2 {
+ %cmp = icmp ult i16 %x, %y
+ %sel = select i1 %cmp, i16 %x, i16 %y
+ ret i16 %sel
+}
+
+define internal i16 @umax16(i16 %x, i16 %y) #2 {
+ %cmp = icmp ugt i16 %x, %y
+ %sel = select i1 %cmp, i16 %x, i16 %y
+ ret i16 %sel
+}
+
+define internal i8 @umin8(i8 %x, i8 %y) #2 {
+ %cmp = icmp ult i8 %x, %y
+ %sel = select i1 %cmp, i8 %x, i8 %y
+ ret i8 %sel
+}
+
+define internal i8 @umax8(i8 %x, i8 %y) #2 {
+ %cmp = icmp ugt i8 %x, %y
+ %sel = select i1 %cmp, i8 %x, i8 %y
+ ret i8 %sel
+}
+
+; 16 combinations
+
+; 0: max(min(x, y), min(max(x, y), z))
+; 1: max(min(x, y), min(max(y, x), z))
+; 2: max(min(x, y), min(z, max(x, y)))
+; 3: max(min(x, y), min(z, max(y, x)))
+; 4: max(min(y, x), min(max(x, y), z))
+; 5: max(min(y, x), min(max(y, x), z))
+; 6: max(min(y, x), min(z, max(x, y)))
+; 7: max(min(y, x), min(z, max(y, x)))
+;
+; + commute outermost max
+
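As a quick aside, each of these patterns computes the median of three unsigned values, which is exactly what v_med3_u32 implements. A minimal C check (not part of the test file; the med3_* names are illustrative, and the helpers mirror the @umin/@umax functions above):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t umin(uint32_t x, uint32_t y) { return x < y ? x : y; }
    static uint32_t umax(uint32_t x, uint32_t y) { return x > y ? x : y; }

    /* Pattern 0: max(min(x, y), min(max(x, y), z)). */
    static uint32_t med3_pat0(uint32_t x, uint32_t y, uint32_t z) {
      return umax(umin(x, y), umin(umax(x, y), z));
    }

    /* Reference median: sum minus min and max. Exact even when the sum wraps,
       because the true median always fits in 32 bits. */
    static uint32_t med3_ref(uint32_t x, uint32_t y, uint32_t z) {
      return x + y + z - umin(x, umin(y, z)) - umax(x, umax(y, z));
    }

    int main(void) {
      uint32_t tests[][3] = {
        {0, 12, 17}, {15, 12, 17}, {100, 12, 17}, {5, 5, 9},
        {4000000000u, 4000000001u, 2}
      };
      for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i) {
        uint32_t x = tests[i][0], y = tests[i][1], z = tests[i][2];
        assert(med3_pat0(x, y, z) == med3_ref(x, y, z));
      }
      puts("pattern 0 computes the median of three");
      return 0;
    }

The {0, 12, 17}, {15, 12, 17} and {100, 12, 17} triples are the clamp-to-[12, 17] case exercised by the r_i_i tests above.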
+
+; FIXME: In these cases we probably should have used scalar operations
+; instead.
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %y, i32 %x)
+ %tmp1 = call i32 @umax(i32 %y, i32 %x)
+ %tmp2 = call i32 @umin(i32 %z, i32 %tmp1)
+ %tmp3 = call i32 @umax(i32 %tmp2, i32 %tmp0)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i16_pat_0:
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+bb:
+ %tmp0 = call i16 @umin16(i16 %x, i16 %y)
+ %tmp1 = call i16 @umax16(i16 %x, i16 %y)
+ %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
+ %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
+ store i16 %tmp3, i16 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i8_pat_0:
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+bb:
+ %tmp0 = call i8 @umin8(i8 %x, i8 %y)
+ %tmp1 = call i8 @umax8(i8 %x, i8 %y)
+ %tmp2 = call i8 @umin8(i8 %tmp1, i8 %z)
+ %tmp3 = call i8 @umax8(i8 %tmp0, i8 %tmp2)
+ store i8 %tmp3, i8 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0:
+; GCN-NOT: v_med3_u32
+define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp0, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1:
+; GCN-NOT: v_med3_u32
+define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp1, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2:
+; GCN-NOT: v_med3_u32
+define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp2, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ store volatile i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 1, i32 %y)
+ %tmp1 = call i32 @umax(i32 1, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
+define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 2)
+ %tmp1 = call i32 @umax(i32 %x, i32 2)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 %z)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2:
+; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9
+define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+bb:
+ %tmp0 = call i32 @umin(i32 %x, i32 %y)
+ %tmp1 = call i32 @umax(i32 %x, i32 %y)
+ %tmp2 = call i32 @umin(i32 %tmp1, i32 9)
+ %tmp3 = call i32 @umax(i32 %tmp0, i32 %tmp2)
+ store i32 %tmp3, i32 addrspace(1)* %arg
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone alwaysinline }
diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 82d88ebd3ae7..129748afd938 100644
--- a/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -1,62 +1,97 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=UNALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
-; SI-LABEL: {{^}}unaligned_load_store_i16_local:
+; SI-LABEL: {{^}}local_unaligned_load_store_i16:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
-define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind {
+define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
%v = load i16, i16 addrspace(3)* %p, align 1
store i16 %v, i16 addrspace(3)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i16_global:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI-LABEL: {{^}}global_unaligned_load_store_i16:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+
+; UNALIGNED: buffer_load_ushort
+; UNALIGNED: buffer_store_short
; SI: s_endpgm
-define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind {
+define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
%v = load i16, i16 addrspace(1)* %p, align 1
store i16 %v, i16 addrspace(1)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i32_local:
+; FUNC-LABEL: {{^}}local_unaligned_load_store_i32:
+
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
+; SI-NOT: v_or
+; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
-define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
+define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
%v = load i32, i32 addrspace(3)* %p, align 1
store i32 %v, i32 addrspace(3)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i32_global:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
+; SI-LABEL: {{^}}global_unaligned_load_store_i32:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+
+; UNALIGNED: buffer_load_dword
+; UNALIGNED: buffer_store_dword
+define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
%v = load i32, i32 addrspace(1)* %p, align 1
store i32 %v, i32 addrspace(1)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i64_local:
+; SI-LABEL: {{^}}global_align2_load_store_i32:
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+
+; UNALIGNED: buffer_load_dword
+; UNALIGNED: buffer_store_dword
+define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+ %v = load i32, i32 addrspace(1)* %p, align 2
+ store i32 %v, i32 addrspace(1)* %r, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_align2_load_store_i32:
+; GCN: ds_read_u16
+; GCN: ds_read_u16
+; GCN: ds_write_b16
+; GCN: ds_write_b16
+define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+ %v = load i32, i32 addrspace(3)* %p, align 2
+ store i32 %v, i32 addrspace(3)* %r, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_unaligned_load_store_i64:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
@@ -65,139 +100,263 @@ define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
+
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
-define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
+define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
%v = load i64, i64 addrspace(3)* %p, align 1
store i64 %v, i64 addrspace(3)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}unaligned_load_store_i64_global:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
- %v = load i64, i64 addrspace(1)* %p, align 1
- store i64 %v, i64 addrspace(1)* %r, align 1
- ret void
-}
-
-; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
+; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
-
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
+
; SI: ds_write_b8
+; SI-NOT: v_or_b32
+; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
-define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
+define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
+ %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
+ store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}global_align2_load_store_i64:
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+
+; ALIGNED-NOT: v_or_
+; ALIGNED-NOT: v_lshl
+
+; ALIGNED: buffer_load_ushort
+
+; ALIGNED-NOT: v_or_
+; ALIGNED-NOT: v_lshl
+
+; ALIGNED: buffer_load_ushort
+
+; ALIGNED-NOT: v_or_
+; ALIGNED-NOT: v_lshl
+
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+
+; UNALIGNED: buffer_load_dwordx2
+; UNALIGNED: buffer_store_dwordx2
+define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+ %v = load i64, i64 addrspace(1)* %p, align 2
+ store i64 %v, i64 addrspace(1)* %r, align 2
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_global:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED-NOT: v_or_
+; ALIGNED-NOT: v_lshl
+
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+
+; UNALIGNED: buffer_load_dwordx2
+; UNALIGNED: buffer_store_dwordx2
+define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+ %v = load i64, i64 addrspace(1)* %p, align 1
+ store i64 %v, i64 addrspace(1)* %r, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_unaligned_load_store_v4i32:
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: s_endpgm
+define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
%v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
ret void
}
-; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
-; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-; FIXME-SI: buffer_load_ubyte
-define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
+; SI-LABEL: {{^}}global_unaligned_load_store_v4i32
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+
+; UNALIGNED: buffer_load_dwordx4
+; UNALIGNED: buffer_store_dwordx4
+define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
%v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
ret void
}
-; SI-LABEL: {{^}}load_lds_i64_align_4:
-; SI: ds_read2_b32
-; SI: s_endpgm
-define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+; FUNC-LABEL: {{^}}local_load_i64_align_4:
+; GCN: ds_read2_b32
+define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%val = load i64, i64 addrspace(3)* %in, align 4
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
-; SI: s_endpgm
-define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset
+; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
+define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
%val = load i64, i64 addrspace(3)* %ptr, align 4
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
+; FUNC-LABEL: {{^}}local_load_i64_align_4_with_split_offset:
; This tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
-; SI: s_endpgm
-define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
+; GCN: s_endpgm
+define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
%ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -206,49 +365,191 @@ define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture
ret void
}
-; SI-LABEL: {{^}}load_lds_i64_align_1:
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: ds_read_u8
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
-
-define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+; FUNC-LABEL: {{^}}local_load_i64_align_1:
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: ds_read_u8
+; GCN: store_dwordx2
+define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
%val = load i64, i64 addrspace(3)* %in, align 1
store i64 %val, i64 addrspace(1)* %out, align 8
ret void
}
-; SI-LABEL: {{^}}store_lds_i64_align_4:
-; SI: ds_write2_b32
-; SI: s_endpgm
-define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+; FUNC-LABEL: {{^}}local_store_i64_align_4:
+; GCN: ds_write2_b32
+define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
store i64 %val, i64 addrspace(3)* %out, align 4
ret void
}
-; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset
-; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
-; SI: s_endpgm
-define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
+; GCN: s_endpgm
+define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
%ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
store i64 0, i64 addrspace(3)* %ptr, align 4
ret void
}
-; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
+; FUNC-LABEL: {{^}}local_store_i64_align_4_with_split_offset:
; This tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits
-; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; SI: s_endpgm
-define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN: s_endpgm
+define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
%ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
%ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
store i64 0, i64 addrspace(3)* %out, align 4
ret void
}
+
+; SI-LABEL: {{^}}constant_unaligned_load_i32:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; UNALIGNED: s_load_dword
+
+; SI: buffer_store_dword
+define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+ %v = load i32, i32 addrspace(2)* %p, align 1
+ store i32 %v, i32 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align2_load_i32:
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+
+; UNALIGNED: s_load_dword
+; UNALIGNED: buffer_store_dword
+define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+ %v = load i32, i32 addrspace(2)* %p, align 2
+ store i32 %v, i32 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align2_load_i64:
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+; ALIGNED: buffer_load_ushort
+
+; UNALIGNED: s_load_dwordx2
+; UNALIGNED: buffer_store_dwordx2
+define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+ %v = load i64, i64 addrspace(2)* %p, align 2
+ store i64 %v, i64 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align4_load_i64:
+; SI: s_load_dwordx2
+; SI: buffer_store_dwordx2
+define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+ %v = load i64, i64 addrspace(2)* %p, align 4
+ store i64 %v, i64 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align4_load_v4i32:
+; SI: s_load_dwordx4
+; SI: buffer_store_dwordx4
+define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+ %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4
+ store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_unaligned_load_v2i32:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; UNALIGNED: buffer_load_dwordx2
+
+; SI: buffer_store_dwordx2
+define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
+ %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1
+ store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_unaligned_load_v4i32:
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+; ALIGNED: buffer_load_ubyte
+
+; UNALIGNED: buffer_load_dwordx4
+
+; SI: buffer_store_dwordx4
+define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+ %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
+ store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align4_load_i8:
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+ %v = load i8, i8 addrspace(2)* %p, align 4
+ store i8 %v, i8 addrspace(1)* %r, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align2_load_i8:
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+ %v = load i8, i8 addrspace(2)* %p, align 2
+ store i8 %v, i8 addrspace(1)* %r, align 2
+ ret void
+}
+
+; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
+; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+ %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1
+ %v0 = load i32, i32 addrspace(2)* %p, align 4
+ %v1 = load i32, i32 addrspace(2)* %gep0, align 4
+
+ %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
+ store i32 %v0, i32 addrspace(1)* %r, align 4
+ store i32 %v1, i32 addrspace(1)* %gep1, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
new file mode 100644
index 000000000000..4902e9a3cafb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; We may have subregister live ranges that are undefined on some paths. The
+; verifier should not complain about this.
+
+
+; CHECK-LABEL: {{^}}func:
+define void @func() #0 {
+B0:
+ br i1 undef, label %B1, label %B2
+
+B1:
+ br label %B2
+
+B2:
+ %v0 = phi <4 x float> [ zeroinitializer, %B1 ], [ <float 0.0, float 0.0, float 0.0, float undef>, %B0 ]
+ br i1 undef, label %B30.1, label %B30.2
+
+B30.1:
+ %sub = fsub <4 x float> %v0, undef
+ br label %B30.2
+
+B30.2:
+ %v3 = phi <4 x float> [ %sub, %B30.1 ], [ %v0, %B2 ]
+ %ve0 = extractelement <4 x float> %v3, i32 0
+ store float %ve0, float addrspace(3)* undef, align 4
+ ret void
+}
+
+; FIXME: The extra undef subregister copy should be removed before it is
+; overwritten with the defined copy.
+; CHECK-LABEL: {{^}}valley_partially_undef_copy:
+define amdgpu_ps float @valley_partially_undef_copy() #0 {
+bb:
+ %tmp = load volatile i32, i32 addrspace(1)* undef, align 4
+ %tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4
+ %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp3, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tmp5 = extractelement <4 x float> %tmp4, i32 0
+ %tmp6 = fmul float %tmp5, undef
+ %tmp7 = fadd float %tmp6, %tmp6
+ %tmp8 = insertelement <4 x i32> %tmp2, i32 %tmp, i32 1
+ store <4 x i32> %tmp8, <4 x i32> addrspace(1)* undef, align 16
+ store float %tmp7, float addrspace(1)* undef, align 4
+ br label %bb9
+
+bb9: ; preds = %bb9, %bb
+ %tmp10 = icmp eq i32 %tmp, 0
+ br i1 %tmp10, label %bb9, label %bb11
+
+bb11: ; preds = %bb9
+ store <4 x i32> %tmp2, <4 x i32> addrspace(1)* undef, align 16
+ ret float undef
+}
+
+; FIXME: Should be able to remove the undef copies
+
+; CHECK-LABEL: {{^}}partially_undef_copy:
+; CHECK: v_mov_b32_e32 v5, 5
+; CHECK: v_mov_b32_e32 v6, 6
+
+; CHECK: v_mov_b32_e32 v[[OUTPUT_LO:[0-9]+]], v5
+
+; Undef copy
+; CHECK: v_mov_b32_e32 v1, v6
+
+; undef copy
+; CHECK: v_mov_b32_e32 v2, v7
+
+; CHECK: v_mov_b32_e32 v[[OUTPUT_HI:[0-9]+]], v8
+; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6
+
+; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}}
+define void @partially_undef_copy() #0 {
+ %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"()
+ %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"()
+
+ %partially.undef.0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
+ %partially.undef.1 = insertelement <4 x i32> %partially.undef.0, i32 %tmp1, i32 0
+
+ store volatile <4 x i32> %partially.undef.1, <4 x i32> addrspace(1)* undef, align 16
+ tail call void asm sideeffect "v_nop", "v={VGPR5_VGPR6_VGPR7_VGPR8}"(<4 x i32> %partially.undef.0)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
index 036a7e91b47f..f09f73c38b4c 100644
--- a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -1,8 +1,6 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
+; XUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
@@ -41,8 +39,9 @@ for.end: ; preds = %for.body, %entry
}
; COMMON-LABEL: {{^}}branch_false:
-; SI: .text
-; SI-NEXT: s_endpgm
+; SI: s_cbranch_vccnz
+; SI: s_cbranch_vccnz
+; SI: s_endpgm
define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
entry:
br i1 false, label %for.end, label %for.body.lr.ph
@@ -77,8 +76,9 @@ for.end: ; preds = %for.body, %entry
}
; COMMON-LABEL: {{^}}branch_undef:
-; SI: .text
-; SI-NEXT: s_endpgm
+; SI: s_cbranch_vccnz
+; SI: s_cbranch_vccnz
+; SI: s_endpgm
define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
entry:
br i1 undef, label %for.end, label %for.body.lr.ph
diff --git a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
new file mode 100644
index 000000000000..93a2c6998be4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; This used to raise an assertion due to how the choice between uniform and
+; non-uniform branches was determined.
+;
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_cbranch_vccnz
+define amdgpu_ps float @main(<4 x i32> inreg %rsrc) {
+main_body:
+ %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 true, i1 false)
+ %cc = fcmp une float %v, 1.000000e+00
+ br i1 %cc, label %if, label %else
+
+if:
+ %u = fadd float %v, %v
+ br label %else
+
+else:
+ %r = phi float [ %v, %main_body ], [ %u, %if ]
+ ret float %r
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll
new file mode 100644
index 000000000000..ac9e2b5f8432
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -0,0 +1,439 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+; SI-LABEL: {{^}}uniform_if_scc:
+; SI-DAG: s_cmp_eq_i32 s{{[0-9]+}}, 0
+; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; Fall-through to the else
+; SI: v_mov_b32_e32 [[STORE_VAL]], 1
+
+; SI: [[IF_LABEL]]:
+; SI: buffer_store_dword [[STORE_VAL]]
+define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %if, label %else
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_vcc:
+; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
+; also scheduled the write first.
+; SI-DAG: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
+; SI-DAG: s_and_b64 vcc, exec, [[COND]]
+; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; Fall-through to the else
+; SI: v_mov_b32_e32 [[STORE_VAL]], 1
+
+; SI: [[IF_LABEL]]:
+; SI: buffer_store_dword [[STORE_VAL]]
+define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = fcmp oeq float %cond, 0.0
+ br i1 %cmp0, label %if, label %else
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_swap_br_targets_scc:
+; SI-DAG: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; SI: s_cbranch_scc1 [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; Fall-through to the else
+; SI: v_mov_b32_e32 [[STORE_VAL]], 1
+
+; SI: [[IF_LABEL]]:
+; SI: buffer_store_dword [[STORE_VAL]]
+define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = icmp eq i32 %cond, 0
+ br i1 %cmp0, label %else, label %if
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_swap_br_targets_vcc:
+; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and
+; also scheduled the write first.
+; SI-DAG: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
+; SI-DAG: s_and_b64 vcc, exec, [[COND]]
+; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0
+; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; Fall-through to the else
+; SI: v_mov_b32_e32 [[STORE_VAL]], 1
+
+; SI: [[IF_LABEL]]:
+; SI: buffer_store_dword [[STORE_VAL]]
+define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
+entry:
+ %cmp0 = fcmp oeq float %cond, 0.0
+ br i1 %cmp0, label %else, label %if
+
+if:
+ br label %done
+
+else:
+ br label %done
+
+done:
+ %value = phi i32 [0, %if], [1, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_move_valu:
+; SI: v_add_f32_e32 [[CMP:v[0-9]+]]
+; Using a floating-point value in an integer compare will cause the compare to
+; be selected for the SALU and then later moved to the VALU.
+; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]]
+; SI: s_and_b64 vcc, exec, [[COND]]
+; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: buffer_store_dword
+; SI: [[ENDIF_LABEL]]:
+; SI: s_endpgm
+define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
+entry:
+ %a.0 = fadd float %a, 10.0
+ %cond = bitcast float %a.0 to i32
+ %cmp = icmp eq i32 %cond, 5
+ br i1 %cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_move_valu_commute:
+; SI: v_add_f32_e32 [[CMP:v[0-9]+]]
+; Using a floating-point value in an integer compare will cause the compare to
+; be selected for the SALU and then later moved to the VALU.
+; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]]
+; SI: s_and_b64 vcc, exec, [[COND]]
+; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: buffer_store_dword
+; SI: [[ENDIF_LABEL]]:
+; SI: s_endpgm
+define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
+entry:
+ %a.0 = fadd float %a, 10.0
+ %cond = bitcast float %a.0 to i32
+ %cmp = icmp ugt i32 %cond, 5
+ br i1 %cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+
+; SI-LABEL: {{^}}uniform_if_else_ret:
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; SI: buffer_store_dword [[TWO]]
+; SI: s_endpgm
+
+; SI: {{^}}[[IF_LABEL]]:
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+; SI: s_endpgm
+define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32 addrspace(1)* %out
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 2, i32 addrspace(1)* %out
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_else:
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; SI: buffer_store_dword [[TWO]]
+; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: [[IF_LABEL]]:
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+
+; SI: [[ENDIF_LABEL]]:
+; SI: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
+; SI: buffer_store_dword [[THREE]]
+; SI: s_endpgm
+define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32 addrspace(1)* %out0
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 2, i32 addrspace(1)* %out0
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ store i32 3, i32 addrspace(1)* %out1
+ ret void
+}
+
+; SI-LABEL: {{^}}icmp_2_users:
+; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1
+; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]]
+; SI: buffer_store_dword
+; SI: [[LABEL]]:
+; SI: s_endpgm
+define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+main_body:
+ %0 = icmp sgt i32 %cond, 0
+ %1 = sext i1 %0 to i32
+ br i1 %0, label %IF, label %ENDIF
+
+IF:
+ store i32 %1, i32 addrspace(1)* %out
+ br label %ENDIF
+
+ENDIF: ; preds = %IF, %main_body
+ ret void
+}
+
+; SI-LABEL: {{^}}icmp_users_different_blocks:
+; SI: s_load_dword [[COND:s[0-9]+]]
+; SI: s_cmp_lt_i32 [[COND]], 1
+; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
+; SI: v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]]
+; SI: s_and_b64 vcc, exec, [[MASK]]
+; SI: s_cbranch_vccnz [[EXIT]]
+; SI: buffer_store
+; SI: {{^}}[[EXIT]]:
+; SI: s_endpgm
+define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %cmp0 = icmp sgt i32 %cond0, 0
+ %cmp1 = icmp sgt i32 %cond1, 0
+ br i1 %cmp0, label %bb2, label %bb9
+
+bb2: ; preds = %bb
+ %tmp2 = sext i1 %cmp1 to i32
+ %tmp3 = add i32 %tmp2, %tmp
+ br i1 %cmp1, label %bb9, label %bb7
+
+bb7: ; preds = %bb5
+ store i32 %tmp3, i32 addrspace(1)* %out
+ br label %bb9
+
+bb9: ; preds = %bb8, %bb4
+ ret void
+}
+
+; SI-LABEL: {{^}}uniform_loop:
+; SI: {{^}}[[LOOP_LABEL:[A-Z0-9_a-z]+]]:
+; FIXME: We need to teach SIFixSGPRCopies about uniform branches so we
+; get s_add_i32 here.
+; SI: v_add_i32_e32 [[I:v[0-9]+]], vcc, -1, v{{[0-9]+}}
+; SI: v_cmp_ne_i32_e32 vcc, 0, [[I]]
+; SI: s_and_b64 vcc, exec, vcc
+; SI: s_cbranch_vccnz [[LOOP_LABEL]]
+; SI: s_endpgm
+define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [0, %entry], [%i.i, %loop]
+ %i.i = add i32 %i, 1
+ %cmp = icmp eq i32 %a, %i.i
+ br i1 %cmp, label %done, label %loop
+
+done:
+ ret void
+}
+
+; Test uniform and divergent.
+
+; SI-LABEL: {{^}}uniform_inside_divergent:
+; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
+; SI: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: s_cmp_lg_i32 {{s[0-9]+}}, 0
+; SI: s_cbranch_scc1 [[ENDIF_LABEL]]
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ %u_cmp = icmp eq i32 %cond, 0
+ br i1 %u_cmp, label %if_uniform, label %endif
+
+if_uniform:
+ store i32 1, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; SI-LABEL: {{^}}divergent_inside_uniform:
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+; SI: [[ENDIF_LABEL]]:
+; SI: s_endpgm
+define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %u_cmp = icmp eq i32 %cond, 0
+ br i1 %u_cmp, label %if, label %endif
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %d_cmp = icmp ult i32 %tid, 16
+ br i1 %d_cmp, label %if_uniform, label %endif
+
+if_uniform:
+ store i32 1, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; SI-LABEL: {{^}}divergent_if_uniform_if:
+; SI: v_cmp_eq_i32_e32 vcc, 0, v0
+; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+; SI: s_or_b64 exec, exec, [[MASK]]
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
+; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; SI: buffer_store_dword [[TWO]]
+; SI: [[EXIT]]:
+; SI: s_endpgm
+define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
+ %d_cmp = icmp eq i32 %tid, 0
+ br i1 %d_cmp, label %if, label %endif
+
+if:
+ store i32 1, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ %u_cmp = icmp eq i32 %cond, 0
+ br i1 %u_cmp, label %if_uniform, label %exit
+
+if_uniform:
+ store i32 2, i32 addrspace(1)* %out
+ br label %exit
+
+exit:
+ ret void
+}
+
+; The conditions of the branches in the two blocks are
+; uniform. MachineCSE replaces the second condition with the inverse of
+; the first, leaving an scc use in a different block than the one in
+; which it was defined.
+
+; SI-LABEL: {{^}}cse_uniform_condition_different_blocks:
+; SI: s_load_dword [[COND:s[0-9]+]]
+; SI: s_cmp_lt_i32 [[COND]], 1
+; SI: s_cbranch_scc1 BB[[FNNUM:[0-9]+]]_3
+
+; SI: BB#1:
+; SI-NOT: cmp
+; SI: buffer_load_dword
+; SI: buffer_store_dword
+; SI: s_cbranch_scc1 BB[[FNNUM]]_3
+
+; SI: BB[[FNNUM]]_3:
+; SI: s_endpgm
+define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %tmp1 = icmp sgt i32 %cond, 0
+ br i1 %tmp1, label %bb2, label %bb9
+
+bb2: ; preds = %bb
+ %tmp3 = load volatile i32, i32 addrspace(1)* undef
+ store volatile i32 0, i32 addrspace(1)* undef
+ %tmp9 = icmp sle i32 %cond, 0
+ br i1 %tmp9, label %bb9, label %bb7
+
+bb7:                                              ; preds = %bb2
+ store i32 %tmp3, i32 addrspace(1)* %out
+ br label %bb9
+
+bb9:                                              ; preds = %bb7, %bb2, %bb
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/uniform-crash.ll b/test/CodeGen/AMDGPU/uniform-crash.ll
new file mode 100644
index 000000000000..cfbb2af58677
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uniform-crash.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}icmp_2_users:
+; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1
+; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]]
+; GCN: [[LABEL]]:
+; GCN-NEXT: s_endpgm
+define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+main_body:
+ %0 = icmp sgt i32 %cond, 0
+ %1 = sext i1 %0 to i32
+ br i1 %0, label %IF, label %ENDIF
+
+IF:
+ store i32 %1, i32 addrspace(1)* %out
+ br label %ENDIF
+
+ENDIF: ; preds = %IF, %main_body
+ ret void
+}
+
+; GCN-LABEL: {{^}}fix_sgpr_live_ranges_crash:
+; GCN: s_cbranch_scc1 [[BB0:[A-Z0-9_]+]]
+; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]:
+; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN: {{^}}[[BB0]]:
+define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) {
+bb:
+ %cnd = trunc i32 %arg to i1
+ br i1 %cnd, label %bb2, label %bb5
+
+bb2: ; preds = %bb
+ %tmp = mul i32 10, %arg1
+ br label %bb3
+
+bb3: ; preds = %bb3, %bb2
+ %val = load volatile i32, i32 addrspace(2)* undef
+ %tmp4 = icmp eq i32 %val, %arg1
+ br i1 %tmp4, label %bb5, label %bb3
+
+bb5: ; preds = %bb3, %bb
+ %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #1
+ %tmp10 = icmp ult i32 %tmp6, %arg
+ br i1 %tmp10, label %bb11, label %bb12
+
+bb11: ; preds = %bb11, %bb5
+ br i1 undef, label %bb11, label %bb12
+
+bb12: ; preds = %bb11, %bb5
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.y() #1
+
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
new file mode 100644
index 000000000000..9f2f0d67d245
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s
+
+; Test a simple uniform loop that lives inside non-uniform control flow.
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cmp_ne_i32_e32 vcc, 0
+; CHECK: s_and_saveexec_b64
+
+; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
+; CHECK: s_and_b64 vcc, exec, vcc
+; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
+
+; CHECK: s_endpgm
+define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
+main_body:
+ %cc = icmp eq i32 %p, 0
+ br i1 %cc, label %out, label %loop_body
+
+loop_body:
+ %counter = phi i32 [ 0, %main_body ], [ %incr, %loop_body ]
+
+ ; Prevent the loop from being optimized out
+ call void asm sideeffect "", "" ()
+
+ %incr = add i32 %counter, 1
+ %lc = icmp sge i32 %incr, 1000
+ br i1 %lc, label %out, label %loop_body
+
+out:
+ ret void
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: s_and_saveexec_b64
+;CHECK: s_xor_b64
+;CHECK-NEXT: s_cbranch_execz
+define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+main_body:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %cc = icmp eq i32 %tid, 0
+ br i1 %cc, label %done1, label %if
+
+if:
+ %cmp = icmp eq i32 %a, 0
+ br i1 %cmp, label %done0, label %loop_body
+
+loop_body:
+ %counter = phi i32 [ 0, %if ], [ 0, %done0 ], [ %incr, %loop_body ]
+
+ ; Prevent the loop from being optimized out
+ call void asm sideeffect "", "" ()
+
+ %incr = add i32 %counter, 1
+ %lc = icmp sge i32 %incr, 1000
+ br i1 %lc, label %done1, label %loop_body
+
+done0:
+ %cmp0 = icmp eq i32 %b, 0
+ br i1 %cmp0, label %done1, label %loop_body
+
+done1:
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll
new file mode 100644
index 000000000000..941f4c601e34
--- /dev/null
+++ b/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=amdgcn -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
+; RUN: llc -march=r600 -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+
+; Should not crash when the processor is not recognized and the
+; wavefront size feature is not set.
+
+; Should also not mix fragments of r600 and GCN ISA.
+
+; ERROR: 'unknown' is not a recognized processor for this target (ignoring processor)
+
+; GCN-NOT: MOV
+; GCN: buffer_store_dword
+; GCN: ScratchSize: 8{{$}}
+
+; R600: MOV
+define void @foo() {
+ %alloca = alloca i32, align 4
+ store volatile i32 0, i32* %alloca
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 87b925a24a04..0435ed4d5525 100644
--- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
declare float @llvm.fma.f32(float, float, float) #1
declare double @llvm.fma.f64(double, double, double) #1
declare float @llvm.fmuladd.f32(float, float, float) #1
-declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1
+declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #1
; GCN-LABEL: {{^}}test_sgpr_use_twice_binop:
@@ -28,10 +28,10 @@ define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a)
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
; GCN: buffer_store_dword [[RESULT]]
@@ -68,10 +68,10 @@ define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
@@ -82,10 +82,10 @@ define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, floa
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
-; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
@@ -107,7 +107,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, fl
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
; GCN: s_load_dword [[SGPR:s[0-9]+]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
; GCN: buffer_store_dword [[RESULT]]
define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
@@ -118,11 +118,11 @@ define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, fl
; Don't use fma since fma c, x, y is canonicalized to fma x, c, y
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
; GCN: s_load_dword [[SGPR:s[0-9]+]]
-; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
+; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 {
- %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1
- store i32 %fma, i32 addrspace(1)* %out, align 4
+define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
+ %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1
+ store float %val, float addrspace(1)* %out, align 4
ret void
}
@@ -227,7 +227,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out
; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS1]], [[SGPR0]], [[VK0]]
; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
@@ -254,7 +254,7 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
; Same zero component is re-used for half of each immediate.
; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
-; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
+; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
; GCN: buffer_store_dwordx2 [[RESULT0]]
; GCN: buffer_store_dwordx2 [[RESULT1]]
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll
index c368c5aaf7dc..ca6bff4f6fc8 100644
--- a/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
; SI-LABEL: {{^}}v_cnd_nan_nosgpr:
; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}}
@@ -9,7 +9,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; All nan values are converted to 0xffffffff
; SI: s_endpgm
define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
- %idx = call i32 @llvm.r600.read.tidig.x() #1
+ %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
%f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
%f = load float, float addrspace(1)* %fptr
%setcc = icmp ne i32 %c, 0
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index a4eaec3403c9..027c63817903 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -2,19 +2,19 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
; GCN-LABEL: {{^}}mac_vvv:
-; GCN: buffer_load_dword [[A:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:4
-; GCN: buffer_load_dword [[C:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:8
+; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
+; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
; GCN: buffer_store_dword [[C]]
-define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) {
+define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
- %a = load float, float addrspace(1)* %in
- %b = load float, float addrspace(1)* %b_ptr
- %c = load float, float addrspace(1)* %c_ptr
+ %a = load volatile float, float addrspace(1)* %in
+ %b = load volatile float, float addrspace(1)* %b_ptr
+ %c = load volatile float, float addrspace(1)* %c_ptr
%tmp0 = fmul float %a, %b
%tmp1 = fadd float %tmp0, %c
@@ -24,8 +24,8 @@ entry:
; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
; GCN-NOT: v_mac_f32
-; GCN: v_mad_f32 v{{[0-9]}}, 0.5, s{{[0-9]+}}, 0.5
-define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) {
+; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
+define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
entry:
%tmp0 = fmul float 0.5, %in
%tmp1 = fadd float %tmp0, 0.5
@@ -36,7 +36,7 @@ entry:
; GCN-LABEL: {{^}}mad_vvs:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) {
+define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
@@ -51,7 +51,7 @@ entry:
; GCN-LABEL: {{^}}mac_ssv:
; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) {
+define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
entry:
%c = load float, float addrspace(1)* %in
@@ -64,18 +64,18 @@ entry:
; GCN-LABEL: {{^}}mac_mad_same_add:
; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
-define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) {
+define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
%b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
%c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
%d_ptr = getelementptr float, float addrspace(1)* %in, i32 3
%e_ptr = getelementptr float, float addrspace(1)* %in, i32 4
- %a = load float, float addrspace(1)* %in
- %b = load float, float addrspace(1)* %b_ptr
- %c = load float, float addrspace(1)* %c_ptr
- %d = load float, float addrspace(1)* %d_ptr
- %e = load float, float addrspace(1)* %e_ptr
+ %a = load volatile float, float addrspace(1)* %in
+ %b = load volatile float, float addrspace(1)* %b_ptr
+ %c = load volatile float, float addrspace(1)* %c_ptr
+ %d = load volatile float, float addrspace(1)* %d_ptr
+ %e = load volatile float, float addrspace(1)* %e_ptr
%tmp0 = fmul float %a, %b
%tmp1 = fadd float %tmp0, %c
@@ -104,6 +104,46 @@ entry:
%b = load float, float addrspace(1)* %b_ptr
%c = load float, float addrspace(1)* %c_ptr
+ %neg_a = fsub float -0.0, %a
+ %tmp0 = fmul float %neg_a, %b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}unsafe_mad_sub0_src0:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+define void @unsafe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
+ %neg_a = fsub float 0.0, %a
+ %tmp0 = fmul float %neg_a, %b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}safe_mad_sub0_src0:
+; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
+; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
+define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
%neg_a = fsub float 0.0, %a
%tmp0 = fmul float %neg_a, %b
%tmp1 = fadd float %tmp0, %c
@@ -124,6 +164,26 @@ entry:
%b = load float, float addrspace(1)* %b_ptr
%c = load float, float addrspace(1)* %c_ptr
+ %neg_b = fsub float -0.0, %b
+ %tmp0 = fmul float %a, %neg_b
+ %tmp1 = fadd float %tmp0, %c
+
+ store float %tmp1, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}unsafe_mad_sub0_src1:
+; GCN-NOT: v_mac_f32
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+define void @unsafe_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+entry:
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
+
+ %a = load float, float addrspace(1)* %in
+ %b = load float, float addrspace(1)* %b_ptr
+ %c = load float, float addrspace(1)* %c_ptr
+
%neg_b = fsub float 0.0, %b
%tmp0 = fmul float %a, %neg_b
%tmp1 = fadd float %tmp0, %c
@@ -144,7 +204,7 @@ entry:
%b = load float, float addrspace(1)* %b_ptr
%c = load float, float addrspace(1)* %c_ptr
- %neg_c = fsub float 0.0, %c
+ %neg_c = fsub float -0.0, %c
%tmp0 = fmul float %a, %b
%tmp1 = fadd float %tmp0, %neg_c
@@ -152,4 +212,5 @@ entry:
ret void
}
-attributes #0 = { "true" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "unsafe-fp-math"="false" }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index 1cbefba60c95..02a1278f76cb 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -1,15 +1,16 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI-LABEL: @test_if
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
+define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
- switch i32 %a, label %default [
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+ switch i32 %tid, label %default [
i32 0, label %case0
i32 1, label %case1
]
@@ -25,7 +26,7 @@ case1:
br label %end
default:
- %cmp8 = icmp eq i32 %a, 2
+ %cmp8 = icmp eq i32 %tid, 2
%arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
br i1 %cmp8, label %if, label %else
@@ -54,7 +55,7 @@ end:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
; SI: s_endpgm
define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
br i1 %is.0, label %store, label %exit
@@ -71,22 +72,22 @@ exit:
; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: s_cbranch_execz BB2_2
+; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
-; SI: ; BB#1:
; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; SI: BB2_3:
+; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: v_cmp_eq_i32_e32 vcc,
-; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
-; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
-; SI: s_cbranch_execnz BB2_3
+; SI-DAG: s_and_b64 vcc, exec, vcc
+; SI: s_cbranch_vccz [[LABEL_LOOP]]
+; SI: [[LABEL_EXIT]]:
+; SI: s_endpgm
define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
%limit = add i32 %tid, 64
br i1 %is.0, label %loop, label %exit
@@ -114,7 +115,7 @@ exit:
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
-; SI: s_cbranch_execz BB3_2
+; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
; Initialize inner condition to false
; SI: ; BB#1:
@@ -122,7 +123,7 @@ exit:
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
; Clear exec bits for workitems that load -1s
-; SI: BB3_3:
+; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
@@ -130,29 +131,29 @@ exit:
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]]
-; SI: s_cbranch_execz BB3_5
+; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
-; SI: BB#4:
+; SI: BB#3:
; SI: buffer_store_dword
-; SI: v_cmp_ge_i64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]]
-; SI: s_or_b64 [[COND_STATE]], [[CMP]], [[COND_STATE]]
+; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
+; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
-; SI: BB3_5:
+; SI: [[LABEL_FLOW]]:
; SI: s_or_b64 exec, exec, [[ORNEG2]]
-; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[COND_STATE]]
+; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz BB3_3
+; SI: s_cbranch_execnz [[LABEL_LOOP]]
-; SI: BB#6
+; SI: BB#5
; SI: s_or_b64 exec, exec, [[COND_STATE]]
-; SI: BB3_2:
+; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm
define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
- %tmp = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp4 = sext i32 %tmp to i64
%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
%tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll
index 6f3b4847fbdf..c151ca9ef9b4 100644
--- a/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
diff --git a/test/CodeGen/AMDGPU/vector-extract-insert.ll b/test/CodeGen/AMDGPU/vector-extract-insert.ll
new file mode 100644
index 000000000000..2d39f82e2499
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Test that when extracting from an insertelement at the same unknown
+; vector index, the dynamic indexing is folded away.
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+; No dynamic indexing required
+; GCN-LABEL: {{^}}extract_insert_same_dynelt_v4i32:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd{{$}}
+; GCN-NOT: buffer_load_dword
+; GCN-NOT: [[VAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOT: [[VVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
+ %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+ %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx
+ %extract = extractelement <4 x i32> %insert, i32 %idx
+ store i32 %extract, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_insert_different_dynelt_v4i32:
+; GCN: buffer_load_dwordx4
+; GCN: v_movreld_b32
+; GCN: v_movrels_b32
+; GCN: buffer_store_dword v
+define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
+ %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+ %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx0
+ %extract = extractelement <4 x i32> %insert, i32 %idx1
+ store i32 %extract, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_insert_same_elt2_v4i32:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd{{$}}
+; GCN-NOT: buffer_load_dword
+; GCN-NOT: [[VAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOT: [[VVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
+ %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %id.ext
+ %vec = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in
+ %insert = insertelement <4 x i32> %vec, i32 %val, i32 %idx
+ %extract = extractelement <4 x i32> %insert, i32 %idx
+ store i32 %extract, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_insert_same_dynelt_v4f32:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd{{$}}
+; GCN-NOT: buffer_load_dword
+; GCN-NOT: [[VAL]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOT: [[VVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext
+ %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %id.ext
+ %vec = load volatile <4 x float>, <4 x float> addrspace(1)* %gep.in
+ %insert = insertelement <4 x float> %vec, float %val, i32 %idx
+ %extract = extractelement <4 x float> %insert, i32 %idx
+ store float %extract, float addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index cd7c78f408dd..6c33bc98c605 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s
-; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
; This ends up using all 256 registers and requires register
; scavenging, which will fail to find an unused register.
@@ -11,24 +11,33 @@
; FIXME: The same register is initialized to 0 for every spill.
-declare i32 @llvm.r600.read.tgid.x() #1
-declare i32 @llvm.r600.read.tgid.y() #1
-declare i32 @llvm.r600.read.tgid.z() #1
-
; GCN-LABEL: {{^}}spill_vgpr_compute:
-; GCN: s_mov_b32 s16, s3
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x80f000
-; VI-NEXT: s_mov_b32 s15, 0x800000
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: workitem_private_segment_byte_size = 1024
+
+; GCN-NOT: flat_scr
+
+; GCNMESA-DAG: s_mov_b32 s16, s3
+; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCNMESA-DAG: s_mov_b32 s14, -1
+; SIMESA-DAG: s_mov_b32 s15, 0xe8f000
+; VIMESA-DAG: s_mov_b32 s15, 0xe80000
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024
@@ -175,7 +184,8 @@ bb12: ; preds = %bb145, %bb
%tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ]
%tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ]
%tmp142 = bitcast float %tmp95 to i32
- %tmp143 = icmp sgt i32 %tmp142, 125
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tmp143 = icmp sgt i32 %tmp142, %tid
br i1 %tmp143, label %bb144, label %bb145
bb144: ; preds = %bb12
@@ -581,5 +591,7 @@ bb145: ; preds = %bb12
br label %bb12
}
+declare i32 @llvm.r600.read.tidig.x() #1
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 16abb89bb0b8..7d97777a78bd 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -8,22 +8,25 @@
; intermediate register class copies.
; FIXME: The same register is initialized to 0 for every spill.
+; FIXME: The unused arguments are removed
; GCN-LABEL: {{^}}main:
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x80f000
-; VI-NEXT: s_mov_b32 s15, 0x800000
+; GCN-DAG: s_mov_b32 s13, s12
+; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s18, -1
+; SI-DAG: s_mov_b32 s19, 0xe8f000
+; VI-DAG: s_mov_b32 s19, 0xe80000
-; s12 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; s13 is offset system SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024
-define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
+define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb:
%tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i64 0, i64 0
%tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0
@@ -172,7 +175,8 @@ bb24: ; preds = %bb157, %bb
%tmp152 = phi float [ 0.000000e+00, %bb ], [ %tmp417, %bb157 ]
%tmp153 = phi float [ 0.000000e+00, %bb ], [ %tmp418, %bb157 ]
%tmp154 = bitcast float %tmp107 to i32
- %tmp155 = icmp sgt i32 %tmp154, 125
+ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
+ %tmp155 = icmp sgt i32 %tmp154, %tid
br i1 %tmp155, label %bb156, label %bb157
bb156: ; preds = %bb24
@@ -487,7 +491,9 @@ declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
!0 = !{!1, !1, i64 0, i32 1}
diff --git a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
new file mode 100644
index 000000000000..ad7521a3da9b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
@@ -0,0 +1,24 @@
+; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: :1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget
+
+declare float @llvm.amdgcn.rsq.legacy(float) #0
+
+define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+ %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "foo.cl", directory: "/dev/null")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocation(line: 1, column: 42, scope: !5)
+!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0)
diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll
index 2bfe1b2bd6ec..ae8ec58270c1 100644
--- a/test/CodeGen/AMDGPU/vop-shrink.ll
+++ b/test/CodeGen/AMDGPU/vop-shrink.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; Test that we correctly commute a sub instruction
; FUNC-LABEL: {{^}}sub_rev:
@@ -10,7 +10,7 @@
define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
entry:
- %vgpr = call i32 @llvm.r600.read.tidig.x() #1
+ %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1
%tmp = icmp eq i32 %cond, 0
br i1 %tmp, label %if, label %else
@@ -37,7 +37,7 @@ endif: ; preds = %else, %if
; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
define void @add_fold(float addrspace(1)* %out) {
entry:
- %tmp = call i32 @llvm.r600.read.tidig.x()
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = uitofp i32 %tmp to float
%tmp2 = fadd float %tmp1, 1.024000e+03
store float %tmp2, float addrspace(1)* %out
@@ -45,7 +45,7 @@ entry:
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone }
attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index dc1f1ea11b01..0cd706b642d7 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -1,29 +1,29 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-;FUNC-LABEL: {{^}}test_select_v2i32:
+; FUNC-LABEL: {{^}}test_select_v2i32:
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-;SI: v_cndmask_b32_e64
-;SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
-define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
entry:
- %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
- %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
- %cmp = icmp ne <2 x i32> %0, %1
- %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1
+ %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
+ %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
+ %cmp = icmp sgt <2 x i32> %load0, %load1
+ %result = select <2 x i1> %cmp, <2 x i32> %val, <2 x i32> %load0
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
}
-;FUNC-LABEL: {{^}}test_select_v2f32:
+; FUNC-LABEL: {{^}}test_select_v2f32:
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI: v_cndmask_b32_e64
;SI: v_cndmask_b32_e32
@@ -40,24 +40,24 @@ entry:
;FUNC-LABEL: {{^}}test_select_v4i32:
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[4].X
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].W
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
+; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
; FIXME: The shrinking does not happen on tonga
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
-;SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
-define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
entry:
- %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
- %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
- %cmp = icmp ne <4 x i32> %0, %1
- %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1
+ %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
+ %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
+ %cmp = icmp sgt <4 x i32> %load0, %load1
+ %result = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %load0
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll
index 107e84b33be9..265774180a7f 100644
--- a/test/CodeGen/AMDGPU/wait.ll
+++ b/test/CodeGen/AMDGPU/wait.ll
@@ -11,14 +11,14 @@
; DEFAULT: exp
; DEFAULT: s_waitcnt lgkmcnt(0)
; DEFAULT: s_endpgm
-define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
+define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) {
main_body:
%tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
%tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
%tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
%tmp12 = extractelement <4 x float> %tmp11, i32 0
%tmp13 = extractelement <4 x float> %tmp11, i32 1
- call void @llvm.AMDGPU.barrier.global() #1
+ call void @llvm.amdgcn.s.barrier() #1
%tmp14 = extractelement <4 x float> %tmp11, i32 2
; %tmp15 = extractelement <4 x float> %tmp11, i32 3
%tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
@@ -45,8 +45,8 @@ main_body:
; ILPMAX: s_waitcnt vmcnt(0)
; ILPMAX: s_endpgm
-define void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
-byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
+define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
+byval, i32 inreg, i32 inreg, i32, i32, i32, i32) {
main_body:
%11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
%12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0
@@ -71,14 +71,13 @@ main_body:
; Function Attrs: convergent nounwind
-declare void @llvm.AMDGPU.barrier.global() #1
+declare void @llvm.amdgcn.s.barrier() #1
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-attributes #0 = { "ShaderType"="1" }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/waitcnt-flat.ll b/test/CodeGen/AMDGPU/waitcnt-flat.ll
new file mode 100644
index 000000000000..38dbf2794fc5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=GCN %s
+
+; If flat_store_dword and flat_load_dword use different registers for the data
+; operand, this test is not broken. It just means it is no longer testing
+; for the original bug.
+
+; GCN: {{^}}test:
+; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
+; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
+; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
+define void @test(i32 addrspace(1)* %out, i32 %in) {
+ store volatile i32 0, i32 addrspace(1)* %out
+ %val = load volatile i32, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll
new file mode 100644
index 000000000000..23b0ffd5b3da
--- /dev/null
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -0,0 +1,366 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
+
+; Check that WQM isn't triggered by image load/store intrinsics.
+;
+;CHECK-LABEL: {{^}}test1:
+;CHECK-NOT: s_wqm
+define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret <4 x float> %tex
+}
+
+; Check that WQM is triggered by image samples and left untouched for loads...
+;
+;CHECK-LABEL: {{^}}test2:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: image_sample
+;CHECK-NOT: exec
+;CHECK: _load_dword v0,
+define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+main_body:
+ %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
+ %c.3 = extractelement <4 x i32> %c.2, i32 0
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
+ %data = load float, float addrspace(1)* %gep
+ ret float %data
+}
+
+; ... but disabled for stores (and, in this simple case, not re-enabled).
+;
+;CHECK-LABEL: {{^}}test3:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: image_sample
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK-NOT: exec
+;CHECK: .size test3
+define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+main_body:
+ %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+ %tex.2 = extractelement <4 x i32> %tex.1, i32 0
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
+ %wr = extractelement <4 x float> %tex, i32 1
+ store float %wr, float addrspace(1)* %gep
+ ret <4 x float> %tex
+}
+
+; Check that WQM is re-enabled when required.
+;
+;CHECK-LABEL: {{^}}test4:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
+define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
+main_body:
+ %c.1 = mul i32 %c, %d
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
+ store float %data, float addrspace(1)* %gep
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %tex
+}
+
+; Check a case of one branch of an if-else requiring WQM, the other requiring
+; exact.
+;
+; Note: In this particular case, the save-and-restore could be avoided if the
+; analysis understood that the two branches of the if-else are mutually
+; exclusive.
+;
+;CHECK-LABEL: {{^}}test_control_flow_0:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: %ELSE
+;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: store
+;CHECK: s_mov_b64 exec, [[SAVED]]
+;CHECK: %IF
+;CHECK: image_sample
+define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
+main_body:
+ %cmp = icmp eq i32 %z, 0
+ br i1 %cmp, label %IF, label %ELSE
+
+IF:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %data.if = extractelement <4 x float> %tex, i32 0
+ br label %END
+
+ELSE:
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
+ store float %data, float addrspace(1)* %gep
+ br label %END
+
+END:
+ %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
+ ret float %r
+}
+
+; Reverse branch order compared to the previous test.
+;
+;CHECK-LABEL: {{^}}test_control_flow_1:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: %IF
+;CHECK: image_sample
+;CHECK: %Flow
+;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
+;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
+;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
+;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
+;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
+;CHECK-NEXT: ; BB#3: ; %ELSE
+;CHECK: store_dword
+;CHECK: [[END_BB]]: ; %END
+;CHECK: s_or_b64 exec, exec,
+;CHECK: v_mov_b32_e32 v0
+;CHECK: ; return
+define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
+main_body:
+ %cmp = icmp eq i32 %z, 0
+ br i1 %cmp, label %ELSE, label %IF
+
+IF:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %data.if = extractelement <4 x float> %tex, i32 0
+ br label %END
+
+ELSE:
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
+ store float %data, float addrspace(1)* %gep
+ br label %END
+
+END:
+ %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
+ ret float %r
+}
+
+; Check that branch conditions are properly marked as needing WQM...
+;
+;CHECK-LABEL: {{^}}test_control_flow_2:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: load
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: v_cmp
+define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+main_body:
+ %idx.1 = extractelement <3 x i32> %idx, i32 0
+ %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
+ %data.1 = extractelement <2 x float> %data, i32 0
+ store float %data.1, float addrspace(1)* %gep.1
+
+ ; The load that determines the branch (and should therefore be WQM) is
+ ; surrounded by stores that require disabled WQM.
+ %idx.2 = extractelement <3 x i32> %idx, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
+ %z = load float, float addrspace(1)* %gep.2
+
+ %idx.3 = extractelement <3 x i32> %idx, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
+ %data.3 = extractelement <2 x float> %data, i32 1
+ store float %data.3, float addrspace(1)* %gep.3
+
+ %cc = fcmp ogt float %z, 0.0
+ br i1 %cc, label %IF, label %ELSE
+
+IF:
+ %coord.IF = mul i32 %coord, 3
+ br label %END
+
+ELSE:
+ %coord.ELSE = mul i32 %coord, 4
+ br label %END
+
+END:
+ %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %tex
+}
+
+; ... but only if they really do need it.
+;
+;CHECK-LABEL: {{^}}test_control_flow_3:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: image_sample
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: store
+;CHECK: load
+;CHECK: store
+;CHECK: v_cmp
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+main_body:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex.1 = extractelement <4 x float> %tex, i32 0
+
+ %idx.1 = extractelement <3 x i32> %idx, i32 0
+ %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
+ %data.1 = extractelement <2 x float> %data, i32 0
+ store float %data.1, float addrspace(1)* %gep.1
+
+ %idx.2 = extractelement <3 x i32> %idx, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
+ %z = load float, float addrspace(1)* %gep.2
+
+ %idx.3 = extractelement <3 x i32> %idx, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
+ %data.3 = extractelement <2 x float> %data, i32 1
+ store float %data.3, float addrspace(1)* %gep.3
+
+ %cc = fcmp ogt float %z, 0.0
+ br i1 %cc, label %IF, label %ELSE
+
+IF:
+ %tex.IF = fmul float %tex.1, 3.0
+ br label %END
+
+ELSE:
+ %tex.ELSE = fmul float %tex.1, 4.0
+ br label %END
+
+END:
+ %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
+ ret float %tex.END
+}
+
+; Another test that previously failed because of incorrect terminator handling.
+;
+;CHECK-LABEL: {{^}}test_control_flow_4:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: %IF
+;CHECK: load
+;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: store
+;CHECK: s_mov_b64 exec, [[SAVE]]
+;CHECK: %END
+;CHECK: image_sample
+define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
+main_body:
+ %cond = icmp eq i32 %y, 0
+ br i1 %cond, label %IF, label %END
+
+IF:
+ %data = load float, float addrspace(1)* %ptr
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
+ store float %data, float addrspace(1)* %gep
+ br label %END
+
+END:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %tex
+}
+
+; Kill is performed in WQM mode so that uniform kill behaves correctly ...
+;
+;CHECK-LABEL: {{^}}test_kill_0:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: image_sample
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;SI: buffer_store_dword
+;VI: flat_store_dword
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: v_cmpx_
+;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;SI: buffer_store_dword
+;VI: flat_store_dword
+;CHECK: s_mov_b64 exec, [[SAVE]]
+;CHECK: image_sample
+define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
+main_body:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %idx.0 = extractelement <2 x i32> %idx, i32 0
+ %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
+ %data.0 = extractelement <2 x float> %data, i32 0
+ store float %data.0, float addrspace(1)* %gep.0
+
+ call void @llvm.AMDGPU.kill(float %z)
+
+ %idx.1 = extractelement <2 x i32> %idx, i32 1
+ %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
+ %data.1 = extractelement <2 x float> %data, i32 1
+ store float %data.1, float addrspace(1)* %gep.1
+
+ %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %out = fadd <4 x float> %tex, %tex2
+
+ ret <4 x float> %out
+}
+
+; ... but only if WQM is necessary.
+;
+; CHECK-LABEL: {{^}}test_kill_1:
+; CHECK-NEXT: ; %main_body
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: image_sample
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; SI: buffer_store_dword
+; VI: flat_store_dword
+; CHECK-NOT: wqm
+; CHECK: v_cmpx_
+define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
+main_body:
+ %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
+ store float %data, float addrspace(1)* %gep
+
+ call void @llvm.AMDGPU.kill(float %z)
+
+ ret <4 x float> %tex
+}
+
+; Check prolog shaders.
+;
+; CHECK-LABEL: {{^}}test_prolog_1:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_add_f32_e32 v0,
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
+main_body:
+ %s = fadd float %a, %b
+ ret float %s
+}
+
+declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+
+declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+
+declare void @llvm.AMDGPU.kill(float)
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind readnone }
+attributes #4 = { "amdgpu-ps-wqm-outputs" }
diff --git a/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
new file mode 100644
index 000000000000..deac809f9b05
--- /dev/null
+++ b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
@@ -0,0 +1,22 @@
+; XFAIL: *
+; REQUIRES: asserts
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
+
+; write_register doesn't prevent us from illegally trying to write a
+; vgpr value into a scalar register, but I don't think there's much we
+; can do to avoid this.
+
+declare void @llvm.write_register.i32(metadata, i32) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+
+define void @write_vgpr_into_sgpr() {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ call void @llvm.write_register.i32(metadata !0, i32 %tid)
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!0 = !{!"exec_lo"}
diff --git a/test/CodeGen/AMDGPU/write_register.ll b/test/CodeGen/AMDGPU/write_register.ll
new file mode 100644
index 000000000000..88660ba6ec6a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/write_register.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s
+
+declare void @llvm.write_register.i32(metadata, i32) #0
+declare void @llvm.write_register.i64(metadata, i64) #0
+
+; CHECK-LABEL: {{^}}test_write_m0:
+define void @test_write_m0(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !0, i32 0)
+ call void @llvm.write_register.i32(metadata !0, i32 -1)
+ call void @llvm.write_register.i32(metadata !0, i32 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_exec:
+; CHECK: s_mov_b64 exec, 0
+; CHECK: s_mov_b64 exec, -1
+; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_write_exec(i64 %val) #0 {
+ call void @llvm.write_register.i64(metadata !1, i64 0)
+ call void @llvm.write_register.i64(metadata !1, i64 -1)
+ call void @llvm.write_register.i64(metadata !1, i64 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_flat_scratch:
+; CHECK: s_mov_b64 flat_scratch, 0
+; CHECK: s_mov_b64 flat_scratch, -1
+; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_write_flat_scratch(i64 %val) #0 {
+ call void @llvm.write_register.i64(metadata !2, i64 0)
+ call void @llvm.write_register.i64(metadata !2, i64 -1)
+ call void @llvm.write_register.i64(metadata !2, i64 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_flat_scratch_lo:
+; CHECK: s_mov_b32 flat_scratch_lo, 0
+; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}}
+define void @test_write_flat_scratch_lo(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !3, i32 0)
+ call void @llvm.write_register.i32(metadata !3, i32 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_flat_scratch_hi:
+; CHECK: s_mov_b32 flat_scratch_hi, 0
+; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}}
+define void @test_write_flat_scratch_hi(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !4, i32 0)
+ call void @llvm.write_register.i32(metadata !4, i32 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_exec_lo:
+; CHECK: s_mov_b32 exec_lo, 0
+; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}}
+define void @test_write_exec_lo(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !5, i32 0)
+ call void @llvm.write_register.i32(metadata !5, i32 %val)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_write_exec_hi:
+; CHECK: s_mov_b32 exec_hi, 0
+; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}}
+define void @test_write_exec_hi(i32 %val) #0 {
+ call void @llvm.write_register.i32(metadata !6, i32 0)
+ call void @llvm.write_register.i32(metadata !6, i32 %val)
+ ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{!"m0"}
+!1 = !{!"exec"}
+!2 = !{!"flat_scratch"}
+!3 = !{!"flat_scratch_lo"}
+!4 = !{!"flat_scratch_hi"}
+!5 = !{!"exec_lo"}
+!6 = !{!"exec_hi"}
diff --git a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
index 8b383e4c393d..7f6b80459047 100644
--- a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
+++ b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
@@ -71,7 +71,7 @@ declare i32 @llvm.r600.read.global.size.y() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.global.size.z() #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
!opencl.kernels = !{!0, !1, !2}
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 655655d92f08..202170d6e229 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -64,8 +64,8 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
; SI: buffer_store_byte [[RESULT]]
define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
- %a = load i1, i1 addrspace(1)* %in0
- %b = load i1, i1 addrspace(1)* %in1
+ %a = load volatile i1, i1 addrspace(1)* %in0
+ %b = load volatile i1, i1 addrspace(1)* %in1
%xor = xor i1 %a, %b
store i1 %xor, i1 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll
index 35ddf2b0a465..c3b76da5f778 100644
--- a/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
; R600: {{^}}test:
; R600: MEM_RAT_CACHELESS STORE_RAW
diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
new file mode 100644
index 000000000000..842c30b40df2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}zext_or_operand_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dword v[[LD32:[0-9]+]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %ld.64 = load volatile i64, i64 addrspace(1)* %in0
+ %ld.32 = load volatile i32, i32 addrspace(1)* %in1
+ %ext = zext i32 %ld.32 to i64
+ %or = or i64 %ld.64, %ext
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}zext_or_operand_commute_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dword v[[LD32:[0-9]+]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN-NOT: v[[HI]]
+; GCN-NOT: _or_
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %ld.64 = load volatile i64, i64 addrspace(1)* %in0
+ %ld.32 = load volatile i32, i32 addrspace(1)* %in1
+ %ext = zext i32 %ld.32 to i64
+ %or = or i64 %ext, %ld.64
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-08-31-LSDA-Name.ll b/test/CodeGen/ARM/2009-08-31-LSDA-Name.ll
index d402c16ccacb..0a9fd8955746 100644
--- a/test/CodeGen/ARM/2009-08-31-LSDA-Name.ll
+++ b/test/CodeGen/ARM/2009-08-31-LSDA-Name.ll
@@ -1,8 +1,10 @@
; RUN: llc < %s -mtriple=arm-apple-darwin9 -march=arm | FileCheck %s
-; CHECK: .cfi_lsda 16, [[LABEL:.*]]
-; CHECK: .long [[LABEL]]-
-; CHECK: [[LABEL]]:
+; CHECK: ldr r0, [[CPI_PERSONALITY:[A-Za-z0-9_]+]]
+; CHECK: ldr r0, [[CPI_LSDA:[A-Za-z0-9_]+]]
+; CHECK: [[CPI_LSDA]]:
+; CHECK: .long [[LSDA_LABEL:[A-Za-z0-9_]+]]-
+; CHECK: [[LSDA_LABEL]]:
; CHECK: .byte 255 @ @LPStart Encoding = omit
%struct.A = type { i32* }
diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll
index 613694f091d1..3e687322de4f 100644
--- a/test/CodeGen/ARM/2009-10-16-Scope.ll
+++ b/test/CodeGen/ARM/2009-10-16-Scope.ll
@@ -22,10 +22,11 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
declare i32 @foo(i32) ssp
+!llvm.dbg.cu = !{!0}
!0 = !DILocation(line: 5, column: 2, scope: !1)
!1 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !2)
-!2 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !3)
-!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: 0, file: !8, retainedTypes: !9)
+!2 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scope: !3)
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: FullDebug, file: !8, retainedTypes: !9)
!4 = !DILocalVariable(name: "count_", line: 5, scope: !5, file: !3, type: !6)
!5 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !1)
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
index 1341830b4a4b..089cff82bc5d 100644
--- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
+++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
@@ -15,10 +15,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!15}
!0 = !DILocalVariable(name: "b", line: 93, arg: 2, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "__addvsi3", linkageName: "__addvsi3", line: 94, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !12, scope: null, type: !4)
+!1 = distinct !DISubprogram(name: "__addvsi3", linkageName: "__addvsi3", line: 94, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !12, scope: null, type: !4)
!2 = !DIFile(filename: "libgcc2.c", directory: "/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc")
!12 = !DIFile(filename: "libgcc2.c", directory: "/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc")
-!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", isOptimized: true, emissionKind: 0, file: !12, enums: !13, retainedTypes: !13, subprograms: !14)
+!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", isOptimized: true, emissionKind: FullDebug, file: !12, enums: !13, retainedTypes: !13)
!4 = !DISubroutineType(types: !5)
!5 = !{!6, !6, !6}
!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "SItype", line: 152, file: !12, baseType: !8)
@@ -28,5 +28,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!10 = distinct !DILexicalBlock(line: 94, column: 0, file: !12, scope: !1)
!11 = !DILocation(line: 100, scope: !10)
!13 = !{}
-!14 = !{!1}
!15 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
index 38b352c473b1..5ae1b6626f1a 100644
--- a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
+++ b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
@@ -44,13 +44,12 @@ declare i32 @x1() optsize
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
-!llvm.dbg.lv.fn = !{!0, !8, !10, !12}
-!llvm.dbg.gv = !{!14}
+!llvm.dbg.cu = !{!3}
!0 = !DILocalVariable(name: "buf", line: 4, arg: 1, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "x0", linkageName: "x0", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !26, scope: null, type: !4)
+!1 = distinct !DISubprogram(name: "x0", linkageName: "x0", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !26, scope: null, type: !4)
!2 = !DIFile(filename: "t.c", directory: "/private/tmp")
-!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 2.0", isOptimized: true, file: !26)
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 2.0", isOptimized: true, file: !26, globals: !{!14})
!4 = !DISubroutineType(types: !5)
!5 = !{null}
!6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, file: !26, scope: !2, baseType: !7)
diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index 14ddb59b5387..a90ea137cb7c 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
@@ -1,6 +1,9 @@
; RUN: llc -O0 -mtriple=arm-apple-darwin < %s | grep DW_OP_breg
; Use DW_OP_breg in variable's location expression if the variable is in a stack slot.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64"
+target triple = "arm-apple-darwin"
+
%struct.SVal = type { i8*, i32 }
define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp !dbg !17 {
@@ -78,9 +81,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.module.flags = !{!49}
!0 = !DISubprogram(name: "SVal", line: 11, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !48, scope: !1, type: !14)
-!1 = !DICompositeType(tag: DW_TAG_structure_type, name: "SVal", line: 1, size: 128, align: 64, file: !48, elements: !4)
+!1 = !DICompositeType(tag: DW_TAG_structure_type, name: "SVal", line: 1, size: 64, align: 64, file: !48, elements: !4)
!2 = !DIFile(filename: "small.cc", directory: "/Users/manav/R8248330")
-!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 1, file: !48, enums: !47, retainedTypes: !47, subprograms: !46, globals: !47, imports: !47)
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: FullDebug, file: !48, enums: !47, retainedTypes: !47, globals: !47, imports: !47)
!4 = !{!5, !7, !0, !9}
!5 = !DIDerivedType(tag: DW_TAG_member, name: "Data", line: 7, size: 64, align: 64, file: !48, scope: !1, baseType: !6)
!6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !48, baseType: null)
@@ -93,24 +96,24 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!14 = !DISubroutineType(types: !15)
!15 = !{null, !12}
-!16 = distinct !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !1, type: !14)
-!17 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !18)
+!16 = distinct !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !48, scope: !1, type: !14)
+!17 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !48, scope: !2, type: !18)
!18 = !DISubroutineType(types: !19)
!19 = !{!13, !13, !1}
-!20 = distinct !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !48, scope: !2, type: !21)
+!20 = distinct !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !48, scope: !2, type: !21)
!21 = !DISubroutineType(types: !22)
!22 = !{!13}
!23 = !DILocalVariable(name: "i", line: 16, arg: 1, scope: !17, file: !2, type: !13)
!24 = !DILocation(line: 16, scope: !17)
!25 = !DILocalVariable(name: "location", line: 16, arg: 2, scope: !17, file: !2, type: !26)
-!26 = !DIDerivedType(tag: DW_TAG_reference_type, name: "SVal", size: 64, align: 64, file: !48, scope: !2, baseType: !1)
+!26 = !DIDerivedType(tag: DW_TAG_reference_type, name: "SVal", size: 32, align: 32, file: !48, scope: !2, baseType: !1)
!27 = !DILocation(line: 17, scope: !28)
!28 = distinct !DILexicalBlock(line: 16, column: 0, file: !2, scope: !17)
!29 = !DILocation(line: 18, scope: !28)
!30 = !DILocation(line: 20, scope: !28)
!31 = !DILocalVariable(name: "this", line: 11, arg: 1, scope: !16, file: !2, type: !32)
-!32 = !DIDerivedType(tag: DW_TAG_const_type, size: 64, align: 64, flags: DIFlagArtificial, file: !48, scope: !2, baseType: !33)
-!33 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !48, scope: !2, baseType: !1)
+!32 = !DIDerivedType(tag: DW_TAG_const_type, flags: DIFlagArtificial, file: !48, scope: !2, baseType: !33)
+!33 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, file: !48, scope: !2, baseType: !1)
!34 = !DILocation(line: 11, scope: !16)
!35 = !DILocation(line: 11, scope: !36)
!36 = distinct !DILexicalBlock(line: 11, column: 0, file: !48, scope: !37)
@@ -123,7 +126,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!43 = !DILocation(line: 26, scope: !39)
!44 = !DILocalVariable(name: "k", line: 26, scope: !39, file: !2, type: !13)
!45 = !DILocation(line: 27, scope: !39)
-!46 = !{!16, !17, !20}
!47 = !{}
!48 = !DIFile(filename: "small.cc", directory: "/Users/manav/R8248330")
!49 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/ARM/2010-11-29-PrologueBug.ll b/test/CodeGen/ARM/2010-11-29-PrologueBug.ll
index 4179d8c99d6a..15e17b4fd0f1 100644
--- a/test/CodeGen/ARM/2010-11-29-PrologueBug.ll
+++ b/test/CodeGen/ARM/2010-11-29-PrologueBug.ll
@@ -1,24 +1,17 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB2
+; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
; rdar://8690640
define i32* @t(i32* %x) nounwind {
entry:
-; ARM-LABEL: t:
-; ARM: push
-; ARM: mov r7, sp
-; ARM: bl _foo
-; ARM: bl _foo
-; ARM: bl _foo
-; ARM: pop {r7, pc}
+; CHECK-LABEL: t:
+; CHECK: push
+; CHECK: mov r7, sp
+; CHECK: bl _foo
+; CHECK: bl _foo
+; CHECK: bl _foo
+; CHECK: pop {r7, pc}
-; THUMB2-LABEL: t:
-; THUMB2: push
-; THUMB2: mov r7, sp
-; THUMB2: blx _foo
-; THUMB2: blx _foo
-; THUMB2: blx _foo
-; THUMB2: pop
%0 = tail call i32* @foo(i32* %x) nounwind
%1 = tail call i32* @foo(i32* %0) nounwind
%2 = tail call i32* @foo(i32* %1) nounwind
diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
index d5eed8b6a2c4..8c6cf00cc3a0 100644
--- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
@@ -78,16 +78,16 @@ entry:
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!49}
-!0 = distinct !DISubprogram(name: "get1", linkageName: "get1", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 4, file: !47, scope: !1, type: !3, variables: !42)
+!0 = distinct !DISubprogram(name: "get1", linkageName: "get1", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 4, file: !47, scope: !1, type: !3, variables: !42)
!1 = !DIFile(filename: "foo.c", directory: "/tmp/")
-!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", isOptimized: true, emissionKind: 0, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48)
+!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", isOptimized: true, emissionKind: FullDebug, file: !47, enums: !48, retainedTypes: !48, globals: !41, imports: !48)
!3 = !DISubroutineType(types: !4)
!4 = !{!5, !5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "_Bool", size: 8, align: 8, encoding: DW_ATE_boolean)
-!6 = distinct !DISubprogram(name: "get2", linkageName: "get2", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !47, scope: !1, type: !3, variables: !43)
-!7 = distinct !DISubprogram(name: "get3", linkageName: "get3", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 10, file: !47, scope: !1, type: !3, variables: !44)
-!8 = distinct !DISubprogram(name: "get4", linkageName: "get4", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 13, file: !47, scope: !1, type: !3, variables: !45)
-!9 = distinct !DISubprogram(name: "get5", linkageName: "get5", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 16, file: !47, scope: !1, type: !3, variables: !46)
+!6 = distinct !DISubprogram(name: "get2", linkageName: "get2", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 7, file: !47, scope: !1, type: !3, variables: !43)
+!7 = distinct !DISubprogram(name: "get3", linkageName: "get3", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 10, file: !47, scope: !1, type: !3, variables: !44)
+!8 = distinct !DISubprogram(name: "get4", linkageName: "get4", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 13, file: !47, scope: !1, type: !3, variables: !45)
+!9 = distinct !DISubprogram(name: "get5", linkageName: "get5", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 16, file: !47, scope: !1, type: !3, variables: !46)
!10 = !DILocalVariable(name: "a", line: 4, arg: 1, scope: !0, file: !1, type: !5)
!11 = !DILocalVariable(name: "b", line: 4, scope: !12, file: !1, type: !5)
!12 = distinct !DILexicalBlock(line: 4, column: 0, file: !47, scope: !0)
@@ -118,7 +118,6 @@ entry:
!37 = !DILocation(line: 13, scope: !26)
!38 = !DILocation(line: 16, scope: !9)
!39 = !DILocation(line: 16, scope: !29)
-!40 = !{!0, !6, !7, !8, !9}
!41 = !{!13, !14, !15, !16, !17}
!42 = !{!10, !11}
!43 = !{!18, !19}
diff --git a/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll b/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll
index f17884e0fa41..91adba41b1ac 100644
--- a/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll
+++ b/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll
@@ -3,7 +3,7 @@
; CHECK: _f
; CHECK-NOT: ands
; CHECK: cmp
-; CHECK: blxle _g
+; CHECK: blle _g
define i32 @f(i32 %a, i32 %b) nounwind ssp {
entry:
diff --git a/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll b/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll
index 864e2917b7bb..852038147b26 100644
--- a/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll
+++ b/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll
@@ -3,7 +3,7 @@
; CHECK: _f
; CHECK: adds
; CHECK-NOT: cmp
-; CHECK: blxeq _h
+; CHECK: bleq _h
define i32 @f(i32 %a, i32 %b) nounwind ssp {
entry:
@@ -22,7 +22,7 @@ if.end: ; preds = %if.then, %entry
; CHECK: _g
; CHECK: orrs
; CHECK-NOT: cmp
-; CHECK: blxeq _h
+; CHECK: bleq _h
define i32 @g(i32 %a, i32 %b) nounwind ssp {
entry:
diff --git a/test/CodeGen/ARM/2011-04-26-SchedTweak.ll b/test/CodeGen/ARM/2011-04-26-SchedTweak.ll
index ee6d79c39f2f..3c5579acf6ae 100644
--- a/test/CodeGen/ARM/2011-04-26-SchedTweak.ll
+++ b/test/CodeGen/ARM/2011-04-26-SchedTweak.ll
@@ -38,9 +38,9 @@ bb:
bb1:
; CHECK: %bb1
; CHECK-NOT: umull
-; CHECK: blx _Get
+; CHECK: bl _Get
; CHECK: umull
-; CHECK: blx _foo
+; CHECK: bl _foo
%tmp5 = load i32, i32* %block_size, align 4
%tmp6 = load i32, i32* %block_count, align 4
%tmp7 = call %struct.FF* @Get() nounwind
diff --git a/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
index 12cdd04b7bb7..f4b93ca74fc0 100644
--- a/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
+++ b/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
@@ -29,7 +29,7 @@ target triple = "thumbv7-apple-darwin10"
@"\01_fnmatch.initial" = external constant %union.__mbstate_t, align 4
; CHECK: _fnmatch
-; CHECK: blx _fnmatch1
+; CHECK: bl _fnmatch1
define i32 @"\01_fnmatch"(i8* %pattern, i8* %string, i32 %flags) nounwind optsize {
entry:
diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
index 3d82e706862c..4da4fd4c926e 100644
--- a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
@@ -73,16 +73,16 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!49}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: true, emissionKind: 1, file: !47, enums: !48, retainedTypes: !48, subprograms: !40, globals: !41, imports: !48)
-!1 = distinct !DISubprogram(name: "get1", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !47, scope: !2, type: !3, variables: !42)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: true, emissionKind: FullDebug, file: !47, enums: !48, retainedTypes: !48, globals: !41, imports: !48)
+!1 = distinct !DISubprogram(name: "get1", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 5, file: !47, scope: !2, type: !3, variables: !42)
!2 = !DIFile(filename: "ss3.c", directory: "/private/tmp")
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!6 = distinct !DISubprogram(name: "get2", line: 8, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 8, file: !47, scope: !2, type: !3, variables: !43)
-!7 = distinct !DISubprogram(name: "get3", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !47, scope: !2, type: !3, variables: !44)
-!8 = distinct !DISubprogram(name: "get4", line: 14, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 14, file: !47, scope: !2, type: !3, variables: !45)
-!9 = distinct !DISubprogram(name: "get5", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 17, file: !47, scope: !2, type: !3, variables: !46)
+!6 = distinct !DISubprogram(name: "get2", line: 8, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 8, file: !47, scope: !2, type: !3, variables: !43)
+!7 = distinct !DISubprogram(name: "get3", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 11, file: !47, scope: !2, type: !3, variables: !44)
+!8 = distinct !DISubprogram(name: "get4", line: 14, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 14, file: !47, scope: !2, type: !3, variables: !45)
+!9 = distinct !DISubprogram(name: "get5", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 17, file: !47, scope: !2, type: !3, variables: !46)
!10 = !DILocalVariable(name: "a", line: 5, arg: 1, scope: !1, file: !2, type: !5)
!11 = !DILocalVariable(name: "b", line: 5, scope: !12, file: !2, type: !5)
!12 = distinct !DILexicalBlock(line: 5, column: 19, file: !47, scope: !1)
@@ -110,7 +110,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!37 = !DILocation(line: 14, column: 32, scope: !21)
!38 = !DILocation(line: 17, column: 16, scope: !9)
!39 = !DILocation(line: 17, column: 32, scope: !29)
-!40 = !{!1, !6, !7, !8, !9}
!41 = !{!25, !26}
!42 = !{!10, !11}
!43 = !{!13, !14}
diff --git a/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll b/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
index 9163166177c1..e70f973d6a76 100644
--- a/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
+++ b/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a9 -jump-table-density=40 | FileCheck %s
; Test that ldmia_ret preserves implicit operands for return values.
;
; This CFG is reduced from a benchmark miscompile. With current
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
index b47247c5454f..a928543d7cf2 100644
--- a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
@@ -2,9 +2,10 @@
; Check for error message:
; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
+; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
define hidden void @f(i32* %corr, i32 %order) nounwind ssp {
- tail call void asm sideeffect "vst1.s32 { ${1:q}, ${2:q} }, [$0]", "r,{q0},{q1}"(i32* %corr, <2 x i64>* undef, <2 x i64>* undef) nounwind, !srcloc !0
+ tail call void asm sideeffect "vst1.s32 { ${1:q}, ${2:q} }, [$0]", "r,{q0},{q1}"(i32* %corr, <2 x i64>* undef, i32 %order) nounwind, !srcloc !0
ret void
}
diff --git a/test/CodeGen/ARM/2016-05-01-RegScavengerAssert.ll b/test/CodeGen/ARM/2016-05-01-RegScavengerAssert.ll
new file mode 100644
index 000000000000..15e0dad18232
--- /dev/null
+++ b/test/CodeGen/ARM/2016-05-01-RegScavengerAssert.ll
@@ -0,0 +1,192 @@
+; RUN: llc < %s
+; This test has around 4000 bytes of local variables and also stresses register allocation
+; enough to force register scavenging. It checks that the stack is treated as "BigStack" and
+; that spill slots are therefore reserved; if not, the register scavenger will assert.
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
+target triple = "thumbv7--linux-android"
+
+%struct.r = type { i32 (...)**, [10 x [9 x float]], [10 x [9 x float]], [101 x [9 x float]], [101 x [9 x float]], i32, i32, i32, i32, i32, [8 x [2 x i32]], [432 x float], [432 x i32], [10 x i8*], [10 x i8*], [10 x i32], [10 x i32], [10 x i32], [10 x i32], [10 x i32], [10 x i32], i32, i32, i32, i32, float, float, i32, i32, [9 x float], float*, float }
+
+define void @foo(%struct.r* %this, float* %srcR, float* %srcC, float* %tempPntsX, float* %tY, float* %ms, float* %sX, float* %sY, i32* dereferenceable(4) %num, float* %tm, i32 %SR, i32 %lev, i8* %tdata, i32 %siW, i32 %pyw, i32 %pyh, i8* %sdata) #0 align 2 {
+entry:
+ %sFV = alloca [49 x float], align 4
+ %tFV = alloca [49 x float], align 4
+ %TIM = alloca [9 x float], align 4
+ %sort_tmp = alloca [432 x float], align 4
+ %msDiffs = alloca [432 x float], align 4
+ %TM.sroa.0.0.copyload = load float, float* %tm, align 4
+ %TM.sroa.8.0.copyload = load float, float* null, align 4
+ %TM.sroa.9.0..sroa_idx813 = getelementptr inbounds float, float* %tm, i32 6
+ %TM.sroa.9.0.copyload = load float, float* %TM.sroa.9.0..sroa_idx813, align 4
+ %TM.sroa.11.0.copyload = load float, float* undef, align 4
+ br i1 undef, label %for.body.lr.ph, label %if.then343
+
+for.body.lr.ph: ; preds = %entry
+ %arrayidx8 = getelementptr inbounds %struct.r, %struct.r* %this, i32 0, i32 1, i32 %lev, i32 0
+ %arrayidx12 = getelementptr inbounds %struct.r, %struct.r* %this, i32 0, i32 1, i32 %lev, i32 6
+ %arrayidx15 = getelementptr inbounds %struct.r, %struct.r* %this, i32 0, i32 1, i32 %lev, i32 4
+ %arrayidx20 = getelementptr inbounds %struct.r, %struct.r* %this, i32 0, i32 1, i32 %lev, i32 7
+ %arrayidx24 = getelementptr inbounds %struct.r, %struct.r* %this, i32 0, i32 2, i32 %lev, i32 0
+ %arrayidx28 = getelementptr inbounds %struct.r, %struct.r* %this, i32 0, i32 2, i32 %lev, i32 6
+ %arrayidx32 = getelementptr inbounds %struct.r, %struct.r* %this, i32 0, i32 2, i32 %lev, i32 4
+ %arrayidx36 = getelementptr inbounds %struct.r, %struct.r* %this, i32 0, i32 2, i32 %lev, i32 7
+ %arrayidx84 = getelementptr inbounds [9 x float], [9 x float]* %TIM, i32 0, i32 6
+ %arrayidx92 = getelementptr inbounds [9 x float], [9 x float]* %TIM, i32 0, i32 7
+ %add116 = add nsw i32 %pyh, 15
+ br label %for.body
+
+for.body: ; preds = %for.cond.cleanup40, %for.body.lr.ph
+ %arrayidx.phi = phi float* [ %sX, %for.body.lr.ph ], [ %arrayidx.inc, %for.cond.cleanup40 ]
+ %arrayidx4.phi = phi float* [ %sY, %for.body.lr.ph ], [ %arrayidx4.inc, %for.cond.cleanup40 ]
+ %0 = load float, float* %arrayidx.phi, align 4
+ %1 = load float, float* %arrayidx4.phi, align 4
+ %2 = load float, float* %arrayidx12, align 4
+ %add = fadd fast float 0.000000e+00, %2
+ %3 = load float, float* %arrayidx20, align 4
+ %add21 = fadd fast float 0.000000e+00, %3
+ %mul3.i = fmul fast float %add21, %TM.sroa.8.0.copyload
+ %add.i = fadd fast float 0.000000e+00, %TM.sroa.11.0.copyload
+ %add5.i = fadd fast float %add.i, %mul3.i
+ %conv6.i = fdiv fast float 1.000000e+00, %add5.i
+ %mul8.i = fmul fast float %add, %TM.sroa.0.0.copyload
+ %add11.i = fadd fast float %mul8.i, %TM.sroa.9.0.copyload
+ %add13.i = fadd fast float %add11.i, 0.000000e+00
+ %4 = load float, float* %arrayidx24, align 4
+ %mul14.i = fmul fast float %add13.i, %4
+ %mul25 = fmul fast float %mul14.i, %conv6.i
+ %add29 = fadd fast float %mul25, 0.000000e+00
+ %arrayidx.inc = getelementptr float, float* %arrayidx.phi, i32 1
+ %arrayidx4.inc = getelementptr float, float* %arrayidx4.phi, i32 1
+ %conv64.1 = sitofp i32 undef to float
+ %conv64.6 = sitofp i32 undef to float
+ br label %for.body41
+
+for.cond.cleanup40: ; preds = %for.body41
+ %call = call fast float undef(%struct.r* nonnull %this, float* undef, i32 49)
+ br label %for.body
+
+for.body41: ; preds = %for.cond.cleanup56.for.body41_crit_edge, %for.body
+ %5 = phi float [ 0.000000e+00, %for.body ], [ %.pre, %for.cond.cleanup56.for.body41_crit_edge ]
+ %sFVData.0840 = phi float* [ undef, %for.body ], [ undef, %for.cond.cleanup56.for.body41_crit_edge ]
+ %dx.0838 = phi i32 [ -3, %for.body ], [ undef, %for.cond.cleanup56.for.body41_crit_edge ]
+ %conv42 = sitofp i32 %dx.0838 to float
+ %add43 = fadd fast float %conv42, %add29
+ %conv44 = fptosi float %add43 to i32
+ %conv48 = sitofp i32 %conv44 to float
+ %mul49 = fmul fast float %5, %conv48
+ %add53 = fadd fast float %mul49, 0.000000e+00
+ %conv111 = fptosi float undef to i32
+ %cond = select i1 undef, i32 %conv111, i32 -16
+ %cond.add116 = select i1 undef, i32 %cond, i32 %add116
+ %cmp132 = icmp sgt i32 undef, -16
+ %cond137 = select i1 %cmp132, i32 undef, i32 -16
+ %cond153 = select i1 undef, i32 %cond137, i32 undef
+ %add.ptr = getelementptr inbounds i8, i8* %sdata, i32 %cond153
+ %mul154 = mul nsw i32 %cond.add116, %siW
+ %add.ptr155 = getelementptr inbounds i8, i8* %add.ptr, i32 %mul154
+ %6 = load i8, i8* %add.ptr155, align 1
+ %conv157 = uitofp i8 %6 to float
+ %incdec.ptr = getelementptr inbounds float, float* %sFVData.0840, i32 1
+ store float %conv157, float* %sFVData.0840, align 4
+ %7 = load float, float* %arrayidx15, align 4
+ %mul65.1 = fmul fast float %7, %conv64.1
+ %8 = load float, float* %arrayidx20, align 4
+ %add69.1 = fadd fast float %mul65.1, %8
+ %conv78.1 = fdiv fast float 1.000000e+00, 0.000000e+00
+ %9 = load float, float* undef, align 4
+ %mul80.1 = fmul fast float %9, %add53
+ %10 = load float, float* undef, align 4
+ %mul82.1 = fmul fast float %10, %add69.1
+ %add83.1 = fadd fast float %mul82.1, %mul80.1
+ %11 = load float, float* %arrayidx84, align 4
+ %add85.1 = fadd fast float %add83.1, %11
+ %mul86.1 = fmul fast float %add85.1, %conv78.1
+ %12 = load float, float* %arrayidx92, align 4
+ %add93.1 = fadd fast float 0.000000e+00, %12
+ %mul94.1 = fmul fast float %add93.1, %conv78.1
+ %13 = load float, float* %arrayidx24, align 4
+ %mul98.1 = fmul fast float %mul86.1, %13
+ %14 = load float, float* %arrayidx28, align 4
+ %add102.1 = fadd fast float %mul98.1, %14
+ %15 = load float, float* %arrayidx32, align 4
+ %mul106.1 = fmul fast float %mul94.1, %15
+ %16 = load float, float* %arrayidx36, align 4
+ %add110.1 = fadd fast float %mul106.1, %16
+ %conv111.1 = fptosi float %add102.1 to i32
+ %conv112.1 = fptosi float %add110.1 to i32
+ %cond.1 = select i1 undef, i32 %conv111.1, i32 -16
+ %cond.add116.1 = select i1 undef, i32 %cond.1, i32 %add116
+ %cond137.1 = select i1 undef, i32 %conv112.1, i32 -16
+ %cond153.1 = select i1 undef, i32 %cond137.1, i32 undef
+ %add.ptr.1 = getelementptr inbounds i8, i8* %sdata, i32 %cond153.1
+ %mul154.1 = mul nsw i32 %cond.add116.1, %siW
+ %add.ptr155.1 = getelementptr inbounds i8, i8* %add.ptr.1, i32 %mul154.1
+ %17 = load i8, i8* %add.ptr155.1, align 1
+ %conv157.1 = uitofp i8 %17 to float
+ %incdec.ptr.1 = getelementptr inbounds float, float* %sFVData.0840, i32 2
+ store float %conv157.1, float* %incdec.ptr, align 4
+ %conv112.2 = fptosi float undef to i32
+ %cond137.2 = select i1 undef, i32 %conv112.2, i32 -16
+ %cond153.2 = select i1 undef, i32 %cond137.2, i32 undef
+ %add.ptr.2 = getelementptr inbounds i8, i8* %sdata, i32 %cond153.2
+ %add.ptr155.2 = getelementptr inbounds i8, i8* %add.ptr.2, i32 0
+ %18 = load i8, i8* %add.ptr155.2, align 1
+ %conv157.2 = uitofp i8 %18 to float
+ %incdec.ptr.2 = getelementptr inbounds float, float* %sFVData.0840, i32 3
+ store float %conv157.2, float* %incdec.ptr.1, align 4
+ %cmp132.3 = icmp sgt i32 undef, -16
+ %cond137.3 = select i1 %cmp132.3, i32 undef, i32 -16
+ %cond153.3 = select i1 undef, i32 %cond137.3, i32 undef
+ %add.ptr.3 = getelementptr inbounds i8, i8* %sdata, i32 %cond153.3
+ %add.ptr155.3 = getelementptr inbounds i8, i8* %add.ptr.3, i32 0
+ %19 = load i8, i8* %add.ptr155.3, align 1
+ %conv157.3 = uitofp i8 %19 to float
+ store float %conv157.3, float* %incdec.ptr.2, align 4
+ %incdec.ptr.5 = getelementptr inbounds float, float* %sFVData.0840, i32 6
+ %20 = load float, float* %arrayidx15, align 4
+ %mul65.6 = fmul fast float %20, %conv64.6
+ %21 = load float, float* %arrayidx20, align 4
+ %add69.6 = fadd fast float %mul65.6, %21
+ %conv78.6 = fdiv fast float 1.000000e+00, 0.000000e+00
+ %22 = load float, float* undef, align 4
+ %mul82.6 = fmul fast float %22, %add69.6
+ %add83.6 = fadd fast float %mul82.6, 0.000000e+00
+ %23 = load float, float* %arrayidx84, align 4
+ %add85.6 = fadd fast float %add83.6, %23
+ %mul86.6 = fmul fast float %add85.6, %conv78.6
+ %24 = load float, float* %arrayidx24, align 4
+ %mul98.6 = fmul fast float %mul86.6, %24
+ %25 = load float, float* %arrayidx28, align 4
+ %add102.6 = fadd fast float %mul98.6, %25
+ %conv111.6 = fptosi float %add102.6 to i32
+ %conv112.6 = fptosi float undef to i32
+ %cond.6 = select i1 undef, i32 %conv111.6, i32 -16
+ %cond.add116.6 = select i1 undef, i32 %cond.6, i32 %add116
+ %cmp132.6 = icmp sgt i32 %conv112.6, -16
+ %cond137.6 = select i1 %cmp132.6, i32 %conv112.6, i32 -16
+ %cond153.6 = select i1 undef, i32 %cond137.6, i32 undef
+ %add.ptr.6 = getelementptr inbounds i8, i8* %sdata, i32 %cond153.6
+ %mul154.6 = mul nsw i32 %cond.add116.6, %siW
+ %add.ptr155.6 = getelementptr inbounds i8, i8* %add.ptr.6, i32 %mul154.6
+ %26 = load i8, i8* %add.ptr155.6, align 1
+ %conv157.6 = uitofp i8 %26 to float
+ store float %conv157.6, float* %incdec.ptr.5, align 4
+ %exitcond874 = icmp eq i32 %dx.0838, 3
+ br i1 %exitcond874, label %for.cond.cleanup40, label %for.cond.cleanup56.for.body41_crit_edge
+
+for.cond.cleanup56.for.body41_crit_edge: ; preds = %for.body41
+ %.pre = load float, float* %arrayidx8, align 4
+ br label %for.body41
+
+if.then343: ; preds = %entry
+ ret void
+}
+
+attributes #0 = { sspstrong uwtable "no-frame-pointer-elim"="false" "target-cpu"="cortex-a7" }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!3, !3, i64 0}
diff --git a/test/CodeGen/MIR/ARM/ARMLoadStoreDBG.mir b/test/CodeGen/ARM/ARMLoadStoreDBG.mir
index e351713dc290..b4f19ecc5610 100644
--- a/test/CodeGen/MIR/ARM/ARMLoadStoreDBG.mir
+++ b/test/CodeGen/ARM/ARMLoadStoreDBG.mir
@@ -1,11 +1,11 @@
-# RUN: llc -start-after machine-cp -stop-after=if-converter -mtriple=thumbv7 %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: llc -mtriple thumbv7 -verify-machineinstrs -run-pass arm-ldst-opt %s -o - | FileCheck %s
--- |
; ModuleID = '/Volumes/Data/llvm/test/CodeGen/ARM/sched-it-debug-nodes.ll'
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv7"
-
+
%struct.s = type opaque
-
+
; Function Attrs: nounwind
define arm_aapcscc i32 @f(%struct.s* %s, i32 %u, i8* %b, i32 %n) #0 !dbg !4 {
entry:
@@ -15,35 +15,34 @@
tail call void @llvm.dbg.value(metadata i32 %n, i64 0, metadata !21, metadata !27), !dbg !28
%cmp = icmp ult i32 %n, 4, !dbg !29
br i1 %cmp, label %return, label %if.end, !dbg !31
-
+
if.end: ; preds = %entry
tail call arm_aapcscc void @g(%struct.s* %s, i8* %b, i32 %n) #3, !dbg !32
br label %return, !dbg !33
-
+
return: ; preds = %if.end, %entry
%retval.0 = phi i32 [ 0, %if.end ], [ -1, %entry ]
ret i32 %retval.0, !dbg !34
}
-
+
declare arm_aapcscc void @g(%struct.s*, i8*, i32) #1
-
+
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
-
+
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
-
+
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24, !25}
!llvm.ident = !{!26}
-
- !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (llvm/trunk 237059)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (llvm/trunk 237059)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "<stdin>", directory: "/Users/compnerd/Source/llvm")
!2 = !{}
- !3 = !{!4}
- !4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 9, type: !5, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, variables: !17)
+ !4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 9, type: !5, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !17)
!5 = !DISubroutineType(types: !6)
!6 = !{!7, !8, !11, !12, !16}
!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -81,28 +80,30 @@ name: f
alignment: 1
exposesReturnsTwice: false
hasInlineAsm: false
+allVRegsAllocated: true
isSSA: false
tracksRegLiveness: true
tracksSubRegLiveness: false
-liveins:
+liveins:
- { reg: '%r0' }
+ - { reg: '%r1' }
- { reg: '%r2' }
- { reg: '%r3' }
-calleeSavedRegisters: [ '%lr', '%d8', '%d9', '%d10', '%d11', '%d12', '%d13',
- '%d14', '%d15', '%q4', '%q5', '%q6', '%q7', '%r4',
- '%r5', '%r6', '%r7', '%r8', '%r9', '%r10', '%r11',
- '%s16', '%s17', '%s18', '%s19', '%s20', '%s21',
- '%s22', '%s23', '%s24', '%s25', '%s26', '%s27',
- '%s28', '%s29', '%s30', '%s31', '%d8_d10', '%d9_d11',
- '%d10_d12', '%d11_d13', '%d12_d14', '%d13_d15',
- '%q4_q5', '%q5_q6', '%q6_q7', '%q4_q5_q6_q7', '%r4_r5',
- '%r6_r7', '%r8_r9', '%r10_r11', '%d8_d9_d10', '%d9_d10_d11',
- '%d10_d11_d12', '%d11_d12_d13', '%d12_d13_d14',
- '%d13_d14_d15', '%d8_d10_d12', '%d9_d11_d13', '%d10_d12_d14',
- '%d11_d13_d15', '%d8_d10_d12_d14', '%d9_d11_d13_d15',
- '%d9_d10', '%d11_d12', '%d13_d14', '%d9_d10_d11_d12',
+calleeSavedRegisters: [ '%lr', '%d8', '%d9', '%d10', '%d11', '%d12', '%d13',
+ '%d14', '%d15', '%q4', '%q5', '%q6', '%q7', '%r4',
+ '%r5', '%r6', '%r7', '%r8', '%r9', '%r10', '%r11',
+ '%s16', '%s17', '%s18', '%s19', '%s20', '%s21',
+ '%s22', '%s23', '%s24', '%s25', '%s26', '%s27',
+ '%s28', '%s29', '%s30', '%s31', '%d8_d10', '%d9_d11',
+ '%d10_d12', '%d11_d13', '%d12_d14', '%d13_d15',
+ '%q4_q5', '%q5_q6', '%q6_q7', '%q4_q5_q6_q7', '%r4_r5',
+ '%r6_r7', '%r8_r9', '%r10_r11', '%d8_d9_d10', '%d9_d10_d11',
+ '%d10_d11_d12', '%d11_d12_d13', '%d12_d13_d14',
+ '%d13_d14_d15', '%d8_d10_d12', '%d9_d11_d13', '%d10_d12_d14',
+ '%d11_d13_d15', '%d8_d10_d12_d14', '%d9_d11_d13_d15',
+ '%d9_d10', '%d11_d12', '%d13_d14', '%d9_d10_d11_d12',
'%d11_d12_d13_d14' ]
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -116,34 +117,32 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: '%bb.2.if.end'
- restorePoint: '%bb.2.if.end'
-stack:
+stack:
- { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '%lr' }
- { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '%r7' }
body: |
bb.0.entry:
successors: %bb.1, %bb.2.if.end
- liveins: %r0, %r2, %r3, %lr, %r7
-
+ liveins: %r0, %r1, %r2, %r3, %lr, %r7
+
DBG_VALUE debug-use %r0, debug-use _, !18, !27, debug-location !28
DBG_VALUE debug-use %r1, debug-use _, !19, !27, debug-location !28
DBG_VALUE debug-use %r2, debug-use _, !20, !27, debug-location !28
DBG_VALUE debug-use %r3, debug-use _, !21, !27, debug-location !28
t2CMPri %r3, 4, 14, _, implicit-def %cpsr, debug-location !31
t2Bcc %bb.2.if.end, 2, killed %cpsr
-
+
bb.1:
liveins: %lr, %r7
-
+
DBG_VALUE debug-use %r1, debug-use _, !19, !27, debug-location !28
%r0 = t2MOVi -1, 14, _, _
DBG_VALUE debug-use %r1, debug-use _, !19, !27, debug-location !28
tBX_RET 14, _, implicit %r0, debug-location !34
-
+
bb.2.if.end:
liveins: %r0, %r2, %r3, %r7, %lr
-
+
%sp = frame-setup t2STMDB_UPD %sp, 14, _, killed %r7, killed %lr
frame-setup CFI_INSTRUCTION .cfi_def_cfa_offset 8
frame-setup CFI_INSTRUCTION .cfi_offset %lr, -4
@@ -153,11 +152,11 @@ body: |
DBG_VALUE debug-use %r2, debug-use _, !20, !27, debug-location !28
DBG_VALUE debug-use %r3, debug-use _, !21, !27, debug-location !28
%r1 = COPY killed %r2, debug-location !32
+ DBG_VALUE debug-use %r1, debug-use _, !19, !27, debug-location !28
%r2 = COPY killed %r3, debug-location !32
tBL 14, _, @g, csr_aapcs, implicit-def dead %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit-def %sp, debug-location !32
%r0 = t2MOVi 0, 14, _, _
%sp = t2LDMIA_UPD %sp, 14, _, def %r7, def %lr
- DBG_VALUE debug-use %r1, debug-use _, !19, !27, debug-location !28
tBX_RET 14, _, implicit %r0, debug-location !34
# Verify that the DBG_VALUE is ignored.
# CHECK: %sp = t2LDMIA_RET %sp, 14, _, def %r7, def %pc, implicit %r0
diff --git a/test/CodeGen/ARM/Windows/builtin_longjmp.ll b/test/CodeGen/ARM/Windows/builtin_longjmp.ll
new file mode 100644
index 000000000000..52b6f301bb77
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/builtin_longjmp.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple thumbv7--windows-itanium -filetype asm -o - %s | FileCheck %s
+
+declare void @llvm.eh.sjlj.longjmp(i8*)
+
+define arm_aapcs_vfpcc void @test___builtin_longjump(i8* %b) {
+entry:
+ tail call void @llvm.eh.sjlj.longjmp(i8* %b)
+ unreachable
+}
+
+; CHECK: push.w {r11, lr}
+; CHECK: ldr.w r11, [r0]
+; CHECK: ldr.w sp, [r0, #8]
+; CHECK: ldr.w pc, [r0, #4]
+
diff --git a/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll b/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
index a314259e499e..d303e9da8604 100644
--- a/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
+++ b/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple thumbv7--windows-itanium -code-model large -filetype obj -o - %s \
+; RUN: llc -mtriple thumbv7--windows-itanium -code-model large -verify-machineinstrs -filetype obj -o - %s \
; RUN: | llvm-objdump -no-show-raw-insn -d - | FileCheck %s
; ModuleID = 'reduced.c'
diff --git a/test/CodeGen/ARM/Windows/chkstk.ll b/test/CodeGen/ARM/Windows/chkstk.ll
index cb787e14b5ba..330c1f458500 100644
--- a/test/CodeGen/ARM/Windows/chkstk.ll
+++ b/test/CodeGen/ARM/Windows/chkstk.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 %s -o - \
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -verify-machineinstrs %s -o - \
; RUN: | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s
-; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -code-model=large %s -o - \
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -code-model=large -verify-machineinstrs %s -o - \
; RUN: | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s
define arm_aapcs_vfpcc void @check_watermark() {
diff --git a/test/CodeGen/ARM/Windows/dbzchk.ll b/test/CodeGen/ARM/Windows/dbzchk.ll
new file mode 100644
index 000000000000..599a7cf094c4
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/dbzchk.ll
@@ -0,0 +1,192 @@
+; RUN: llc -mtriple thumbv7--windows-itanium -print-machineinstrs=expand-isel-pseudos -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s -check-prefix CHECK-DIV
+
+; int f(int n, int d) {
+; if (n / d)
+; return 1;
+; return 0;
+; }
+
+define arm_aapcs_vfpcc i32 @f(i32 %n, i32 %d) {
+entry:
+ %retval = alloca i32, align 4
+ %n.addr = alloca i32, align 4
+ %d.addr = alloca i32, align 4
+ store i32 %n, i32* %n.addr, align 4
+ store i32 %d, i32* %d.addr, align 4
+ %0 = load i32, i32* %n.addr, align 4
+ %1 = load i32, i32* %d.addr, align 4
+ %div = sdiv i32 %0, %1
+ %tobool = icmp ne i32 %div, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+ store i32 1, i32* %retval, align 4
+ br label %return
+
+if.end:
+ store i32 0, i32* %retval, align 4
+ br label %return
+
+return:
+ %2 = load i32, i32* %retval, align 4
+ ret i32 %2
+}
+
+; CHECK-DIV-DAG: BB#0
+; CHECK-DIV-DAG: Successors according to CFG: BB#5({{.*}}) BB#4
+; CHECK-DIV-DAG: BB#1
+; CHECK-DIV-DAG: Successors according to CFG: BB#3
+; CHECK-DIV-DAG: BB#2
+; CHECK-DIV-DAG: Successors according to CFG: BB#3
+; CHECK-DIV-DAG: BB#3
+; CHECK-DIV-DAG: BB#4
+; CHECK-DIV-DAG: Successors according to CFG: BB#1({{.*}}) BB#2
+; CHECK-DIV-DAG: BB#5
+
+; RUN: llc -mtriple thumbv7--windows-itanium -print-machineinstrs=expand-isel-pseudos -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s -check-prefix CHECK-MOD
+
+; int r;
+; int g(int l, int m) {
+; if (m <= 0)
+; return 0;
+; return (r = l % m);
+; }
+
+@r = common global i32 0, align 4
+
+define arm_aapcs_vfpcc i32 @g(i32 %l, i32 %m) {
+entry:
+ %cmp = icmp eq i32 %m, 0
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %rem = urem i32 %l, %m
+ store i32 %rem, i32* @r, align 4
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %rem, %if.end ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+; CHECK-MOD-DAG: BB#0
+; CHECK-MOD-DAG: Successors according to CFG: BB#2({{.*}}) BB#1
+; CHECK-MOD-DAG: BB#1
+; CHECK-MOD-DAG: Successors according to CFG: BB#4({{.*}}) BB#3
+; CHECK-MOD-DAG: BB#2
+; CHECK-MOD-DAG: BB#3
+; CHECK-MOD-DAG: Successors according to CFG: BB#2
+; CHECK-MOD-DAG: BB#4
+
+; RUN: llc -mtriple thumbv7--windows-itanium -print-machineinstrs=expand-isel-pseudos -verify-machineinstrs -filetype asm -o /dev/null %s 2>&1 | FileCheck %s -check-prefix CHECK-CFG
+; RUN: llc -mtriple thumbv7--windows-itanium -print-machineinstrs=expand-isel-pseudos -verify-machineinstrs -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-CFG-ASM
+
+; unsigned c;
+; extern unsigned long g(void);
+; int f(unsigned u, signed char b) {
+; if (b)
+; c = g() % u;
+; return c;
+; }
+
+@c = common global i32 0, align 4
+
+declare arm_aapcs_vfpcc i32 @i()
+
+define arm_aapcs_vfpcc i32 @h(i32 %u, i8 signext %b) #0 {
+entry:
+ %tobool = icmp eq i8 %b, 0
+ br i1 %tobool, label %entry.if.end_crit_edge, label %if.then
+
+entry.if.end_crit_edge:
+ %.pre = load i32, i32* @c, align 4
+ br label %if.end
+
+if.then:
+ %call = tail call arm_aapcs_vfpcc i32 @i()
+ %rem = urem i32 %call, %u
+ store i32 %rem, i32* @c, align 4
+ br label %if.end
+
+if.end:
+ %0 = phi i32 [ %.pre, %entry.if.end_crit_edge ], [ %rem, %if.then ]
+ ret i32 %0
+}
+
+attributes #0 = { optsize }
+
+; CHECK-CFG-DAG: BB#0
+; CHECK-CFG-DAG: t2Bcc <BB#2>
+; CHECK-CFG-DAG: t2B <BB#1>
+
+; CHECK-CFG-DAG: BB#1
+; CHECK-CFG-DAG: t2B <BB#3>
+
+; CHECK-CFG-DAG: BB#2
+; CHECK-CFG-DAG: tCBZ %vreg{{[0-9]}}, <BB#5>
+; CHECK-CFG-DAG: t2B <BB#4>
+
+; CHECK-CFG-DAG: BB#4
+
+; CHECK-CFG-DAG: BB#3
+; CHECK-CFG-DAG: tBX_RET
+
+; CHECK-CFG-DAG: BB#5
+; CHECK-CFG-DAG: t2UDF 249
+
+; CHECK-CFG-ASM-LABEL: h:
+; CHECK-CFG-ASM: cbz r{{[0-9]}}, .LBB2_2
+; CHECK-CFG-ASM: b .LBB2_4
+; CHECK-CFG-ASM-LABEL: .LBB2_2:
+; CHECK-CFG-ASM-NEXT: udf.w #249
+; CHECK-CFG-ASM-LABEL: .LBB2_4:
+; CHECK-CFG-ASM: bl __rt_udiv
+; CHECK-CFG-ASM: pop.w {{{.*}}, r11, pc}
+
+; RUN: llc -O0 -mtriple thumbv7--windows-itanium -verify-machineinstrs -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-WIN__DBZCHK
+
+; long k(void);
+; int l(void);
+; int j(int i) {
+; if (l() == -1)
+; return 0;
+; return k() % i;
+; }
+
+declare arm_aapcs_vfpcc i32 @k()
+declare arm_aapcs_vfpcc i32 @l()
+
+define arm_aapcs_vfpcc i32 @j(i32 %i) {
+entry:
+ %retval = alloca i32, align 4
+ %i.addr = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ %call = call arm_aapcs_vfpcc i32 @l()
+ %cmp = icmp eq i32 %call, -1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i32 0, i32* %retval, align 4
+ br label %return
+
+if.end:
+ %call1 = call arm_aapcs_vfpcc i32 @k()
+ %0 = load i32, i32* %i.addr, align 4
+ %rem = srem i32 %call1, %0
+ store i32 %rem, i32* %retval, align 4
+ br label %return
+
+return:
+ %1 = load i32, i32* %retval, align 4
+ ret i32 %1
+}
+
+; CHECK-WIN__DBZCHK-LABEL: j:
+; CHECK-WIN__DBZCHK: cbz r{{[0-7]}}, .LBB
+; CHECK-WIN__DBZCHK-NOT: cbz r8, .LBB
+; CHECK-WIN__DBZCHK-NOT: cbz r9, .LBB
+; CHECK-WIN__DBZCHK-NOT: cbz r10, .LBB
+; CHECK-WIN__DBZCHK-NOT: cbz r11, .LBB
+; CHECK-WIN__DBZCHK-NOT: cbz ip, .LBB
+; CHECK-WIN__DBZCHK-NOT: cbz lr, .LBB
+
diff --git a/test/CodeGen/ARM/Windows/division.ll b/test/CodeGen/ARM/Windows/division.ll
index b3ef9c6d278b..f4704ea7ff4b 100644
--- a/test/CodeGen/ARM/Windows/division.ll
+++ b/test/CodeGen/ARM/Windows/division.ll
@@ -7,8 +7,11 @@ entry:
ret i32 %div
}
-; CHECK-LABEL: sdiv32
-; CHECK: b __rt_sdiv
+; CHECK-LABEL: sdiv32:
+; CHECK: cbz r0
+; CHECK: b
+; CHECK: udf.w #249
+; CHECK: bl __rt_sdiv
define arm_aapcs_vfpcc i32 @udiv32(i32 %divisor, i32 %divident) {
entry:
@@ -17,7 +20,10 @@ entry:
}
; CHECK-LABEL: udiv32:
-; CHECK: b __rt_udiv
+; CHECK: cbz r0
+; CHECK: b
+; CHECK: udf.w #249
+; CHECK: bl __rt_udiv
define arm_aapcs_vfpcc i64 @sdiv64(i64 %divisor, i64 %divident) {
entry:
@@ -25,7 +31,11 @@ entry:
ret i64 %div
}
-; CHECK-LABEL: sdiv64
+; CHECK-LABEL: sdiv64:
+; CHECK: orr.w r4, r0, r1
+; CHECK-NEXT: cbz r4
+; CHECK: b
+; CHECK: udf.w #249
; CHECK: bl __rt_sdiv64
define arm_aapcs_vfpcc i64 @udiv64(i64 %divisor, i64 %divident) {
@@ -35,4 +45,9 @@ entry:
}
; CHECK-LABEL: udiv64:
+; CHECK: orr.w r4, r0, r1
+; CHECK-NEXT: cbz r4
+; CHECK: b
+; CHECK: udf.w #249
; CHECK: bl __rt_udiv64
+
diff --git a/test/CodeGen/ARM/Windows/dllexport.ll b/test/CodeGen/ARM/Windows/dllexport.ll
new file mode 100644
index 000000000000..27496208862e
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/dllexport.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple thumbv7--windows-itanium -filetype asm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-GNU
+; RUN: llc -mtriple thumbv7--windows-gnu -filetype asm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-GNU
+; RUN: llc -mtriple thumbv7--windows-msvc -filetype asm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-MSVC
+
+define void @f() {
+ ret void
+}
+
+define dllexport void @g() {
+ ret void
+}
+
+define dllexport void @h() unnamed_addr {
+ ret void
+}
+
+declare dllexport void @i()
+
+define linkonce_odr dllexport void @j() {
+ ret void
+}
+
+define linkonce_odr dllexport void @k() alwaysinline {
+ ret void
+}
+
+define weak_odr dllexport void @l() {
+ ret void
+}
+
+@m = dllexport global i32 0, align 4
+@n = dllexport unnamed_addr constant i32 0
+@o = common dllexport global i32 0, align 4
+@p = weak_odr dllexport global i32 0, align 4
+@q = weak_odr dllexport unnamed_addr constant i32 0
+
+@r = dllexport alias void (), void () * @f
+@s = dllexport alias void (), void () * @g
+@t = dllexport alias void (), void () * @f
+@u = weak_odr dllexport alias void (), void () * @g
+
+; CHECK: .section .drectve
+; CHECK-GNU-NOT: -export:f
+; CHECK-GNU: -export:g
+; CHECK-GNU-SAME: -export:h
+; CHECK-GNU-NOT: -export:i
+; CHECK-GNU-SAME: -export:j
+; CHECK-GNU-SAME: -export:k
+; CHECK-GNU-SAME: -export:l
+; CHECK-GNU-SAME: -export:m,data
+; CHECK-GNU-SAME: -export:n,data
+; CHECK-GNU-SAME: -export:o,data
+; CHECK-GNU-SAME: -export:p,data
+; CHECK-GNU-SAME: -export:q,data
+; CHECK-GNU-SAME: -export:r
+; CHECK-GNU-SAME: -export:s
+; CHECK-GNU-SAME: -export:t
+; CHECK-GNU-SAME: -export:u
+; CHECK-MSVC-NOT: /EXPORT:f
+; CHECK-MSVC: /EXPORT:g
+; CHECK-MSVC-SAME: /EXPORT:h
+; CHECK-MSVC-NOT: /EXPORT:i
+; CHECK-MSVC-SAME: /EXPORT:j
+; CHECK-MSVC-SAME: /EXPORT:k
+; CHECK-MSVC-SAME: /EXPORT:l
+; CHECK-MSVC-SAME: /EXPORT:m,DATA
+; CHECK-MSVC-SAME: /EXPORT:n,DATA
+; CHECK-MSVC-SAME: /EXPORT:o,DATA
+; CHECK-MSVC-SAME: /EXPORT:p,DATA
+; CHECK-MSVC-SAME: /EXPORT:q,DATA
+; CHECK-MSVC-SAME: /EXPORT:r
+; CHECK-MSVC-SAME: /EXPORT:s
+; CHECK-MSVC-SAME: /EXPORT:t
+; CHECK-MSVC-SAME: /EXPORT:u
+
diff --git a/test/CodeGen/ARM/Windows/long-calls.ll b/test/CodeGen/ARM/Windows/long-calls.ll
index 4e5bdce146f0..29e6f783ae5f 100644
--- a/test/CodeGen/ARM/Windows/long-calls.ll
+++ b/test/CodeGen/ARM/Windows/long-calls.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -mattr=+long-calls -o - %s \
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -relocation-model pic -mattr=+long-calls -o - %s \
; RUN: | FileCheck %s
declare arm_aapcs_vfpcc void @callee()
@@ -12,7 +12,7 @@ entry:
; CHECK-LABEL: caller
; CHECK: ldr [[REG:r[0-9]+]], [[CPI:LCPI[_0-9]+]]
; CHECK: bx [[REG]]
-; CHECK: .align 2
+; CHECK: .p2align 2
; CHECK: [[CPI]]:
; CHECK: .long callee
diff --git a/test/CodeGen/ARM/Windows/no-aeabi.ll b/test/CodeGen/ARM/Windows/no-aeabi.ll
index 3971b9ccf580..a4103b0a676e 100644
--- a/test/CodeGen/ARM/Windows/no-aeabi.ll
+++ b/test/CodeGen/ARM/Windows/no-aeabi.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -verify-machineinstrs -o - %s | FileCheck %s
declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/ARM/Windows/overflow.ll b/test/CodeGen/ARM/Windows/overflow.ll
deleted file mode 100644
index 5f74f25ac224..000000000000
--- a/test/CodeGen/ARM/Windows/overflow.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: llc -mtriple thumbv7-windows-gnu -filetype asm -o - %s
-
-define i32 @divsoverflow32(i32 %a, i32 %b) {
- %1 = alloca i32, align 4
- %2 = alloca i32, align 4
- %3 = load i32, i32* %1, align 4
- %4 = load i32, i32* %2, align 4
- %5 = sub nsw i32 0, %4
- %6 = sdiv i32 -2147483647, %3
- %7 = icmp sgt i32 %5, %6
- br i1 %7, label %8, label %9
- call void (...) @abort_simpl32()
- unreachable
- %10 = load i32, i32* %1, align 4
- %11 = load i32, i32* %2, align 4
- %12 = mul nsw i32 %10, %11
- ret i32 %12
-}
-
-declare void @abort_simpl32(...)
-
-define i64 @divsoverflow64(i64 %a, i64 %b) {
- %1 = alloca i64, align 8
- %2 = alloca i64, align 8
- %3 = load i64, i64* %1, align 8
- %4 = load i64, i64* %2, align 8
- %5 = sub nsw i64 0, %4
- %6 = sdiv i64 -9223372036854775808, %3
- %7 = icmp sgt i64 %5, %6
- br i1 %7, label %8, label %9
- call void (...) @abort_simpl64()
- unreachable
- %10 = load i64, i64* %1, align 8
- %11 = load i64, i64* %2, align 8
- %12 = mul nsw i64 %10, %11
- ret i64 %12
-}
-
-declare void @abort_simpl64(...)
-
-define i32 @divuoverflow32(i32 %a, i32 %b) {
- %1 = alloca i32, align 4
- %2 = alloca i32, align 4
- %3 = load i32, i32* %1, align 4
- %4 = load i32, i32* %2, align 4
- %5 = sub nsw i32 0, %4
- %6 = udiv i32 4294967296, %3
- %7 = icmp sgt i32 %5, %6
- br i1 %7, label %8, label %9
- call void (...) @abort_uimpl32()
- unreachable
- %10 = load i32, i32* %1, align 4
- %11 = load i32, i32* %2, align 4
- %12 = mul nsw i32 %10, %11
- ret i32 %12
-}
-
-declare void @abort_uimpl32(...)
-
-define i64 @divuoverflow64(i64 %a, i64 %b) {
- %1 = alloca i64, align 8
- %2 = alloca i64, align 8
- %3 = load i64, i64* %1, align 8
- %4 = load i64, i64* %2, align 8
- %5 = sub nsw i64 0, %4
- %6 = udiv i64 18446744073709551616, %3
- %7 = icmp sgt i64 %5, %6
- br i1 %7, label %8, label %9
- call void (...) @abort_uimpl64()
- unreachable
- %10 = load i64, i64* %1, align 8
- %11 = load i64, i64* %2, align 8
- %12 = mul nsw i64 %10, %11
- ret i64 %12
-}
-
-declare void @abort_uimpl64(...)
diff --git a/test/CodeGen/ARM/Windows/tls.ll b/test/CodeGen/ARM/Windows/tls.ll
new file mode 100644
index 000000000000..689f4e291872
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/tls.ll
@@ -0,0 +1,157 @@
+; RUN: llc -mtriple thumbv7--windows-itanium %s -o - | FileCheck %s
+
+@i = thread_local global i32 0
+@j = external thread_local global i32
+@k = internal thread_local global i32 0
+@l = hidden thread_local global i32 0
+@m = external hidden thread_local global i32
+@n = thread_local global i16 0
+@o = thread_local global i8 0
+
+define i32 @f() {
+ %1 = load i32, i32* @i
+ ret i32 %1
+}
+
+; CHECK: mrc p15, #0, [[TEB:r[0-9]]], c13, c0, #2
+
+; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index
+; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index
+; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]]
+
+; CHECK: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44]
+; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2]
+
+; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:LCPI[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]]
+
+; CHECK: [[CPI]]:
+; CHECK-NEXT: .long i(SECREL32)
+
+define i32 @e() {
+ %1 = load i32, i32* @j
+ ret i32 %1
+}
+
+; CHECK: mrc p15, #0, [[TEB:r[0-9]]], c13, c0, #2
+
+; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index
+; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index
+; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]]
+
+; CHECK: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44]
+; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2]
+
+; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:LCPI[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]]
+
+; CHECK: [[CPI]]:
+; CHECK-NEXT: .long j(SECREL32)
+
+define i32 @d() {
+ %1 = load i32, i32* @k
+ ret i32 %1
+}
+
+; CHECK: mrc p15, #0, [[TEB:r[0-9]]], c13, c0, #2
+
+; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index
+; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index
+; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]]
+
+; CHECK: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44]
+; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2]
+
+; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:LCPI[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]]
+
+; CHECK: [[CPI]]:
+; CHECK-NEXT: .long k(SECREL32)
+
+define i32 @c() {
+ %1 = load i32, i32* @l
+ ret i32 %1
+}
+
+; CHECK: mrc p15, #0, [[TEB:r[0-9]]], c13, c0, #2
+
+; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index
+; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index
+; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]]
+
+; CHECK: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44]
+; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2]
+
+; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:LCPI[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]]
+
+; CHECK: [[CPI]]:
+; CHECK-NEXT: .long l(SECREL32)
+
+define i32 @b() {
+ %1 = load i32, i32* @m
+ ret i32 %1
+}
+
+; CHECK: mrc p15, #0, [[TEB:r[0-9]]], c13, c0, #2
+
+; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index
+; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index
+; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]]
+
+; CHECK: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44]
+; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2]
+
+; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:LCPI[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]]
+
+; CHECK: [[CPI]]:
+; CHECK: .long m(SECREL32)
+
+define i16 @a() {
+ %1 = load i16, i16* @n
+ ret i16 %1
+}
+
+; CHECK: mrc p15, #0, [[TEB:r[0-9]]], c13, c0, #2
+
+; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index
+; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index
+; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]]
+
+; CHECK: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44]
+; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2]
+
+; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:LCPI[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ldrh r0, {{\[}}[[TLS]], [[SLOT]]]
+
+; CHECK: [[CPI]]:
+; CHECK: .long n(SECREL32)
+
+define i8 @Z() {
+ %1 = load i8, i8* @o
+ ret i8 %1
+}
+
+; CHECK: mrc p15, #0, [[TEB:r[0-9]]], c13, c0, #2
+
+; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index
+; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index
+; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]]
+
+; CHECK: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44]
+; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2]
+
+; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:LCPI[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ldrb r0, {{\[}}[[TLS]], [[SLOT]]]
+
+; CHECK: [[CPI]]:
+; CHECK-NEXT: .long o(SECREL32)
+
diff --git a/test/CodeGen/ARM/align.ll b/test/CodeGen/ARM/align.ll
index 9589e72df2f5..74525909148d 100644
--- a/test/CodeGen/ARM/align.ll
+++ b/test/CodeGen/ARM/align.ll
@@ -8,33 +8,33 @@
; no alignment
@c = global i16 2
-;ELF: .align 1
+;ELF: .p2align 1
;ELF: c:
-;DARWIN: .align 1
+;DARWIN: .p2align 1
;DARWIN: _c:
@d = global i32 3
-;ELF: .align 2
+;ELF: .p2align 2
;ELF: d:
-;DARWIN: .align 2
+;DARWIN: .p2align 2
;DARWIN: _d:
@e = global i64 4
-;ELF: .align 3
+;ELF: .p2align 3
;ELF: e
-;DARWIN: .align 3
+;DARWIN: .p2align 3
;DARWIN: _e:
@f = global float 5.0
-;ELF: .align 2
+;ELF: .p2align 2
;ELF: f:
-;DARWIN: .align 2
+;DARWIN: .p2align 2
;DARWIN: _f:
@g = global double 6.0
-;ELF: .align 3
+;ELF: .p2align 3
;ELF: g:
-;DARWIN: .align 3
+;DARWIN: .p2align 3
;DARWIN: _g:
@bar = common global [75 x i8] zeroinitializer, align 128
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 04eae8f9afec..151cc1b12ed2 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -49,7 +49,7 @@ tailrecurse.switch: ; preds = %tailrecurse
; V8-NEXT: beq
; V8-NEXT: %tailrecurse.switch
; V8: cmp
-; V8-NEXT: bne
+; V8-NEXT: beq
; V8-NEXT: b
; The trailing space in the last line checks that the branch is unconditional
switch i32 %and, label %sw.epilog [
diff --git a/test/CodeGen/ARM/arm-eabi.ll b/test/CodeGen/ARM/arm-eabi.ll
index d1e7a947553f..898055dd1092 100644
--- a/test/CodeGen/ARM/arm-eabi.ll
+++ b/test/CodeGen/ARM/arm-eabi.ll
@@ -3,21 +3,29 @@
; RUN: llc < %s -mtriple=arm-none-androideabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-gnueabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
; RUN: llc < %s -mtriple=arm-none-gnueabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
+; RUN: llc < %s -mtriple=arm-none-musleabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
+; RUN: llc < %s -mtriple=arm-none-musleabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
; RUN: llc < %s -mtriple=arm-none-eabi -meabi=gnu -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
; RUN: llc < %s -mtriple=arm-none-eabihf -meabi=gnu -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
; RUN: llc < %s -mtriple=arm-none-androideabi -meabi=gnu -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
; RUN: llc < %s -mtriple=arm-none-gnueabi -meabi=gnu -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
; RUN: llc < %s -mtriple=arm-none-gnueabihf -meabi=gnu -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
+; RUN: llc < %s -mtriple=arm-none-musleabi -meabi=gnu -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
+; RUN: llc < %s -mtriple=arm-none-musleabihf -meabi=gnu -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI
; RUN: llc < %s -mtriple=arm-none-eabi -meabi=4 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-eabihf -meabi=4 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-androideabi -meabi=4 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-gnueabi -meabi=4 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-gnueabihf -meabi=4 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
+; RUN: llc < %s -mtriple=arm-none-musleabi -meabi=4 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
+; RUN: llc < %s -mtriple=arm-none-musleabihf -meabi=4 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-eabi -meabi=5 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-eabihf -meabi=5 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-androideabi -meabi=5 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-gnueabi -meabi=5 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
; RUN: llc < %s -mtriple=arm-none-gnueabihf -meabi=5 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
+; RUN: llc < %s -mtriple=arm-none-musleabi -meabi=5 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
+; RUN: llc < %s -mtriple=arm-none-musleabihf -meabi=5 -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI
%struct.my_s = type { [18 x i32] }
diff --git a/test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll b/test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll
new file mode 100644
index 000000000000..620cb6356411
--- /dev/null
+++ b/test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll
@@ -0,0 +1,86 @@
+; RUN: opt < %s -mtriple=arm-eabi -mattr=+neon -interleaved-access -S | FileCheck %s
+
+; CHECK-LABEL: @extract_user_basic(
+; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 1
+define void @extract_user_basic(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %E = extractelement <8 x i32> %L, i32 2
+ br label %if.merge
+
+if.merge:
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_multi(
+; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 1
+define void @extract_user_multi(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %E1 = extractelement <8 x i32> %L, i32 0
+ br label %if.merge
+
+if.merge:
+ %E2 = extractelement <8 x i32> %L, i32 2
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_multi_no_dom(
+; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %E1 = extractelement <8 x i32> %L, i32 0
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E2 = extractelement <8 x i32> %L, i32 2
+ br label %if.merge
+
+if.merge:
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_wrong_const_index(
+; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+define void @extract_user_wrong_const_index(<8 x i32>* %A) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 1
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_undef_index(
+; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+define void @extract_user_undef_index(<8 x i32>* %A) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 undef
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_var_index(
+; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+define void @extract_user_var_index(<8 x i32>* %A, i32 %I) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 %I
+ ret void
+}
diff --git a/test/CodeGen/ARM/arm-interleaved-accesses.ll b/test/CodeGen/ARM/arm-interleaved-accesses.ll
index 002e71f6d9b8..6f3d537176c0 100644
--- a/test/CodeGen/ARM/arm-interleaved-accesses.ll
+++ b/test/CodeGen/ARM/arm-interleaved-accesses.ll
@@ -304,3 +304,15 @@ define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
store <3 x float> %tmp1, <3 x float>* %p, align 16
ret void
}
+
+; NEON-LABEL: load_factor2_with_extract_user:
+; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64]
+; NEON: vmov.32 r0, d16[1]
+; NONEON-LABEL: load_factor2_with_extract_user:
+; NONEON-NOT: vld2
+define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
+ %1 = load <8 x i32>, <8 x i32>* %a, align 8
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = extractelement <8 x i32> %1, i32 2
+ ret i32 %3
+}
diff --git a/test/CodeGen/ARM/arm-shrink-wrapping.ll b/test/CodeGen/ARM/arm-shrink-wrapping.ll
index 9375df4b15cb..cb608fc18d95 100644
--- a/test/CodeGen/ARM/arm-shrink-wrapping.ll
+++ b/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -629,7 +629,7 @@ end:
declare double @llvm.pow.f64(double, double)
; This function needs to spill floating point registers to
-; exerce the path where we were deferencing the end iterator
+; exercise the path where we were dereferencing the end iterator
; to access debug info location while inserting the spill code
; during PEI with shrink-wrapping enable.
; CHECK-LABEL: debug_info:
@@ -677,7 +677,7 @@ bb13: ; preds = %bb3, %bb
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "LLVM", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !2, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "LLVM", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "a.cpp", directory: "b")
!2 = !{}
!3 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index 573cd45c0825..8841483c97a4 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -172,31 +172,31 @@ define i64 @test6(i64* %ptr, i64 %val) {
define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
; CHECK-LABEL: test7:
; CHECK-DAG: mov [[VAL1LO:r[0-9]+]], r1
-; CHECK-DAG: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-LE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG1]], [[VAL1LO]]
-; CHECK-LE-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG2]], r2
-; CHECK-BE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG2]], r2
-; CHECK-BE-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG1]], r1
+; CHECK-LE-DAG: eor [[MISMATCH_LO:.*]], [[REG1]], [[VAL1LO]]
+; CHECK-LE-DAG: eor [[MISMATCH_HI:.*]], [[REG2]], r2
+; CHECK-BE-DAG: eor [[MISMATCH_LO:.*]], [[REG2]], r2
+; CHECK-BE-DAG: eor [[MISMATCH_HI:.*]], [[REG1]], r1
; CHECK: orrs {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
; CHECK: bne
+; CHECK-DAG: dmb {{ish$}}
; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
; CHECK: cmp
-; CHECK: bne
+; CHECK: beq
; CHECK: dmb {{ish$}}
; CHECK-THUMB-LABEL: test7:
-; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2
; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3
; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]], r2
; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]], r3
-; CHECK-THUMB-LE: orrs [[MISMATCH_HI]], [[MISMATCH_LO]]
+; CHECK-THUMB-LE: orrs.w {{.*}}, [[MISMATCH_LO]], [[MISMATCH_HI]]
; CHECK-THUMB: bne
+; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
+; CHECK-THUMB: beq
; CHECK-THUMB: dmb {{ish$}}
%pair = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst
@@ -251,21 +251,18 @@ define i64 @test10(i64* %ptr, i64 %val) {
; CHECK-LABEL: test10:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
-; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK-LE: cmp [[REG1]], r1
-; CHECK-BE: cmp [[REG2]], r2
-; CHECK: movwls [[CARRY_LO]], #1
-; CHECK-LE: cmp [[REG2]], r2
-; CHECK-BE: cmp [[REG1]], r1
-; CHECK: movwle [[CARRY_HI]], #1
-; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
-; CHECK: cmp [[CARRY_HI]], #0
+; CHECK-LE: subs {{[^,]+}}, r1, [[REG1]]
+; CHECK-BE: subs {{[^,]+}}, r2, [[REG2]]
+; CHECK-LE: sbcs {{[^,]+}}, r2, [[REG2]]
+; CHECK-BE: sbcs {{[^,]+}}, r1, [[REG1]]
+; CHECK: mov [[CMP:[a-z0-9]+]], #0
+; CHECK: movwge [[CMP]], #1
+; CHECK: cmp [[CMP]], #0
; CHECK: movne [[OUT_HI]], [[REG2]]
; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
; CHECK: movne [[OUT_LO]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
; CHECK: cmp
; CHECK: bne
; CHECK: dmb {{ish$}}
@@ -273,21 +270,18 @@ define i64 @test10(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test10:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+|lr]], #0
-; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+|lr]], #0
-; CHECK-THUMB-LE: cmp [[REG1]], r2
-; CHECK-THUMB-BE: cmp [[REG2]], r3
-; CHECK-THUMB: movls.w [[CARRY_LO]], #1
-; CHECK-THUMB-LE: cmp [[REG2]], r3
-; CHECK-THUMB-BE: cmp [[REG1]], r2
-; CHECK-THUMB: movle [[CARRY_HI]], #1
-; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
-; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
-; CHECK-THUMB: cmp [[CARRY_HI]], #0
-; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB-LE: subs.w {{[^,]+}}, r2, [[REG1]]
+; CHECK-THUMB-BE: subs.w {{[^,]+}}, r3, [[REG2]]
+; CHECK-THUMB-LE: sbcs.w {{[^,]+}}, r3, [[REG2]]
+; CHECK-THUMB-BE: sbcs.w {{[^,]+}}, r2, [[REG1]]
+; CHECK-THUMB: mov.w [[CMP:[a-z0-9]+]], #0
+; CHECK-THUMB: movge.w [[CMP]], #1
+; CHECK-THUMB: cmp.w [[CMP]], #0
+; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
; CHECK-THUMB: dmb {{ish$}}
@@ -300,21 +294,18 @@ define i64 @test11(i64* %ptr, i64 %val) {
; CHECK-LABEL: test11:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
-; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK-LE: cmp [[REG1]], r1
-; CHECK-BE: cmp [[REG2]], r2
-; CHECK: movwls [[CARRY_LO]], #1
-; CHECK-LE: cmp [[REG2]], r2
-; CHECK-BE: cmp [[REG1]], r1
-; CHECK: movwls [[CARRY_HI]], #1
-; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
-; CHECK: cmp [[CARRY_HI]], #0
+; CHECK-LE: subs {{[^,]+}}, r1, [[REG1]]
+; CHECK-BE: subs {{[^,]+}}, r2, [[REG2]]
+; CHECK-LE: sbcs {{[^,]+}}, r2, [[REG2]]
+; CHECK-BE: sbcs {{[^,]+}}, r1, [[REG1]]
+; CHECK: mov [[CMP:[a-z0-9]+]], #0
+; CHECK: movwhs [[CMP]], #1
+; CHECK: cmp [[CMP]], #0
; CHECK: movne [[OUT_HI]], [[REG2]]
; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
; CHECK: movne [[OUT_LO]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
; CHECK: cmp
; CHECK: bne
; CHECK: dmb {{ish$}}
@@ -322,21 +313,18 @@ define i64 @test11(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test11:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
-; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
-; CHECK-THUMB-LE: cmp [[REG1]], r2
-; CHECK-THUMB-BE: cmp [[REG2]], r3
-; CHECK-THUMB: movls.w [[CARRY_LO]], #1
-; CHECK-THUMB-LE: cmp [[REG2]], r3
-; CHECK-THUMB-BE: cmp [[REG1]], r2
-; CHECK-THUMB: movls [[CARRY_HI]], #1
-; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
-; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
-; CHECK-THUMB: cmp [[CARRY_HI]], #0
-; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB-LE: subs.w {{[^,]+}}, r2, [[REG1]]
+; CHECK-THUMB-BE: subs.w {{[^,]+}}, r3, [[REG2]]
+; CHECK-THUMB-LE: sbcs.w {{[^,]+}}, r3, [[REG2]]
+; CHECK-THUMB-BE: sbcs.w {{[^,]+}}, r2, [[REG1]]
+; CHECK-THUMB: mov.w [[CMP:[a-z0-9]+]], #0
+; CHECK-THUMB: movhs.w [[CMP]], #1
+; CHECK-THUMB: cmp.w [[CMP]], #0
+; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
; CHECK-THUMB: dmb {{ish$}}
@@ -349,21 +337,18 @@ define i64 @test12(i64* %ptr, i64 %val) {
; CHECK-LABEL: test12:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
-; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK-LE: cmp [[REG1]], r1
-; CHECK-BE: cmp [[REG2]], r2
-; CHECK: movwhi [[CARRY_LO]], #1
-; CHECK-LE: cmp [[REG2]], r2
-; CHECK-BE: cmp [[REG1]], r1
-; CHECK: movwgt [[CARRY_HI]], #1
-; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
-; CHECK: cmp [[CARRY_HI]], #0
+; CHECK-LE: subs {{[^,]+}}, r1, [[REG1]]
+; CHECK-BE: subs {{[^,]+}}, r2, [[REG2]]
+; CHECK-LE: sbcs {{[^,]+}}, r2, [[REG2]]
+; CHECK-BE: sbcs {{[^,]+}}, r1, [[REG1]]
+; CHECK: mov [[CMP:[a-z0-9]+]], #0
+; CHECK: movwlt [[CMP]], #1
+; CHECK: cmp [[CMP]], #0
; CHECK: movne [[OUT_HI]], [[REG2]]
; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
; CHECK: movne [[OUT_LO]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
; CHECK: cmp
; CHECK: bne
; CHECK: dmb {{ish$}}
@@ -371,21 +356,18 @@ define i64 @test12(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test12:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
-; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
-; CHECK-THUMB-LE: cmp [[REG1]], r2
-; CHECK-THUMB-BE: cmp [[REG2]], r3
-; CHECK-THUMB: movhi.w [[CARRY_LO]], #1
-; CHECK-THUMB-LE: cmp [[REG2]], r3
-; CHECK-THUMB-BE: cmp [[REG1]], r2
-; CHECK-THUMB: movgt [[CARRY_HI]], #1
-; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
-; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
-; CHECK-THUMB: cmp [[CARRY_HI]], #0
-; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB-LE: subs.w {{[^,]+}}, r2, [[REG1]]
+; CHECK-THUMB-BE: subs.w {{[^,]+}}, r3, [[REG2]]
+; CHECK-THUMB-LE: sbcs.w {{[^,]+}}, r3, [[REG2]]
+; CHECK-THUMB-BE: sbcs.w {{[^,]+}}, r2, [[REG1]]
+; CHECK-THUMB: mov.w [[CMP:[a-z0-9]+]], #0
+; CHECK-THUMB: movlt.w [[CMP]], #1
+; CHECK-THUMB: cmp.w [[CMP]], #0
+; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
; CHECK-THUMB: dmb {{ish$}}
@@ -398,21 +380,18 @@ define i64 @test13(i64* %ptr, i64 %val) {
; CHECK-LABEL: test13:
; CHECK: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: mov [[CARRY_LO:[a-z0-9]+]], #0
-; CHECK: mov [[CARRY_HI:[a-z0-9]+]], #0
; CHECK: mov [[OUT_HI:[a-z0-9]+]], r2
-; CHECK-LE: cmp [[REG1]], r1
-; CHECK-BE: cmp [[REG2]], r2
-; CHECK: movwhi [[CARRY_LO]], #1
-; CHECK-LE: cmp [[REG2]], r2
-; CHECK-BE: cmp [[REG1]], r1
-; CHECK: movwhi [[CARRY_HI]], #1
-; CHECK: moveq [[CARRY_HI]], [[CARRY_LO]]
-; CHECK: cmp [[CARRY_HI]], #0
+; CHECK-LE: subs {{[^,]+}}, r1, [[REG1]]
+; CHECK-BE: subs {{[^,]+}}, r2, [[REG2]]
+; CHECK-LE: sbcs {{[^,]+}}, r2, [[REG2]]
+; CHECK-BE: sbcs {{[^,]+}}, r1, [[REG1]]
+; CHECK: mov [[CMP:[a-z0-9]+]], #0
+; CHECK: movwlo [[CMP]], #1
+; CHECK: cmp [[CMP]], #0
; CHECK: movne [[OUT_HI]], [[REG2]]
; CHECK: mov [[OUT_LO:[a-z0-9]+]], r1
; CHECK: movne [[OUT_LO]], [[REG1]]
-; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
; CHECK: cmp
; CHECK: bne
; CHECK: dmb {{ish$}}
@@ -420,21 +399,18 @@ define i64 @test13(i64* %ptr, i64 %val) {
; CHECK-THUMB-LABEL: test13:
; CHECK-THUMB: dmb {{ish$}}
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: mov.w [[CARRY_LO:[a-z0-9]+]], #0
-; CHECK-THUMB: movs [[CARRY_HI:[a-z0-9]+]], #0
-; CHECK-THUMB-LE: cmp [[REG1]], r2
-; CHECK-THUMB-BE: cmp [[REG2]], r3
-; CHECK-THUMB: movhi.w [[CARRY_LO]], #1
-; CHECK-THUMB-LE: cmp [[REG2]], r3
-; CHECK-THUMB-BE: cmp [[REG1]], r2
-; CHECK-THUMB: movhi [[CARRY_HI]], #1
-; CHECK-THUMB: moveq [[CARRY_HI]], [[CARRY_LO]]
-; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
-; CHECK-THUMB: cmp [[CARRY_HI]], #0
-; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB: mov [[OUT_LO:[a-z0-9]+]], r2
+; CHECK-THUMB-LE: subs.w {{[^,]+}}, r2, [[REG1]]
+; CHECK-THUMB-BE: subs.w {{[^,]+}}, r3, [[REG2]]
+; CHECK-THUMB-LE: sbcs.w {{[^,]+}}, r3, [[REG2]]
+; CHECK-THUMB-BE: sbcs.w {{[^,]+}}, r2, [[REG1]]
+; CHECK-THUMB: mov.w [[CMP:[a-z0-9]+]], #0
+; CHECK-THUMB: movlo.w [[CMP]], #1
+; CHECK-THUMB: cmp.w [[CMP]], #0
+; CHECK-THUMB: mov [[OUT_HI:[a-z0-9]+]], r3
; CHECK-THUMB: movne [[OUT_HI]], [[REG2]]
; CHECK-THUMB: movne [[OUT_LO]], [[REG1]]
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]]
+; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[OUT_LO]], [[OUT_HI]]
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
; CHECK-THUMB: dmb {{ish$}}
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index 791389456619..17324d64153d 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix CHECK-ARMV7
; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-T2
; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-T1
-; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs -mcpu=cortex-m0 | FileCheck %s --check-prefix=CHECK-M0
+; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs -mcpu=cortex-m0 | FileCheck %s --check-prefix=CHECK-T1
; RUN: llc < %s -mtriple=thumbv7--none-eabi -thread-model single -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-BAREMETAL
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
@@ -29,8 +29,7 @@ entry:
; CHECK: ldrex
; CHECK: add
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_add_4
- ; CHECK-M0: bl ___sync_fetch_and_add_4
+ ; CHECK-T1: bl ___sync_fetch_and_add_4
; CHECK-BAREMETAL: add
; CHECK-BAREMETAL-NOT: __sync
%0 = atomicrmw add i32* %val1, i32 %tmp monotonic
@@ -38,8 +37,7 @@ entry:
; CHECK: ldrex
; CHECK: sub
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_sub_4
- ; CHECK-M0: bl ___sync_fetch_and_sub_4
+ ; CHECK-T1: bl ___sync_fetch_and_sub_4
; CHECK-BAREMETAL: sub
; CHECK-BAREMETAL-NOT: __sync
%1 = atomicrmw sub i32* %val2, i32 30 monotonic
@@ -47,8 +45,7 @@ entry:
; CHECK: ldrex
; CHECK: add
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_add_4
- ; CHECK-M0: bl ___sync_fetch_and_add_4
+ ; CHECK-T1: bl ___sync_fetch_and_add_4
; CHECK-BAREMETAL: add
; CHECK-BAREMETAL-NOT: __sync
%2 = atomicrmw add i32* %val2, i32 1 monotonic
@@ -56,8 +53,7 @@ entry:
; CHECK: ldrex
; CHECK: sub
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_sub_4
- ; CHECK-M0: bl ___sync_fetch_and_sub_4
+ ; CHECK-T1: bl ___sync_fetch_and_sub_4
; CHECK-BAREMETAL: sub
; CHECK-BAREMETAL-NOT: __sync
%3 = atomicrmw sub i32* %val2, i32 1 monotonic
@@ -65,8 +61,7 @@ entry:
; CHECK: ldrex
; CHECK: and
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_and_4
- ; CHECK-M0: bl ___sync_fetch_and_and_4
+ ; CHECK-T1: bl ___sync_fetch_and_and_4
; CHECK-BAREMETAL: and
; CHECK-BAREMETAL-NOT: __sync
%4 = atomicrmw and i32* %andt, i32 4080 monotonic
@@ -74,8 +69,7 @@ entry:
; CHECK: ldrex
; CHECK: or
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_or_4
- ; CHECK-M0: bl ___sync_fetch_and_or_4
+ ; CHECK-T1: bl ___sync_fetch_and_or_4
; CHECK-BAREMETAL: or
; CHECK-BAREMETAL-NOT: __sync
%5 = atomicrmw or i32* %ort, i32 4080 monotonic
@@ -83,8 +77,7 @@ entry:
; CHECK: ldrex
; CHECK: eor
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_xor_4
- ; CHECK-M0: bl ___sync_fetch_and_xor_4
+ ; CHECK-T1: bl ___sync_fetch_and_xor_4
; CHECK-BAREMETAL: eor
; CHECK-BAREMETAL-NOT: __sync
%6 = atomicrmw xor i32* %xort, i32 4080 monotonic
@@ -92,8 +85,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_min_4
- ; CHECK-M0: bl ___sync_fetch_and_min_4
+ ; CHECK-T1: bl ___sync_fetch_and_min_4
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%7 = atomicrmw min i32* %val2, i32 16 monotonic
@@ -102,8 +94,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_min_4
- ; CHECK-M0: bl ___sync_fetch_and_min_4
+ ; CHECK-T1: bl ___sync_fetch_and_min_4
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%8 = atomicrmw min i32* %val2, i32 %neg monotonic
@@ -111,8 +102,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_max_4
- ; CHECK-M0: bl ___sync_fetch_and_max_4
+ ; CHECK-T1: bl ___sync_fetch_and_max_4
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%9 = atomicrmw max i32* %val2, i32 1 monotonic
@@ -120,8 +110,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_max_4
- ; CHECK-M0: bl ___sync_fetch_and_max_4
+ ; CHECK-T1: bl ___sync_fetch_and_max_4
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%10 = atomicrmw max i32* %val2, i32 0 monotonic
@@ -129,8 +118,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umin_4
- ; CHECK-M0: bl ___sync_fetch_and_umin_4
+ ; CHECK-T1: bl ___sync_fetch_and_umin_4
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%11 = atomicrmw umin i32* %val2, i32 16 monotonic
@@ -139,8 +127,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umin_4
- ; CHECK-M0: bl ___sync_fetch_and_umin_4
+ ; CHECK-T1: bl ___sync_fetch_and_umin_4
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%12 = atomicrmw umin i32* %val2, i32 %uneg monotonic
@@ -148,8 +135,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umax_4
- ; CHECK-M0: bl ___sync_fetch_and_umax_4
+ ; CHECK-T1: bl ___sync_fetch_and_umax_4
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%13 = atomicrmw umax i32* %val2, i32 1 monotonic
@@ -157,8 +143,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umax_4
- ; CHECK-M0: bl ___sync_fetch_and_umax_4
+ ; CHECK-T1: bl ___sync_fetch_and_umax_4
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%14 = atomicrmw umax i32* %val2, i32 0 monotonic
@@ -175,8 +160,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umin_2
- ; CHECK-M0: bl ___sync_fetch_and_umin_2
+ ; CHECK-T1: bl ___sync_fetch_and_umin_2
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%0 = atomicrmw umin i16* %val, i16 16 monotonic
@@ -185,8 +169,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umin_2
- ; CHECK-M0: bl ___sync_fetch_and_umin_2
+ ; CHECK-T1: bl ___sync_fetch_and_umin_2
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%1 = atomicrmw umin i16* %val, i16 %uneg monotonic
@@ -194,8 +177,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umax_2
- ; CHECK-M0: bl ___sync_fetch_and_umax_2
+ ; CHECK-T1: bl ___sync_fetch_and_umax_2
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%2 = atomicrmw umax i16* %val, i16 1 monotonic
@@ -203,8 +185,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umax_2
- ; CHECK-M0: bl ___sync_fetch_and_umax_2
+ ; CHECK-T1: bl ___sync_fetch_and_umax_2
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%3 = atomicrmw umax i16* %val, i16 0 monotonic
@@ -220,8 +201,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umin_1
- ; CHECK-M0: bl ___sync_fetch_and_umin_1
+ ; CHECK-T1: bl ___sync_fetch_and_umin_1
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%0 = atomicrmw umin i8* %val, i8 16 monotonic
@@ -229,8 +209,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umin_1
- ; CHECK-M0: bl ___sync_fetch_and_umin_1
+ ; CHECK-T1: bl ___sync_fetch_and_umin_1
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%uneg = sub i8 0, 1
@@ -239,8 +218,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umax_1
- ; CHECK-M0: bl ___sync_fetch_and_umax_1
+ ; CHECK-T1: bl ___sync_fetch_and_umax_1
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%2 = atomicrmw umax i8* %val, i8 1 monotonic
@@ -248,8 +226,7 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
- ; CHECK-T1: blx ___sync_fetch_and_umax_1
- ; CHECK-M0: bl ___sync_fetch_and_umax_1
+ ; CHECK-T1: bl ___sync_fetch_and_umax_1
; CHECK-BAREMETAL: cmp
; CHECK-BAREMETAL-NOT: __sync
%3 = atomicrmw umax i8* %val, i8 0 monotonic
@@ -272,31 +249,37 @@ define i32 @test_cmpxchg_fail_order(i32 *%addr, i32 %desired, i32 %new) {
%pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
%oldval = extractvalue { i32, i1 } %pair, 0
-; CHECK-ARMV7: dmb ish
-; CHECK-ARMV7: [[LOOP_BB:\.?LBB[0-9]+_1]]:
; CHECK-ARMV7: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]]
; CHECK-ARMV7: cmp [[OLDVAL]], r1
; CHECK-ARMV7: bne [[FAIL_BB:\.?LBB[0-9]+_[0-9]+]]
+; CHECK-ARMV7: dmb ish
+; CHECK-ARMV7: [[LOOP_BB:\.?LBB.*]]:
; CHECK-ARMV7: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]]
; CHECK-ARMV7: cmp [[SUCCESS]], #0
-; CHECK-ARMV7: bne [[LOOP_BB]]
-; CHECK-ARMV7: dmb ish
-; CHECK-ARMV7: bx lr
+; CHECK-ARMV7: beq [[SUCCESS_BB:\.?LBB.*]]
+; CHECK-ARMV7: ldrex [[OLDVAL]], [r[[ADDR]]]
+; CHECK-ARMV7: cmp [[OLDVAL]], r1
+; CHECK-ARMV7: beq [[LOOP_BB]]
; CHECK-ARMV7: [[FAIL_BB]]:
; CHECK-ARMV7: clrex
; CHECK-ARMV7: bx lr
+; CHECK-ARMV7: [[SUCCESS_BB]]:
+; CHECK-ARMV7: dmb ish
+; CHECK-ARMV7: bx lr
-; CHECK-T2: dmb ish
-; CHECK-T2: [[LOOP_BB:\.?LBB[0-9]+_1]]:
; CHECK-T2: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]]
; CHECK-T2: cmp [[OLDVAL]], r1
-; CHECK-T2: clrexne
-; CHECK-T2: bxne lr
+; CHECK-T2: bne [[FAIL_BB:\.?LBB.*]]
+; CHECK-T2: dmb ish
+; CHECK-T2: [[LOOP_BB:\.?LBB.*]]:
; CHECK-T2: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]]
; CHECK-T2: cmp [[SUCCESS]], #0
; CHECK-T2: dmbeq ish
; CHECK-T2: bxeq lr
-; CHECK-T2: b [[LOOP_BB]]
+; CHECK-T2: ldrex [[OLDVAL]], [r[[ADDR]]]
+; CHECK-T2: cmp [[OLDVAL]], r1
+; CHECK-T2: beq [[LOOP_BB]]
+; CHECK-T2: clrex
ret i32 %oldval
}
@@ -336,8 +319,8 @@ define i32 @load_load_add_acquire(i32* %mem1, i32* %mem2) nounwind {
; CHECK: dmb
; CHECK: add r0,
-; CHECK-M0: ___sync_val_compare_and_swap_4
-; CHECK-M0: ___sync_val_compare_and_swap_4
+; CHECK-T1: ___sync_val_compare_and_swap_4
+; CHECK-T1: ___sync_val_compare_and_swap_4
; CHECK-BAREMETAL: ldr {{r[0-9]}}, [r0]
; CHECK-BAREMETAL-NOT: dmb
@@ -358,8 +341,8 @@ define void @store_store_release(i32* %mem1, i32 %val1, i32* %mem2, i32 %val2) {
; CHECK: dmb
; CHECK: str r3, [r2]
-; CHECK-M0: ___sync_lock_test_and_set
-; CHECK-M0: ___sync_lock_test_and_set
+; CHECK-T1: ___sync_lock_test_and_set
+; CHECK-T1: ___sync_lock_test_and_set
; CHECK-BAREMETAL-NOT: dmb
; CHECK-BAREMTEAL: str r1, [r0]
@@ -379,9 +362,9 @@ define void @load_fence_store_monotonic(i32* %mem1, i32* %mem2) {
; CHECK: dmb
; CHECK: str [[R0]], [r1]
-; CHECK-M0: ldr [[R0:r[0-9]]], [r0]
-; CHECK-M0: dmb
-; CHECK-M0: str [[R0]], [r1]
+; CHECK-T1: ldr [[R0:r[0-9]]], [{{r[0-9]+}}]
+; CHECK-T1: {{dmb|bl ___sync_synchronize}}
+; CHECK-T1: str [[R0]], [{{r[0-9]+}}]
; CHECK-BAREMETAL: ldr [[R0:r[0-9]]], [r0]
; CHECK-BAREMETAL-NOT: dmb
diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll
index efdb75b63222..77b850bd617b 100644
--- a/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -667,19 +667,14 @@ define void @test_atomic_load_min_i64(i64 %offset) nounwind {
; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
-; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
-; CHECK-ARM-LE: cmp [[OLD1]], r0
-; CHECK-ARM-LE: movwls [[LOCARRY]], #1
-; CHECK-ARM-LE: cmp [[OLD2]], r1
-; CHECK-ARM-LE: movwle [[HICARRY]], #1
-; CHECK-ARM-BE: cmp [[OLD2]], r1
-; CHECK-ARM-BE: movwls [[LOCARRY]], #1
-; CHECK-ARM-BE: cmp [[OLD1]], r0
-; CHECK-ARM-BE: movwle [[HICARRY]], #1
-; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
-; CHECK-ARM: cmp [[HICARRY]], #0
; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
+; CHECK-ARM-LE: subs {{[^,]+}}, r0, [[OLD1]]
+; CHECK-ARM-LE: sbcs {{[^,]+}}, r1, [[OLD2]]
+; CHECK-ARM-BE: subs {{[^,]+}}, r1, [[OLD2]]
+; CHECK-ARM-BE: sbcs {{[^,]+}}, r0, [[OLD1]]
+; CHECK-ARM: mov [[CMP:r[0-9]+|lr]], #0
+; CHECK-ARM: movwge [[CMP:r[0-9]+|lr]], #1
+; CHECK-ARM: cmp [[CMP:r[0-9]+|lr]], #0
; CHECK-ARM: movne [[MINHI]], [[OLD2]]
; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
; CHECK-ARM: movne [[MINLO]], [[OLD1]]
@@ -785,19 +780,14 @@ define void @test_atomic_load_max_i64(i64 %offset) nounwind {
; CHECK: ldrexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
-; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
-; CHECK-ARM-LE: cmp [[OLD1]], r0
-; CHECK-ARM-LE: movwhi [[LOCARRY]], #1
-; CHECK-ARM-LE: cmp [[OLD2]], r1
-; CHECK-ARM-LE: movwgt [[HICARRY]], #1
-; CHECK-ARM-BE: cmp [[OLD2]], r1
-; CHECK-ARM-BE: movwhi [[LOCARRY]], #1
-; CHECK-ARM-BE: cmp [[OLD1]], r0
-; CHECK-ARM-BE: movwgt [[HICARRY]], #1
-; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
-; CHECK-ARM: cmp [[HICARRY]], #0
; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
+; CHECK-ARM-LE: subs {{[^,]+}}, r0, [[OLD1]]
+; CHECK-ARM-LE: sbcs {{[^,]+}}, r1, [[OLD2]]
+; CHECK-ARM-BE: subs {{[^,]+}}, r1, [[OLD2]]
+; CHECK-ARM-BE: sbcs {{[^,]+}}, r0, [[OLD1]]
+; CHECK-ARM: mov [[CMP:r[0-9]+|lr]], #0
+; CHECK-ARM: movwlt [[CMP:r[0-9]+|lr]], #1
+; CHECK-ARM: cmp [[CMP:r[0-9]+|lr]], #0
; CHECK-ARM: movne [[MINHI]], [[OLD2]]
; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
; CHECK-ARM: movne [[MINLO]], [[OLD1]]
@@ -903,19 +893,14 @@ define void @test_atomic_load_umin_i64(i64 %offset) nounwind {
; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
-; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
-; CHECK-ARM-LE: cmp [[OLD1]], r0
-; CHECK-ARM-LE: movwls [[LOCARRY]], #1
-; CHECK-ARM-LE: cmp [[OLD2]], r1
-; CHECK-ARM-LE: movwls [[HICARRY]], #1
-; CHECK-ARM-BE: cmp [[OLD2]], r1
-; CHECK-ARM-BE: movwls [[LOCARRY]], #1
-; CHECK-ARM-BE: cmp [[OLD1]], r0
-; CHECK-ARM-BE: movwls [[HICARRY]], #1
-; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
-; CHECK-ARM: cmp [[HICARRY]], #0
; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
+; CHECK-ARM-LE: subs {{[^,]+}}, r0, [[OLD1]]
+; CHECK-ARM-LE: sbcs {{[^,]+}}, r1, [[OLD2]]
+; CHECK-ARM-BE: subs {{[^,]+}}, r1, [[OLD2]]
+; CHECK-ARM-BE: sbcs {{[^,]+}}, r0, [[OLD1]]
+; CHECK-ARM: mov [[CMP:r[0-9]+|lr]], #0
+; CHECK-ARM: movwhs [[CMP:r[0-9]+|lr]], #1
+; CHECK-ARM: cmp [[CMP:r[0-9]+|lr]], #0
; CHECK-ARM: movne [[MINHI]], [[OLD2]]
; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
; CHECK-ARM: movne [[MINLO]], [[OLD1]]
@@ -1021,19 +1006,14 @@ define void @test_atomic_load_umax_i64(i64 %offset) nounwind {
; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
; r0, r1 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0
-; CHECK-ARM: mov [[HICARRY:r[0-9]+|lr]], #0
-; CHECK-ARM-LE: cmp [[OLD1]], r0
-; CHECK-ARM-LE: movwhi [[LOCARRY]], #1
-; CHECK-ARM-LE: cmp [[OLD2]], r1
-; CHECK-ARM-LE: movwhi [[HICARRY]], #1
-; CHECK-ARM-BE: cmp [[OLD2]], r1
-; CHECK-ARM-BE: movwhi [[LOCARRY]], #1
-; CHECK-ARM-BE: cmp [[OLD1]], r0
-; CHECK-ARM-BE: movwhi [[HICARRY]], #1
-; CHECK-ARM: moveq [[HICARRY]], [[LOCARRY]]
-; CHECK-ARM: cmp [[HICARRY]], #0
; CHECK-ARM: mov [[MINHI:r[0-9]+]], r1
+; CHECK-ARM-LE: subs {{[^,]+}}, r0, [[OLD1]]
+; CHECK-ARM-LE: sbcs {{[^,]+}}, r1, [[OLD2]]
+; CHECK-ARM-BE: subs {{[^,]+}}, r1, [[OLD2]]
+; CHECK-ARM-BE: sbcs {{[^,]+}}, r0, [[OLD1]]
+; CHECK-ARM: mov [[CMP:r[0-9]+|lr]], #0
+; CHECK-ARM: movwlo [[CMP:r[0-9]+|lr]], #1
+; CHECK-ARM: cmp [[CMP:r[0-9]+|lr]], #0
; CHECK-ARM: movne [[MINHI]], [[OLD2]]
; CHECK-ARM: mov [[MINLO:r[0-9]+]], r0
; CHECK-ARM: movne [[MINLO]], [[OLD1]]
diff --git a/test/CodeGen/ARM/bfx.ll b/test/CodeGen/ARM/bfx.ll
index edb0c1a5a54a..629060218e43 100644
--- a/test/CodeGen/ARM/bfx.ll
+++ b/test/CodeGen/ARM/bfx.ll
@@ -51,3 +51,19 @@ entry:
%add7 = add i32 %add, %2
ret i32 %add7
}
+
+define i32 @ubfx3(i32 %a) {
+; CHECK: ubfx3
+; CHECK: ubfx r0, r0, #11, #1
+ %t1 = and i32 %a, 2048
+ %t2 = lshr i32 %t1, 11
+ ret i32 %t2
+}
+
+define i32 @ubfx4(i32 %a) {
+; CHECK: ubfx4
+; CHECK: ubfx r0, r0, #7, #3
+ %t1 = and i32 %a, 896
+ %t2 = lshr i32 %t1, 7
+ ret i32 %t2
+}
diff --git a/test/CodeGen/ARM/build-attributes-encoding.s b/test/CodeGen/ARM/build-attributes-encoding.s
index 29f13f09d319..5649726c12bb 100644
--- a/test/CodeGen/ARM/build-attributes-encoding.s
+++ b/test/CodeGen/ARM/build-attributes-encoding.s
@@ -54,6 +54,9 @@
// Tag_DIV_use (=44)
.eabi_attribute 44, 2
+// Tag_DSP_extension (=46)
+.eabi_attribute 46, 1
+
// Tag_Virtualization_use (=68)
.eabi_attribute 68, 3
@@ -71,15 +74,15 @@
// CHECK-NEXT: ]
// CHECK-NEXT: Address: 0x0
// CHECK-NEXT: Offset: 0x34
-// CHECK-NEXT: Size: 71
+// CHECK-NEXT: Size: 73
// CHECK-NEXT: Link: 0
// CHECK-NEXT: Info: 0
// CHECK-NEXT: AddressAlignment: 1
// CHECK-NEXT: EntrySize: 0
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 41460000 00616561 62690001 3C000000
+// CHECK-NEXT: 0000: 41480000 00616561 62690001 3E000000
// CHECK-NEXT: 0010: 05636F72 7465782D 61380006 0A074108
// CHECK-NEXT: 0020: 0109020A 030C0214 01150117 01180119
-// CHECK-NEXT: 0030: 011B001C 0124012A 012C0244 036EA001
-// CHECK-NEXT: 0040: 81013100 FA0101
+// CHECK-NEXT: 0030: 011B001C 0124012A 012C022E 0144036E
+// CHECK-NEXT: 0040: A0018101 3100FA01 01
// CHECK-NEXT: )
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index a74b3e441a13..b3b39a0d550f 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -27,12 +27,21 @@
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-fp-armv8,-crypto | FileCheck %s --check-prefix=V8-NEON
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-crypto | FileCheck %s --check-prefix=V8-FPARMv8-NEON
; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8-FPARMv8-NEON-CRYPTO
+; RUN: llc < %s -mtriple=thumbv8m.base-linux-gnueabi | FileCheck %s --check-prefix=V8MBASELINE
+; RUN: llc < %s -mtriple=thumbv8m.main-linux-gnueabi | FileCheck %s --check-prefix=V8MMAINLINE
+; RUN: llc < %s -mtriple=thumbv8m.main-linux-gnueabi -mattr=+dsp | FileCheck %s --check-prefix=V8MMAINLINE_DSP
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-neon,+d16 | FileCheck %s --check-prefix=CORTEX-A5-NONEON
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A5-NOFPU
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-NOFPU-FAST
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A8-SOFT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A8-SOFT-FAST
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A8-HARD
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=hard -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A8-HARD-FAST
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A8-SOFT
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-SOFT-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD
@@ -96,6 +105,12 @@
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 | FileCheck %s --check-prefix=CORTEX-R7
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R7-FAST
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 | FileCheck %s --check-prefix=CORTEX-R8
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R8-FAST
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 | FileCheck %s --check-prefix=CORTEX-A32
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A32-FAST
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=CORTEX-A35
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A35-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
@@ -108,6 +123,7 @@
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=CORTEX-A72
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A72-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a73 | FileCheck %s --check-prefix=CORTEX-A73
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m1 | FileCheck %s --check-prefix=EXYNOS-M1
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m1 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-M1-FAST
@@ -124,7 +140,6 @@
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,,+d16,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=pic | FileCheck %s --check-prefix=RELOC-PIC
; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=static | FileCheck %s --check-prefix=RELOC-OTHER
-; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=default | FileCheck %s --check-prefix=RELOC-OTHER
; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=dynamic-no-pic | FileCheck %s --check-prefix=RELOC-OTHER
; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=RELOC-OTHER
; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=PCS-R9-USE
@@ -135,6 +150,8 @@
; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi | FileCheck %s --check-prefix=NO-STRICT-ALIGN
; ARMv8a (AArch32)
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a32 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a32 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a35 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
@@ -373,6 +390,31 @@
; V8-FPARMv8-NEON-CRYPTO: .fpu crypto-neon-fp-armv8
; V8-FPARMv8-NEON-CRYPTO: .eabi_attribute 12, 3
+; V8MBASELINE: .syntax unified
+; '6' is Tag_CPU_arch, '16' is ARM v8-M Baseline
+; V8MBASELINE: .eabi_attribute 6, 16
+; '7' is Tag_CPU_arch_profile, '77' is 'M'
+; V8MBASELINE: .eabi_attribute 7, 77
+; '8' is Tag_ARM_ISA_use
+; V8MBASELINE: .eabi_attribute 8, 0
+; '9' is Tag_Thumb_ISA_use
+; V8MBASELINE: .eabi_attribute 9, 3
+
+; V8MMAINLINE: .syntax unified
+; '6' is Tag_CPU_arch, '17' is ARM v8-M Mainline
+; V8MMAINLINE: .eabi_attribute 6, 17
+; V8MMAINLINE: .eabi_attribute 7, 77
+; V8MMAINLINE: .eabi_attribute 8, 0
+; V8MMAINLINE: .eabi_attribute 9, 3
+; V8MMAINLINE_DSP-NOT: .eabi_attribute 46
+
+; V8MMAINLINE_DSP: .syntax unified
+; V8MBASELINE_DSP: .eabi_attribute 6, 17
+; V8MBASELINE_DSP: .eabi_attribute 7, 77
+; V8MMAINLINE_DSP: .eabi_attribute 8, 0
+; V8MMAINLINE_DSP: .eabi_attribute 9, 3
+; V8MMAINLINE_DSP: .eabi_attribute 46, 1
+
; Tag_CPU_unaligned_access
; NO-STRICT-ALIGN: .eabi_attribute 34, 1
; STRICT-ALIGN: .eabi_attribute 34, 0
@@ -462,6 +504,9 @@
; CORTEX-A7-NOFPU: .eabi_attribute 44, 2
; CORTEX-A7-FPUV4: .eabi_attribute 44, 2
+; Tag_DSP_extension
+; CORTEX-A7-CHECK-NOT: .eabi_attribute 46
+
; Tag_Virtualization_use
; CORTEX-A7-CHECK: .eabi_attribute 68, 3
; CORTEX-A7-NOFPU: .eabi_attribute 68, 3
@@ -486,7 +531,7 @@
; CORTEX-A5-DEFAULT: .eabi_attribute 68, 1
; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 19
-;; The A5 defaults to a VFPv4 FPU, so it flushed preserving sign when -ffast-math
+;; The A5 defaults to a VFPv4 FPU, so it flushed preserving the sign when -ffast-math
;; is given.
; CORTEX-A5-DEFAULT-FAST: .eabi_attribute 20, 2
; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 21
@@ -543,6 +588,28 @@
; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 22
; CORTEX-A5-NOFPU-FAST: .eabi_attribute 23, 1
+; CORTEX-A8-SOFT: .cpu cortex-a8
+; CORTEX-A8-SOFT: .eabi_attribute 6, 10
+; CORTEX-A8-SOFT: .eabi_attribute 7, 65
+; CORTEX-A8-SOFT: .eabi_attribute 8, 1
+; CORTEX-A8-SOFT: .eabi_attribute 9, 2
+; CORTEX-A8-SOFT: .fpu neon
+; CORTEX-A8-SOFT-NOT: .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-A8-SOFT: .eabi_attribute 20, 1
+; CORTEX-A8-SOFT: .eabi_attribute 21, 1
+; CORTEX-A8-SOFT-NOT: .eabi_attribute 22
+; CORTEX-A8-SOFT: .eabi_attribute 23, 3
+; CORTEX-A8-SOFT: .eabi_attribute 24, 1
+; CORTEX-A8-SOFT: .eabi_attribute 25, 1
+; CORTEX-A8-SOFT-NOT: .eabi_attribute 27
+; CORTEX-A8-SOFT-NOT: .eabi_attribute 28
+; CORTEX-A8-SOFT-NOT: .eabi_attribute 36, 1
+; CORTEX-A8-SOFT: .eabi_attribute 38, 1
+; CORTEX-A8-SOFT-NOT: .eabi_attribute 42, 1
+; CORTEX-A8-SOFT-NOT: .eabi_attribute 44
+; CORTEX-A8-SOFT: .eabi_attribute 68, 1
+
; CORTEX-A9-SOFT: .cpu cortex-a9
; CORTEX-A9-SOFT: .eabi_attribute 6, 10
; CORTEX-A9-SOFT: .eabi_attribute 7, 65
@@ -565,14 +632,39 @@
; CORTEX-A9-SOFT-NOT: .eabi_attribute 44
; CORTEX-A9-SOFT: .eabi_attribute 68, 1
+; CORTEX-A8-SOFT-FAST-NOT: .eabi_attribute 19
; CORTEX-A9-SOFT-FAST-NOT: .eabi_attribute 19
-;; The A9 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; The A9 defaults to a VFPv3 FPU, so it flushes preserving the sign when
;; -ffast-math is specified.
+; CORTEX-A8-SOFT-FAST: .eabi_attribute 20, 2
; CORTEX-A9-SOFT-FAST: .eabi_attribute 20, 2
; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 21
; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 22
; CORTEX-A5-SOFT-FAST: .eabi_attribute 23, 1
+; CORTEX-A8-HARD: .cpu cortex-a8
+; CORTEX-A8-HARD: .eabi_attribute 6, 10
+; CORTEX-A8-HARD: .eabi_attribute 7, 65
+; CORTEX-A8-HARD: .eabi_attribute 8, 1
+; CORTEX-A8-HARD: .eabi_attribute 9, 2
+; CORTEX-A8-HARD: .fpu neon
+; CORTEX-A8-HARD-NOT: .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-A8-HARD: .eabi_attribute 20, 1
+; CORTEX-A8-HARD: .eabi_attribute 21, 1
+; CORTEX-A8-HARD-NOT: .eabi_attribute 22
+; CORTEX-A8-HARD: .eabi_attribute 23, 3
+; CORTEX-A8-HARD: .eabi_attribute 24, 1
+; CORTEX-A8-HARD: .eabi_attribute 25, 1
+; CORTEX-A8-HARD-NOT: .eabi_attribute 27
+; CORTEX-A8-HARD: .eabi_attribute 28, 1
+; CORTEX-A8-HARD-NOT: .eabi_attribute 36, 1
+; CORTEX-A8-HARD: .eabi_attribute 38, 1
+; CORTEX-A8-HARD-NOT: .eabi_attribute 42, 1
+; CORTEX-A8-HARD: .eabi_attribute 68, 1
+
+
+
; CORTEX-A9-HARD: .cpu cortex-a9
; CORTEX-A9-HARD: .eabi_attribute 6, 10
; CORTEX-A9-HARD: .eabi_attribute 7, 65
@@ -594,8 +686,16 @@
; CORTEX-A9-HARD: .eabi_attribute 42, 1
; CORTEX-A9-HARD: .eabi_attribute 68, 1
+; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 19
+;; The A8 defaults to a VFPv3 FPU, so it flushes preserving the sign when
+;; -ffast-math is specified.
+; CORTEX-A8-HARD-FAST: .eabi_attribute 20, 2
+; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 21
+; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 22
+; CORTEX-A8-HARD-FAST: .eabi_attribute 23, 1
+
; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 19
-;; The A9 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; The A9 defaults to a VFPv3 FPU, so it flushes preserving the sign when
;; -ffast-math is specified.
; CORTEX-A9-HARD-FAST: .eabi_attribute 20, 2
; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 21
@@ -621,7 +721,7 @@
; CORTEX-A12-DEFAULT: .eabi_attribute 68, 3
; CORTEX-A12-DEFAULT-FAST-NOT: .eabi_attribute 19
-;; The A12 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; The A12 defaults to a VFPv3 FPU, so it flushes preserving the sign when
;; -ffast-math is specified.
; CORTEX-A12-DEFAULT-FAST: .eabi_attribute 20, 2
; CORTEX-A12-HARD-FAST-NOT: .eabi_attribute 21
@@ -678,7 +778,7 @@
; CORTEX-A15: .eabi_attribute 68, 3
; CORTEX-A15-FAST-NOT: .eabi_attribute 19
-;; The A15 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; The A15 defaults to a VFPv3 FPU, so it flushes preserving the sign when
;; -ffast-math is specified.
; CORTEX-A15-FAST: .eabi_attribute 20, 2
; CORTEX-A15-FAST-NOT: .eabi_attribute 21
@@ -704,7 +804,7 @@
; CORTEX-A17-DEFAULT: .eabi_attribute 68, 3
; CORTEX-A17-FAST-NOT: .eabi_attribute 19
-;; The A17 defaults to a VFPv3 FPU, so it flushes preseving sign when
+;; The A17 defaults to a VFPv3 FPU, so it flushes preserving the sign when
;; -ffast-math is specified.
; CORTEX-A17-FAST: .eabi_attribute 20, 2
; CORTEX-A17-FAST-NOT: .eabi_attribute 21
@@ -950,7 +1050,7 @@
; CORTEX-M4-SOFT-NOT: .eabi_attribute 68
; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 19
-;; The M4 defaults to a VFPv4 FPU, so it flushes preseving sign when
+;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when
;; -ffast-math is specified.
; CORTEX-M4-SOFT-FAST: .eabi_attribute 20, 2
; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 21
@@ -980,7 +1080,7 @@
; CORTEX-M4-HARD-NOT: .eabi_attribute 68
; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 19
-;; The M4 defaults to a VFPv4 FPU, so it flushes preseving sign when
+;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when
;; -ffast-math is specified.
; CORTEX-M4-HARD-FAST: .eabi_attribute 20, 2
; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 21
@@ -1100,7 +1200,7 @@
; CORTEX-R7: .eabi_attribute 7, 82
; CORTEX-R7: .eabi_attribute 8, 1
; CORTEX-R7: .eabi_attribute 9, 2
-; CORTEX-R7: .fpu vfpv3xd
+; CORTEX-R7: .fpu vfpv3-d16-fp16
; CORTEX-R7-NOT: .eabi_attribute 19
;; We default to IEEE 754 compliance
; CORTEX-R7: .eabi_attribute 20, 1
@@ -1109,7 +1209,6 @@
; CORTEX-R7: .eabi_attribute 23, 3
; CORTEX-R7: .eabi_attribute 24, 1
; CORTEX-R7: .eabi_attribute 25, 1
-; CORTEX-R7: .eabi_attribute 27, 1
; CORTEX-R7-NOT: .eabi_attribute 28
; CORTEX-R7: .eabi_attribute 36, 1
; CORTEX-R7: .eabi_attribute 38, 1
@@ -1124,6 +1223,64 @@
; CORTEX-R7-FAST-NOT: .eabi_attribute 22
; CORTEX-R7-FAST: .eabi_attribute 23, 1
+; CORTEX-R8: .cpu cortex-r8
+; CORTEX-R8: .eabi_attribute 6, 10
+; CORTEX-R8: .eabi_attribute 7, 82
+; CORTEX-R8: .eabi_attribute 8, 1
+; CORTEX-R8: .eabi_attribute 9, 2
+; CORTEX-R8: .fpu vfpv3-d16-fp16
+; CORTEX-R8-NOT: .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-R8: .eabi_attribute 20, 1
+; CORTEX-R8: .eabi_attribute 21, 1
+; CORTEX-R8-NOT: .eabi_attribute 22
+; CORTEX-R8: .eabi_attribute 23, 3
+; CORTEX-R8: .eabi_attribute 24, 1
+; CORTEX-R8: .eabi_attribute 25, 1
+; CORTEX-R8-NOT: .eabi_attribute 28
+; CORTEX-R8: .eabi_attribute 36, 1
+; CORTEX-R8: .eabi_attribute 38, 1
+; CORTEX-R8: .eabi_attribute 42, 1
+; CORTEX-R8: .eabi_attribute 44, 2
+; CORTEX-R8-NOT: .eabi_attribute 68
+
+; CORTEX-R8-FAST-NOT: .eabi_attribute 19
+;; The R8 has the VFPv3 FP unit, which always flushes preserving sign.
+; CORTEX-R8-FAST: .eabi_attribute 20, 2
+; CORTEX-R8-FAST-NOT: .eabi_attribute 21
+; CORTEX-R8-FAST-NOT: .eabi_attribute 22
+; CORTEX-R8-FAST: .eabi_attribute 23, 1
+
+; CORTEX-A32: .cpu cortex-a32
+; CORTEX-A32: .eabi_attribute 6, 14
+; CORTEX-A32: .eabi_attribute 7, 65
+; CORTEX-A32: .eabi_attribute 8, 1
+; CORTEX-A32: .eabi_attribute 9, 2
+; CORTEX-A32: .fpu crypto-neon-fp-armv8
+; CORTEX-A32: .eabi_attribute 12, 3
+; CORTEX-A32-NOT: .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-A32: .eabi_attribute 20, 1
+; CORTEX-A32: .eabi_attribute 21, 1
+; CORTEX-A32-NOT: .eabi_attribute 22
+; CORTEX-A32: .eabi_attribute 23, 3
+; CORTEX-A32: .eabi_attribute 24, 1
+; CORTEX-A32: .eabi_attribute 25, 1
+; CORTEX-A32-NOT: .eabi_attribute 27
+; CORTEX-A32-NOT: .eabi_attribute 28
+; CORTEX-A32: .eabi_attribute 36, 1
+; CORTEX-A32: .eabi_attribute 38, 1
+; CORTEX-A32: .eabi_attribute 42, 1
+; CORTEX-A32-NOT: .eabi_attribute 44
+; CORTEX-A32: .eabi_attribute 68, 3
+
+; CORTEX-A32-FAST-NOT: .eabi_attribute 19
+;; The A32 has the ARMv8 FP unit, which always flushes preserving sign.
+; CORTEX-A32-FAST: .eabi_attribute 20, 2
+; CORTEX-A32-FAST-NOT: .eabi_attribute 21
+; CORTEX-A32-FAST-NOT: .eabi_attribute 22
+; CORTEX-A32-FAST: .eabi_attribute 23, 1
+
; CORTEX-A35: .cpu cortex-a35
; CORTEX-A35: .eabi_attribute 6, 14
; CORTEX-A35: .eabi_attribute 7, 65
@@ -1244,6 +1401,30 @@
; CORTEX-A72-FAST-NOT: .eabi_attribute 22
; CORTEX-A72-FAST: .eabi_attribute 23, 1
+; CORTEX-A73: .cpu cortex-a73
+; CORTEX-A73: .eabi_attribute 6, 14
+; CORTEX-A73: .eabi_attribute 7, 65
+; CORTEX-A73: .eabi_attribute 8, 1
+; CORTEX-A73: .eabi_attribute 9, 2
+; CORTEX-A73: .fpu crypto-neon-fp-armv8
+; CORTEX-A73: .eabi_attribute 12, 3
+; CORTEX-A73-NOT: .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-A73: .eabi_attribute 20, 1
+; CORTEX-A73: .eabi_attribute 21, 1
+; CORTEX-A73-NOT: .eabi_attribute 22
+; CORTEX-A73: .eabi_attribute 23, 3
+; CORTEX-A73: .eabi_attribute 24, 1
+; CORTEX-A73: .eabi_attribute 25, 1
+; CORTEX-A73-NOT: .eabi_attribute 27
+; CORTEX-A73-NOT: .eabi_attribute 28
+; CORTEX-A73: .eabi_attribute 36, 1
+; CORTEX-A73: .eabi_attribute 38, 1
+; CORTEX-A73: .eabi_attribute 42, 1
+; CORTEX-A73-NOT: .eabi_attribute 44
+; CORTEX-A73: .eabi_attribute 14, 0
+; CORTEX-A73: .eabi_attribute 68, 3
+
; EXYNOS-M1: .cpu exynos-m1
; EXYNOS-M1: .eabi_attribute 6, 14
; EXYNOS-M1: .eabi_attribute 7, 65
diff --git a/test/CodeGen/ARM/byval_load_align.ll b/test/CodeGen/ARM/byval_load_align.ll
index 2c0910c71d2f..d00d926c7a05 100644
--- a/test/CodeGen/ARM/byval_load_align.ll
+++ b/test/CodeGen/ARM/byval_load_align.ll
@@ -7,7 +7,7 @@
; CHECK: ldr r2, [r[[REG]], #4]
; CHECK: ldr r3, [r[[REG]], #8]
; CHECK-NOT: ldm
-; CHECK: .align 1 @ @sID
+; CHECK: .p2align 1 @ @sID
%struct.ModuleID = type { [32 x i8], [32 x i8], i16 }
diff --git a/test/CodeGen/ARM/call-tc.ll b/test/CodeGen/ARM/call-tc.ll
index 8821029520fe..53fa8920ec04 100644
--- a/test/CodeGen/ARM/call-tc.ll
+++ b/test/CodeGen/ARM/call-tc.ll
@@ -12,7 +12,7 @@ declare void @g(i32, i32, i32, i32)
define void @t1() {
; CHECKELF-LABEL: t1:
-; CHECKELF: bl g(PLT)
+; CHECKELF: bl g
call void @g( i32 1, i32 2, i32 3, i32 4 )
ret void
}
@@ -33,7 +33,7 @@ define void @t3() {
; CHECKV6-LABEL: t3:
; CHECKV6: b _t2
; CHECKELF-LABEL: t3:
-; CHECKELF: b t2(PLT)
+; CHECKELF: b t2
; CHECKT2D-LABEL: t3:
; CHECKT2D: b.w _t2
@@ -47,7 +47,7 @@ entry:
; CHECKV6-LABEL: t4:
; CHECKV6: b _sin
; CHECKELF-LABEL: t4:
-; CHECKELF: b sin(PLT)
+; CHECKELF: b sin
%0 = tail call double @sin(double %a) nounwind readonly ; <double> [#uses=1]
ret double %0
}
@@ -57,7 +57,7 @@ entry:
; CHECKV6-LABEL: t5:
; CHECKV6: b _sinf
; CHECKELF-LABEL: t5:
-; CHECKELF: b sinf(PLT)
+; CHECKELF: b sinf
%0 = tail call float @sinf(float %a) nounwind readonly ; <float> [#uses=1]
ret float %0
}
@@ -71,7 +71,7 @@ entry:
; CHECKV6-LABEL: t6:
; CHECKV6: b ___divsi3
; CHECKELF-LABEL: t6:
-; CHECKELF: b __aeabi_idiv(PLT)
+; CHECKELF: b __aeabi_idiv
%0 = sdiv i32 %a, %b
ret i32 %0
}
@@ -87,7 +87,7 @@ entry:
; CHECKT2D-NEXT: bne.w _foo
; CHECKT2D-NEXT: push
; CHECKT2D-NEXT: mov r7, sp
-; CHECKT2D-NEXT: blx _foo
+; CHECKT2D-NEXT: bl _foo
br i1 undef, label %bb, label %bb1.lr.ph
bb1.lr.ph:
@@ -150,8 +150,8 @@ declare i32 @c(i32)
define i32 @t9() nounwind {
; CHECKT2D-LABEL: t9:
-; CHECKT2D: blx __ZN9MutexLockC1Ev
-; CHECKT2D: blx __ZN9MutexLockD1Ev
+; CHECKT2D: bl __ZN9MutexLockC1Ev
+; CHECKT2D: bl __ZN9MutexLockD1Ev
; CHECKT2D: b.w ___divsi3
%lock = alloca %class.MutexLock, align 1
%1 = call %class.MutexLock* @_ZN9MutexLockC1Ev(%class.MutexLock* %lock)
@@ -170,7 +170,7 @@ declare %class.MutexLock* @_ZN9MutexLockD1Ev(%class.MutexLock*) unnamed_addr nou
; otherwise the call to floorf is lost.
define float @libcall_tc_test2(float* nocapture %a, float %b) {
; CHECKT2D-LABEL: libcall_tc_test2:
-; CHECKT2D: blx _floorf
+; CHECKT2D: bl _floorf
; CHECKT2D: b.w _truncf
%1 = load float, float* %a, align 4
%call = tail call float @floorf(float %1)
diff --git a/test/CodeGen/ARM/call.ll b/test/CodeGen/ARM/call.ll
index 87252a91e1b0..05ea556e234c 100644
--- a/test/CodeGen/ARM/call.ll
+++ b/test/CodeGen/ARM/call.ll
@@ -12,7 +12,7 @@
declare void @g(i32, i32, i32, i32)
define void @f() {
-; CHECKELF: PLT
+; CHECKELF: bl g
call void @g( i32 1, i32 2, i32 3, i32 4 )
ret void
}
diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll
index 7ea9be2c61e6..558e2b0e43f7 100644
--- a/test/CodeGen/ARM/carry.ll
+++ b/test/CodeGen/ARM/carry.ll
@@ -11,9 +11,10 @@ entry:
define i64 @f2(i64 %a, i64 %b) {
; CHECK-LABEL: f2:
-; CHECK: adc r
-; CHECK: subs r
-; CHECK: sbc r
+; CHECK: lsl r
+; CHECK: orr r
+; CHECK: rsbs r
+; CHECK: sbc r
entry:
%tmp1 = shl i64 %a, 1
%tmp2 = sub i64 %tmp1, %b
diff --git a/test/CodeGen/ARM/cdp.ll b/test/CodeGen/ARM/cdp.ll
new file mode 100644
index 000000000000..99ec3b284462
--- /dev/null
+++ b/test/CodeGen/ARM/cdp.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s
+; RUN: not llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.arm.cdp
+define void @cdp(i32 %a) #0 {
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %1 = load i32, i32* %a.addr, align 4
+ call void @llvm.arm.cdp(i32 %1, i32 2, i32 3, i32 4, i32 5, i32 6)
+ ret void
+}
+
+declare void @llvm.arm.cdp(i32, i32, i32, i32, i32, i32) nounwind
diff --git a/test/CodeGen/ARM/cdp2.ll b/test/CodeGen/ARM/cdp2.ll
new file mode 100644
index 000000000000..c2a00d0fdd72
--- /dev/null
+++ b/test/CodeGen/ARM/cdp2.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s
+; RUN: not llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.arm.cdp2
+define void @cdp2(i32 %a) #0 {
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %1 = load i32, i32* %a.addr, align 4
+ call void @llvm.arm.cdp2(i32 %1, i32 2, i32 3, i32 4, i32 5, i32 6)
+ ret void
+}
+
+declare void @llvm.arm.cdp2(i32, i32, i32, i32, i32, i32) nounwind
diff --git a/test/CodeGen/ARM/cmpxchg-O0.ll b/test/CodeGen/ARM/cmpxchg-O0.ll
new file mode 100644
index 000000000000..ec3005dd8ada
--- /dev/null
+++ b/test/CodeGen/ARM/cmpxchg-O0.ll
@@ -0,0 +1,113 @@
+; RUN: llc -verify-machineinstrs -mtriple=armv7-linux-gnu -O0 %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=thumbv8-linux-gnu -O0 %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=thumbv6m-none-eabi -O0 %s -o - | FileCheck %s --check-prefix=CHECK-T1
+
+; CHECK-T1-NOT: ldrex
+; CHECK-T1-NOT: strex
+
+define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_8:
+; CHECK: dmb ish
+; CHECK: uxtb [[DESIRED:r[0-9]+]], [[DESIRED]]
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldrexb [[OLD:r[0-9]+]], [r0]
+; CHECK: cmp [[OLD]], [[DESIRED]]
+; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: strexb [[STATUS:r[0-9]+]], r2, [r0]
+; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
+; CHECK: bne [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
+; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
+; CHECK: dmb ish
+ %res = cmpxchg i8* %addr, i8 %desired, i8 %new seq_cst monotonic
+ ret { i8, i1 } %res
+}
+
+define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_16:
+; CHECK: dmb ish
+; CHECK: uxth [[DESIRED:r[0-9]+]], [[DESIRED]]
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldrexh [[OLD:r[0-9]+]], [r0]
+; CHECK: cmp [[OLD]], [[DESIRED]]
+; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: strexh [[STATUS:r[0-9]+]], r2, [r0]
+; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
+; CHECK: bne [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
+; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
+; CHECK: dmb ish
+ %res = cmpxchg i16* %addr, i16 %desired, i16 %new seq_cst monotonic
+ ret { i16, i1 } %res
+}
+
+define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_32:
+; CHECK: dmb ish
+; CHECK-NOT: uxt
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldrex [[OLD:r[0-9]+]], [r0]
+; CHECK: cmp [[OLD]], [[DESIRED]]
+; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0]
+; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
+; CHECK: bne [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
+; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
+; CHECK: dmb ish
+ %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ ret { i32, i1 } %res
+}
+
+define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind {
+; CHECK-LABEL: test_cmpxchg_64:
+; CHECK: dmb ish
+; CHECK-NOT: uxt
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [r0]
+; CHECK: cmp [[OLDLO]], r6
+; CHECK: sbcs{{(\.w)?}} [[STATUS:r[0-9]+]], [[OLDHI]], r7
+; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: strexd [[STATUS]], r4, r5, [r0]
+; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
+; CHECK: bne [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: dmb ish
+ %res = cmpxchg i64* %addr, i64 %desired, i64 %new seq_cst monotonic
+ ret { i64, i1 } %res
+}
+
+define { i64, i1 } @test_nontrivial_args(i64* %addr, i64 %desired, i64 %new) {
+; CHECK-LABEL: test_nontrivial_args:
+; CHECK: dmb ish
+; CHECK-NOT: uxt
+; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [r0]
+; CHECK: cmp [[OLDLO]], {{r[0-9]+}}
+; CHECK: sbcs{{(\.w)?}} [[STATUS:r[0-9]+]], [[OLDHI]], {{r[0-9]+}}
+; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: strexd [[STATUS]], {{r[0-9]+}}, {{r[0-9]+}}, [r0]
+; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
+; CHECK: bne [[RETRY]]
+; CHECK: [[DONE]]:
+; CHECK: dmb ish
+
+ %desired1 = add i64 %desired, 1
+ %new1 = add i64 %new, 1
+ %res = cmpxchg i64* %addr, i64 %desired1, i64 %new1 seq_cst seq_cst
+ ret { i64, i1 } %res
+}
+
+; The following used to trigger an assertion when creating a spill on thumb2
+; for a physreg with RC==GPRPairRegClass.
+; CHECK-LABEL: test_cmpxchg_spillbug:
+; CHECK: ldrexd
+; CHECK: strexd
+; CHECK: bne
+define void @test_cmpxchg_spillbug() {
+ %v = cmpxchg i64* undef, i64 undef, i64 undef seq_cst seq_cst
+ ret void
+}
diff --git a/test/CodeGen/ARM/cmpxchg-idioms.ll b/test/CodeGen/ARM/cmpxchg-idioms.ll
index 81e05acfef79..283202f0cc1f 100644
--- a/test/CodeGen/ARM/cmpxchg-idioms.ll
+++ b/test/CodeGen/ARM/cmpxchg-idioms.ll
@@ -3,26 +3,31 @@
define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
; CHECK-LABEL: test_return:
-; CHECK: dmb ishst
-
-; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrex [[LOADED:r[0-9]+]], [r0]
; CHECK: cmp [[LOADED]], r1
; CHECK: bne [[FAILED:LBB[0-9]+_[0-9]+]]
+; CHECK: dmb ishst
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: strex [[STATUS:r[0-9]+]], {{r[0-9]+}}, [r0]
-; CHECK: cmp [[STATUS]], #0
-; CHECK: bne [[LOOP]]
+; CHECK: cbz [[STATUS]], [[SUCCESS:LBB[0-9]+_[0-9]+]]
+; CHECK: ldrex [[LOADED]], [r0]
+; CHECK: cmp [[LOADED]], r1
+; CHECK: beq [[LOOP]]
+
+; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: clrex
; CHECK: dmb ish
-; CHECK: movs r0, #1
+; CHECK: movs r0, #0
; CHECK: bx lr
-; CHECK: [[FAILED]]:
+; CHECK: [[SUCCESS]]:
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: dmb ish
-; CHECK: movs r0, #0
+; CHECK: movs r0, #1
; CHECK: bx lr
%pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
@@ -34,26 +39,33 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
; CHECK-LABEL: test_return_bool:
-; CHECK: dmb ishst
; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
-; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
; CHECK: cmp [[LOADED]], [[OLDBYTE]]
; CHECK: bne [[FAIL:LBB[0-9]+_[0-9]+]]
+; CHECK: dmb ishst
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: strexb [[STATUS:r[0-9]+]], {{r[0-9]+}}, [r0]
-; CHECK: cmp [[STATUS]], #0
-; CHECK: bne [[LOOP]]
+; CHECK: cbz [[STATUS]], [[SUCCESS:LBB[0-9]+_[0-9]+]]
+
+; CHECK: ldrexb [[LOADED]], [r0]
+; CHECK: cmp [[LOADED]], [[OLDBYTE]]
+; CHECK: beq [[LOOP]]
+
; FIXME: this eor is redundant. Need to teach DAG combine that.
-; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: movs [[TMP:r[0-9]+]], #1
+; CHECK: [[FAIL]]:
+; CHECK: clrex
+; CHECK: movs [[TMP:r[0-9]+]], #0
; CHECK: eor r0, [[TMP]], #1
; CHECK: bx lr
-; CHECK: [[FAIL]]:
-; CHECK: movs [[TMP:r[0-9]+]], #0
+; CHECK: [[SUCCESS]]:
+; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: movs [[TMP:r[0-9]+]], #1
; CHECK: eor r0, [[TMP]], #1
; CHECK: bx lr
@@ -67,26 +79,31 @@ define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) {
; CHECK-LABEL: test_conditional:
-; CHECK: dmb ishst
-
-; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrex [[LOADED:r[0-9]+]], [r0]
; CHECK: cmp [[LOADED]], r1
; CHECK: bne [[FAILED:LBB[0-9]+_[0-9]+]]
+; CHECK: dmb ishst
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0]
-; CHECK: cmp [[STATUS]], #0
-; CHECK: bne [[LOOP]]
+; CHECK: cbz [[STATUS]], [[SUCCESS:LBB[0-9]+_[0-9]+]]
-; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: dmb ish
-; CHECK: b.w _bar
+; CHECK: ldrex [[LOADED]], [r0]
+; CHECK: cmp [[LOADED]], r1
+; CHECK: beq [[LOOP]]
; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: clrex
; CHECK: dmb ish
; CHECK: b.w _baz
+; CHECK: [[SUCCESS]]:
+; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: dmb ish
+; CHECK: b.w _bar
+
%pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
%success = extractvalue { i32, i1 } %pair, 1
br i1 %success, label %true, label %false
diff --git a/test/CodeGen/ARM/cmpxchg-weak.ll b/test/CodeGen/ARM/cmpxchg-weak.ll
index 1eac9c41cf92..4038528c91bc 100644
--- a/test/CodeGen/ARM/cmpxchg-weak.ll
+++ b/test/CodeGen/ARM/cmpxchg-weak.ll
@@ -6,11 +6,11 @@ define void @test_cmpxchg_weak(i32 *%addr, i32 %desired, i32 %new) {
%pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
%oldval = extractvalue { i32, i1 } %pair, 0
; CHECK-NEXT: BB#0:
-; CHECK-NEXT: dmb ish
; CHECK-NEXT: ldrex [[LOADED:r[0-9]+]], [r0]
; CHECK-NEXT: cmp [[LOADED]], r1
; CHECK-NEXT: bne [[LDFAILBB:LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: BB#1:
+; CHECK-NEXT: dmb ish
; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r2, [r0]
; CHECK-NEXT: cmp [[SUCCESS]], #0
; CHECK-NEXT: bne [[FAILBB:LBB[0-9]+_[0-9]+]]
@@ -36,13 +36,13 @@ define i1 @test_cmpxchg_weak_to_bool(i32, i32 *%addr, i32 %desired, i32 %new) {
%success = extractvalue { i32, i1 } %pair, 1
; CHECK-NEXT: BB#0:
-; CHECK-NEXT: dmb ish
; CHECK-NEXT: ldrex [[LOADED:r[0-9]+]], [r1]
; CHECK-NEXT: cmp [[LOADED]], r2
; CHECK-NEXT: bne [[LDFAILBB:LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: BB#1:
-; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r3, [r1]
+; CHECK-NEXT: dmb ish
; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r3, [r1]
; CHECK-NEXT: cmp [[SUCCESS]], #0
; CHECK-NEXT: bxne lr
; CHECK-NEXT: dmb ish
diff --git a/test/CodeGen/ARM/coalesce-dbgvalue.ll b/test/CodeGen/ARM/coalesce-dbgvalue.ll
index 4468f1ec9c42..cd45af338fde 100644
--- a/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/test/CodeGen/ARM/coalesce-dbgvalue.ll
@@ -79,11 +79,10 @@ attributes #3 = { nounwind }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!33}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 182024) (llvm/trunk 182023)", isOptimized: true, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !15, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 182024) (llvm/trunk 182023)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !15, imports: !2)
!1 = !DIFile(filename: "pr16110.c", directory: "/d/b")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "pr16110", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 7, file: !1, scope: !5, type: !6, variables: !9)
+!4 = distinct !DISubprogram(name: "pr16110", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, variables: !9)
!5 = !DIFile(filename: "pr16110.c", directory: "/d/b")
!6 = !DISubroutineType(types: !7)
!7 = !{!8}
diff --git a/test/CodeGen/ARM/code-placement.ll b/test/CodeGen/ARM/code-placement.ll
index bf5cf52d8b54..8eaf3d5ab6b2 100644
--- a/test/CodeGen/ARM/code-placement.ll
+++ b/test/CodeGen/ARM/code-placement.ll
@@ -12,9 +12,9 @@ entry:
br i1 %0, label %bb2, label %bb
bb:
-; CHECK: LBB0_1:
-; CHECK: bne LBB0_1
-; CHECK-NOT: b LBB0_1
+; CHECK: LBB0_2:
+; CHECK: bne LBB0_2
+; CHECK-NOT: b LBB0_2
; CHECK: bx lr
%list_addr.05 = phi %struct.list_head* [ %2, %bb ], [ %list, %entry ]
%next.04 = phi %struct.list_head* [ %list_addr.05, %bb ], [ null, %entry ]
diff --git a/test/CodeGen/ARM/crash-greedy.ll b/test/CodeGen/ARM/crash-greedy.ll
index a3d49f620e9c..6a58bb871d35 100644
--- a/test/CodeGen/ARM/crash-greedy.ll
+++ b/test/CodeGen/ARM/crash-greedy.ll
@@ -30,7 +30,7 @@ for.end: ; preds = %cond.end
%call85 = tail call double @exp(double %mul84) nounwind
%mul86 = fmul double %conv78, %call85
%add88 = fadd double 0.000000e+00, %mul86
-; CHECK: blx _exp
+; CHECK: bl _exp
%call100 = tail call double @exp(double %mul84) nounwind
%mul101 = fmul double undef, %call100
%add103 = fadd double %add46, %mul101
diff --git a/test/CodeGen/ARM/cxx-tlscc.ll b/test/CodeGen/ARM/cxx-tlscc.ll
index 11173bbb1978..5d017bbeebca 100644
--- a/test/CodeGen/ARM/cxx-tlscc.ll
+++ b/test/CodeGen/ARM/cxx-tlscc.ll
@@ -1,7 +1,12 @@
; RUN: llc < %s -mtriple=armv7k-apple-watchos2.0 | FileCheck %s
-; RUN: llc < %s -mtriple=armv7k-apple-watchos2.0 -enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s
+; RUN: llc < %s -mtriple=armv7k-apple-watchos2.0 -enable-shrink-wrap=true | FileCheck %s
; RUN: llc < %s -mtriple=armv7-apple-ios8.0 | FileCheck %s
-; RUN: llc < %s -mtriple=armv7-apple-ios8.0 -enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s
+; RUN: llc < %s -mtriple=armv7-apple-ios8.0 -enable-shrink-wrap=true | FileCheck %s
+
+; RUN: llc < %s -mtriple=armv7k-apple-watchos2.0 -O0 | FileCheck --check-prefix=CHECK-O0 --check-prefix=WATCH-O0 %s
+; RUN: llc < %s -mtriple=armv7-apple-ios8.0 -O0 | FileCheck --check-prefix=CHECK-O0 --check-prefix=IOS-O0 %s
+
+; RUN: llc < %s -mtriple=thumbv7-apple-ios8.0 | FileCheck --check-prefix=THUMB %s
%struct.S = type { i8 }
@@ -10,10 +15,24 @@
@__tls_guard = internal thread_local unnamed_addr global i1 false
@sum1 = internal thread_local global i32 0, align 4
+%class.C = type { i32 }
+@tC = internal thread_local global %class.C zeroinitializer, align 4
+
declare %struct.S* @_ZN1SC1Ev(%struct.S* returned)
declare %struct.S* @_ZN1SD1Ev(%struct.S* returned)
declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
+; Make sure Epilog does not overwrite an explicitly-handled CSR in CXX_FAST_TLS.
+; THUMB-LABEL: _ZTW2sg
+; THUMB: push {{.*}}lr
+; THUMB: blx
+; THUMB: bne [[TH_end:.?LBB0_[0-9]+]]
+; THUMB: blx
+; THUMB: tlv_atexit
+; THUMB: [[TH_end]]:
+; THUMB: blx
+; THUMB: r4
+; THUMB: pop {{.*}}r4
define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
%.b.i = load i1, i1* @__tls_guard, align 1
br i1 %.b.i, label %__tls_init.exit, label %init.i
@@ -29,14 +48,13 @@ __tls_init.exit:
}
; CHECK-LABEL: _ZTW2sg
-; CHECK: push {lr}
-; CHECK-NOT: push {r1, r2, r3, r4, r7, lr}
-; CHECK-NOT: push {r9, r12}
+; CHECK: push {r4, r5, r7, lr}
+; CHECK: push {r11, r12}
; CHECK-NOT: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
; CHECK-NOT: vpush {d0, d1, d2, d3, d4, d5, d6, d7}
; CHECK: blx
; CHECK: bne [[BB_end:.?LBB0_[0-9]+]]
-; CHECK; blx
+; CHECK: blx
; CHECK: tlv_atexit
; CHECK: [[BB_end]]:
; CHECK: blx
@@ -44,7 +62,25 @@ __tls_init.exit:
; CHECK-NOT: vpop {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
; CHECK-NOT: pop {r9, r12}
; CHECK-NOT: pop {r1, r2, r3, r4, r7, pc}
-; CHECK: pop {lr}
+; CHECK: pop {r4, r5, r7, pc}
+
+; CHECK-O0-LABEL: _ZTW2sg
+; WATCH-O0: push {r1, r2, r3, r6, r7, lr}
+; IOS-O0: push {r1, r2, r3, r7, lr}
+; CHECK-O0: push {r9, r12}
+; CHECK-O0: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+; CHECK-O0: vpush {d0, d1, d2, d3, d4, d5, d6, d7}
+; CHECK-O0: blx
+; CHECK-O0: bne [[BB_end:.?LBB0_[0-9]+]]
+; CHECK-O0: blx
+; CHECK-O0: tlv_atexit
+; CHECK-O0: [[BB_end]]:
+; CHECK-O0: blx
+; CHECK-O0: vpop {d0, d1, d2, d3, d4, d5, d6, d7}
+; CHECK-O0: vpop {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
+; CHECK-O0: pop {r9, r12}
+; WATCH-O0: pop {r1, r2, r3, r6, r7, pc}
+; IOS-O0: pop {r1, r2, r3, r7, pc}
; CHECK-LABEL: _ZTW4sum1
; CHECK-NOT: push {r1, r2, r3, r4, r7, lr}
@@ -52,6 +88,65 @@ __tls_init.exit:
; CHECK-NOT: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31}
; CHECK-NOT: vpush {d0, d1, d2, d3, d4, d5, d6, d7}
; CHECK: blx
+
+; CHECK-O0-LABEL: _ZTW4sum1
+; CHECK-O0-NOT: vpush
+; CHECK-O0-NOT: vstr
+; CHECK-O0-NOT: vpop
+; CHECK-O0-NOT: vldr
+; CHECK-O0: pop
define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind {
ret i32* @sum1
}
+
+; Make sure at O0, we don't generate spilling/reloading of the CSRs.
+; CHECK-O0-LABEL: tls_test2
+; CHECK-O0: push
+; CHECK-O0-NOT: vpush
+; CHECK-O0-NOT: vstr
+; CHECK-O0: tls_helper
+; CHECK-O0-NOT: vpop
+; CHECK-O0-NOT: vldr
+; CHECK-O0: pop
+declare cxx_fast_tlscc void @tls_helper()
+define cxx_fast_tlscc %class.C* @tls_test2() #1 {
+ call cxx_fast_tlscc void @tls_helper()
+ ret %class.C* @tC
+}
+
+; Make sure we do not allow tail call when caller and callee have different
+; calling conventions.
+declare %class.C* @_ZN1CD1Ev(%class.C* readnone returned %this)
+; CHECK-LABEL: tls_test
+; CHECK: bl __tlv_atexit
+define cxx_fast_tlscc void @__tls_test() {
+entry:
+ store i32 0, i32* getelementptr inbounds (%class.C, %class.C* @tC, i64 0, i32 0), align 4
+ %0 = tail call i32 @_tlv_atexit(void (i8*)* bitcast (%class.C* (%class.C*)* @_ZN1CD1Ev to void (i8*)*), i8* bitcast (%class.C* @tC to i8*), i8* nonnull @__dso_handle) #1
+ ret void
+}
+
+declare void @somefunc()
+define cxx_fast_tlscc void @test_ccmismatch_notail() {
+; A tail call is not possible here because somefunc does not preserve enough
+; registers.
+; CHECK-LABEL: test_ccmismatch_notail:
+; CHECK-NOT: b _somefunc
+; CHECK: bl _somefunc
+ tail call void @somefunc()
+ ret void
+}
+
+declare cxx_fast_tlscc void @some_fast_tls_func()
+define void @test_ccmismatch_tail() {
+; We can perform a tail call here because some_fast_tls_func preserves all
+; necessary registers (and more).
+; CHECK-LABEL: test_ccmismatch_tail:
+; CHECK-NOT: bl _some_fast_tls_func
+; CHECK: b _some_fast_tls_func
+ tail call cxx_fast_tlscc void @some_fast_tls_func()
+ ret void
+}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll b/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
index 8b7153503b1f..206371a8f4e4 100644
--- a/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
+++ b/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
@@ -19,12 +19,48 @@ define float @f(<4 x i16>* nocapture %in) {
ret float %7
}
+; CHECK-LABEL: g:
define float @g(<4 x i16>* nocapture %in) {
; CHECK: vldr
%1 = load <4 x i16>, <4 x i16>* %in
- ; CHECK-NOT: uxth
+
+ ; For now we're generating a vmov.16 and a uxth instruction.
+ ; The uxth is redundant, and we should be able to extend without
+ ; having to generate cross-domain copies. Once we can do this
+ ; we should modify the checks below.
+
+ ; CHECK: uxth
%2 = extractelement <4 x i16> %1, i32 0
; CHECK: vcvt.f32.u32
%3 = uitofp i16 %2 to float
ret float %3
}
+
+; The backend generates for the following code an
+; (and 0xff (i32 extract_vector_elt (zext load <4 x i8> to 4 x i16)))
+;
+; The and is not redundant and cannot be removed. Since
+; extract_vector_elt is doing an implicit any_ext, the and
+; is required to guarantee that the top bits are set to zero.
+
+; Ideally should be a zext from <4 x i8> to <4 x 32>.
+
+; CHECK-LABEL: h:
+; CHECK: vld1.32
+; CHECK: uxtb
+define <4 x i32> @h(<4 x i8> *%in) {
+ %1 = load <4 x i8>, <4 x i8>* %in, align 4
+ %2 = extractelement <4 x i8> %1, i32 0
+ %3 = zext i8 %2 to i32
+ %4 = insertelement <4 x i32> undef, i32 %3, i32 0
+ %5 = extractelement <4 x i8> %1, i32 1
+ %6 = zext i8 %5 to i32
+ %7 = insertelement <4 x i32> %4, i32 %6, i32 1
+ %8 = extractelement <4 x i8> %1, i32 2
+ %9 = zext i8 %8 to i32
+ %10 = insertelement <4 x i32> %7, i32 %9, i32 2
+ %11 = extractelement <4 x i8> %1, i32 3
+ %12 = zext i8 %11 to i32
+ %13 = insertelement <4 x i32> %10, i32 %12, i32 3
+ ret <4 x i32> %13
+}
diff --git a/test/CodeGen/ARM/darwin-tls.ll b/test/CodeGen/ARM/darwin-tls.ll
index e19953222020..1043cce6218b 100644
--- a/test/CodeGen/ARM/darwin-tls.ll
+++ b/test/CodeGen/ARM/darwin-tls.ll
@@ -10,6 +10,8 @@
@local_tls_var = thread_local global i32 0
@external_tls_var = external thread_local global i32
+@hidden_external_tls_var = external hidden thread_local global i32
+
define i32 @test_local_tls() {
; T2-MOVT-PIC-LABEL: test_local_tls:
@@ -163,3 +165,18 @@ define i32 @test_external_tls() {
%val = load i32, i32* @external_tls_var, align 4
ret i32 %val
}
+
+; Just need something to trigger an indirect reference to the var.
+define i32 @use_hidden_external_tls() {
+ %val = load i32, i32* @hidden_external_tls_var, align 4
+ ret i32 %val
+}
+
+; T2-MOVT-PIC: .section __DATA,__thread_ptr,thread_local_variable_pointers
+; T2-MOVT-PIC: .p2align 2
+; T2-MOVT-PIC: L_external_tls_var$non_lazy_ptr:
+; T2-MOVT-PIC: .indirect_symbol _external_tls_var
+; T2-MOVT-PIC: .long 0
+; T2-MOVT-PIC: L_hidden_external_tls_var$non_lazy_ptr:
+; T2-MOVT-PIC: .indirect_symbol _hidden_external_tls_var
+; T2-MOVT-PIC: .long 0
diff --git a/test/CodeGen/ARM/debug-frame-vararg.ll b/test/CodeGen/ARM/debug-frame-vararg.ll
index 13ca20c20359..b9eae59cc320 100644
--- a/test/CodeGen/ARM/debug-frame-vararg.ll
+++ b/test/CodeGen/ARM/debug-frame-vararg.ll
@@ -25,11 +25,10 @@
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5 ", isOptimized: false, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "var.c", directory: "/tmp")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "sum", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, scopeLine: 5, file: !1, scope: !5, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "sum", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 5, file: !1, scope: !5, type: !6, variables: !2)
!5 = !DIFile(filename: "var.c", directory: "/tmp")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
diff --git a/test/CodeGen/ARM/debug-frame.ll b/test/CodeGen/ARM/debug-frame.ll
index 4bd401b68496..9b54a4a463d1 100644
--- a/test/CodeGen/ARM/debug-frame.ll
+++ b/test/CodeGen/ARM/debug-frame.ll
@@ -125,11 +125,10 @@ declare void @_ZSt9terminatev()
!llvm.module.flags = !{!10, !11}
!llvm.ident = !{!12}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "exp.cpp", directory: "/tmp")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test", linkageName: "_Z4testiiiiiddddd", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, scopeLine: 5, file: !1, scope: !5, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "test", linkageName: "_Z4testiiiiiddddd", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 5, file: !1, scope: !5, type: !6, variables: !2)
!5 = !DIFile(filename: "exp.cpp", directory: "/tmp")
!6 = !DISubroutineType(types: !7)
!7 = !{null, !8, !8, !8, !8, !8, !9, !9, !9, !9, !9}
diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll
index bf7e7321ae3d..9dd820134dd4 100644
--- a/test/CodeGen/ARM/debug-info-arg.ll
+++ b/test/CodeGen/ARM/debug-info-arg.ll
@@ -32,8 +32,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!33}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", isOptimized: true, emissionKind: 1, file: !32, enums: !{}, retainedTypes: !{}, subprograms: !30, imports: null)
-!1 = distinct !DISubprogram(name: "foo", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !2, scope: !2, type: !3, variables: !31)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", isOptimized: true, emissionKind: FullDebug, file: !32, enums: !{}, retainedTypes: !{}, imports: null)
+!1 = distinct !DISubprogram(name: "foo", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 11, file: !2, scope: !2, type: !3, variables: !31)
!2 = !DIFile(filename: "one.c", directory: "/Volumes/Athwagate/R10048772")
!3 = !DISubroutineType(types: !4)
!4 = !{null}
@@ -62,7 +62,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!27 = distinct !DILexicalBlock(line: 11, column: 107, file: !2, scope: !1)
!28 = !DILocation(line: 13, column: 5, scope: !27)
!29 = !DILocation(line: 14, column: 1, scope: !27)
-!30 = !{!1}
!31 = !{!5, !13, !14, !17, !18, !19}
!32 = !DIFile(filename: "one.c", directory: "/Volumes/Athwagate/R10048772")
!33 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index c628c5e9038d..1e9d890e9333 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll
@@ -111,7 +111,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!162}
-!0 = distinct !DICompileUnit(language: DW_LANG_ObjC, producer: "Apple clang version 2.1", isOptimized: false, runtimeVersion: 2, emissionKind: 1, file: !153, enums: !147, retainedTypes: !{}, subprograms: !148)
+!0 = distinct !DICompileUnit(language: DW_LANG_ObjC, producer: "Apple clang version 2.1", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, file: !153, enums: !147, retainedTypes: !{})
!1 = !DICompositeType(tag: DW_TAG_enumeration_type, line: 248, size: 32, align: 32, file: !160, scope: !0, elements: !3)
!2 = !DIFile(filename: "header.h", directory: "/Volumes/Sandbox/llvm")
!3 = !{!4}
@@ -134,7 +134,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!20 = !DIFile(filename: "header4.h", directory: "/Volumes/Sandbox/llvm")
!21 = !{!22}
!22 = !DIEnumerator(name: "Eleven", value: 0) ; [ DW_TAG_enumerator ]
-!23 = distinct !DISubprogram(name: "foobar_func_block_invoke_0", line: 609, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, scopeLine: 609, file: !152, scope: !24, type: !25)
+!23 = distinct !DISubprogram(name: "foobar_func_block_invoke_0", line: 609, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 609, file: !152, scope: !24, type: !25)
!24 = !DIFile(filename: "MyLibrary.m", directory: "/Volumes/Sandbox/llvm")
!25 = !DISubroutineType(types: !26)
!26 = !{null}
@@ -259,7 +259,6 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!145 = !DILocation(line: 613, column: 17, scope: !142)
!146 = !DILocation(line: 615, column: 13, scope: !142)
!147 = !{!1, !1, !5, !5, !9, !14, !19, !19, !14, !14, !14, !19, !19, !19}
-!148 = !{!23}
!149 = !DIFile(filename: "header3.h", directory: "/Volumes/Sandbox/llvm")
!150 = !DIFile(filename: "Private.h", directory: "/Volumes/Sandbox/llvm")
!151 = !DIFile(filename: "header4.h", directory: "/Volumes/Sandbox/llvm")
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index b9d110e42cd4..b4e48c4c423e 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -42,9 +42,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.module.flags = !{!56}
!llvm.dbg.cu = !{!2}
-!0 = distinct !DISubprogram(name: "test0001", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !54, scope: null, type: !3, variables: !51)
+!0 = distinct !DISubprogram(name: "test0001", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, file: !54, scope: null, type: !3, variables: !51)
!1 = !DIFile(filename: "build2.c", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: 1, file: !54, enums: !{}, retainedTypes: !{}, subprograms: !50, imports: null)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: FullDebug, file: !54, enums: !{}, retainedTypes: !{}, imports: null)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "v4f32", line: 14, file: !54, scope: !2, baseType: !6)
@@ -52,11 +52,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!7 = !DIBasicType(tag: DW_TAG_base_type, name: "float", size: 32, align: 32, encoding: DW_ATE_float)
!8 = !{!9}
!9 = !DISubrange(count: 4)
-!10 = distinct !DISubprogram(name: "main", line: 59, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !54, scope: null, type: !11, variables: !52)
+!10 = distinct !DISubprogram(name: "main", line: 59, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, file: !54, scope: null, type: !11, variables: !52)
!11 = !DISubroutineType(types: !12)
!12 = !{!13}
!13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!14 = distinct !DISubprogram(name: "printFV", line: 41, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !55, scope: null, type: !16, variables: !53)
+!14 = distinct !DISubprogram(name: "printFV", line: 41, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, file: !55, scope: null, type: !16, variables: !53)
!15 = !DIFile(filename: "/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", directory: "/private/tmp")
!16 = !DISubroutineType(types: !17)
!17 = !{null}
@@ -92,7 +92,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!47 = distinct !DILexicalBlock(line: 41, column: 28, file: !15, scope: !14)
!48 = !DILocation(line: 95, column: 3, scope: !25)
!49 = !DILocation(line: 99, column: 3, scope: !25)
-!50 = !{!0, !10, !14}
!51 = !{!18}
!52 = !{!19, !20, !24, !26, !27, !28, !29}
!53 = !{!30}
diff --git a/test/CodeGen/ARM/debug-info-d16-reg.ll b/test/CodeGen/ARM/debug-info-d16-reg.ll
index 0d457d3a7371..46146c7b8bf5 100644
--- a/test/CodeGen/ARM/debug-info-d16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-d16-reg.ll
@@ -59,17 +59,17 @@ declare i32 @puts(i8* nocapture) nounwind
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!48}
-!0 = distinct !DISubprogram(name: "printer", linkageName: "printer", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 12, file: !46, scope: !1, type: !3, variables: !43)
+!0 = distinct !DISubprogram(name: "printer", linkageName: "printer", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 12, file: !46, scope: !1, type: !3, variables: !43)
!1 = !DIFile(filename: "a.c", directory: "/tmp/")
-!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "(LLVM build 00)", isOptimized: true, emissionKind: 1, file: !46, enums: !47, retainedTypes: !47, subprograms: !42, imports: null)
+!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "(LLVM build 00)", isOptimized: true, emissionKind: FullDebug, file: !46, enums: !47, retainedTypes: !47, imports: null)
!3 = !DISubroutineType(types: !4)
!4 = !{!5, !6, !7, !8}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, file: !46, scope: !1, baseType: null)
!7 = !DIBasicType(tag: DW_TAG_base_type, name: "double", size: 64, align: 32, encoding: DW_ATE_float)
!8 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char)
-!9 = distinct !DISubprogram(name: "inlineprinter", linkageName: "inlineprinter", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !46, scope: !1, type: !3, variables: !44)
-!10 = distinct !DISubprogram(name: "main", linkageName: "main", line: 18, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 18, file: !46, scope: !1, type: !11, variables: !45)
+!9 = distinct !DISubprogram(name: "inlineprinter", linkageName: "inlineprinter", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 5, file: !46, scope: !1, type: !3, variables: !44)
+!10 = distinct !DISubprogram(name: "main", linkageName: "main", line: 18, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 18, file: !46, scope: !1, type: !11, variables: !45)
!11 = !DISubroutineType(types: !12)
!12 = !{!5, !5, !13}
!13 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, file: !46, scope: !1, baseType: !14)
@@ -106,7 +106,6 @@ declare i32 @puts(i8* nocapture) nounwind
!39 = !DILocation(line: 6, scope: !28, inlinedAt: !37)
!40 = !DILocation(line: 22, scope: !25)
!41 = !DILocation(line: 23, scope: !25)
-!42 = !{!0, !9, !10}
!43 = !{!16, !17, !18}
!44 = !{!19, !20, !21}
!45 = !{!22, !23, !24}
diff --git a/test/CodeGen/ARM/debug-info-no-frame.ll b/test/CodeGen/ARM/debug-info-no-frame.ll
index d77a195b9528..861c4ecefa98 100644
--- a/test/CodeGen/ARM/debug-info-no-frame.ll
+++ b/test/CodeGen/ARM/debug-info-no-frame.ll
@@ -21,10 +21,10 @@ attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false, subprograms: !{!3})
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false, emissionKind: FullDebug)
!1 = !DIFile(filename: "file.c", directory: "/dir")
!2 = !{}
-!3 = distinct !DISubprogram(name: "need_cfi_def_cfa_offset", scope: !1, file: !1, line: 1, type: !4, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, variables: !2)
+!3 = distinct !DISubprogram(name: "need_cfi_def_cfa_offset", scope: !1, file: !1, line: 1, type: !4, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, variables: !2)
!4 = !DISubroutineType(types: !5)
!5 = !{null}
!6 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index 1cd90d433640..581b3e915ef1 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -38,9 +38,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!56}
-!0 = distinct !DISubprogram(name: "test0001", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3, file: !54, scope: !1, type: !3, variables: !51)
+!0 = distinct !DISubprogram(name: "test0001", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 3, file: !54, scope: !1, type: !3, variables: !51)
!1 = !DIFile(filename: "build2.c", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: 1, file: !54, enums: !{}, retainedTypes: !{}, subprograms: !50, imports: null)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: FullDebug, file: !54, enums: !{}, retainedTypes: !{}, imports: null)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "v4f32", line: 14, file: !54, scope: !2, baseType: !6)
@@ -48,11 +48,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!7 = !DIBasicType(tag: DW_TAG_base_type, name: "float", size: 32, align: 32, encoding: DW_ATE_float)
!8 = !{!9}
!9 = !DISubrange(count: 4)
-!10 = distinct !DISubprogram(name: "main", line: 59, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 59, file: !54, scope: !1, type: !11, variables: !52)
+!10 = distinct !DISubprogram(name: "main", line: 59, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 59, file: !54, scope: !1, type: !11, variables: !52)
!11 = !DISubroutineType(types: !12)
!12 = !{!13}
!13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!14 = distinct !DISubprogram(name: "printFV", line: 41, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 41, file: !55, scope: !15, type: !16, variables: !53)
+!14 = distinct !DISubprogram(name: "printFV", line: 41, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 41, file: !55, scope: !15, type: !16, variables: !53)
!15 = !DIFile(filename: "/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", directory: "/private/tmp")
!16 = !DISubroutineType(types: !17)
!17 = !{null}
@@ -88,7 +88,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!47 = distinct !DILexicalBlock(line: 41, column: 28, file: !55, scope: !14)
!48 = !DILocation(line: 95, column: 3, scope: !25)
!49 = !DILocation(line: 99, column: 3, scope: !25)
-!50 = !{!0, !10, !14}
!51 = !{!18}
!52 = !{!19, !20, !24, !26, !27, !28, !29}
!53 = !{!30}
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index 654aa4545ca4..2987b9a2105a 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -65,14 +65,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!53}
-!0 = distinct !DISubprogram(name: "inlineprinter", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !51, scope: !1, type: !3, variables: !48)
+!0 = distinct !DISubprogram(name: "inlineprinter", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 5, file: !51, scope: !1, type: !3, variables: !48)
!1 = !DIFile(filename: "a.c", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: 1, file: !51, enums: !52, retainedTypes: !52, subprograms: !47, imports: null)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (trunk 129915)", isOptimized: true, emissionKind: FullDebug, file: !51, enums: !52, retainedTypes: !52, imports: null)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!6 = distinct !DISubprogram(name: "printer", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 12, file: !51, scope: !1, type: !3, variables: !49)
-!7 = distinct !DISubprogram(name: "main", line: 18, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 18, file: !51, scope: !1, type: !3, variables: !50)
+!6 = distinct !DISubprogram(name: "printer", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 12, file: !51, scope: !1, type: !3, variables: !49)
+!7 = distinct !DISubprogram(name: "main", line: 18, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 18, file: !51, scope: !1, type: !3, variables: !50)
!8 = !DILocalVariable(name: "ptr", line: 4, arg: 1, scope: !0, file: !1, type: !9)
!9 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, scope: !2, baseType: null)
!10 = !DILocalVariable(name: "val", line: 4, arg: 2, scope: !0, file: !1, type: !11)
@@ -117,7 +117,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!44 = !DILocation(line: 6, column: 3, scope: !28, inlinedAt: !40)
!45 = !DILocation(line: 22, column: 3, scope: !23)
!46 = !DILocation(line: 23, column: 1, scope: !23)
-!47 = !{!0, !6, !7}
!48 = !{!8, !10, !12}
!49 = !{!14, !15, !16}
!50 = !{!17, !18, !22}
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index eadf1b48156b..b31d1b7bed4f 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -43,8 +43,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!20}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.0 (trunk 130845)", isOptimized: true, emissionKind: 1, file: !18, enums: !19, retainedTypes: !19, subprograms: !16, imports: null)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !18, scope: !2, type: !3, variables: !17)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.0 (trunk 130845)", isOptimized: true, emissionKind: FullDebug, file: !18, enums: !19, retainedTypes: !19, imports: null)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 5, file: !18, scope: !2, type: !3, variables: !17)
!2 = !DIFile(filename: "k.cc", directory: "/private/tmp")
!3 = !DISubroutineType(types: !4)
!4 = !{null}
@@ -59,7 +59,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!13 = !DILocation(line: 8, column: 20, scope: !9)
!14 = !DILocation(line: 7, column: 20, scope: !10)
!15 = !DILocation(line: 10, column: 1, scope: !6)
-!16 = !{!1}
!17 = !{!5, !8}
!18 = !DIFile(filename: "k.cc", directory: "/private/tmp")
!19 = !{}
diff --git a/test/CodeGen/ARM/debug-segmented-stacks.ll b/test/CodeGen/ARM/debug-segmented-stacks.ll
index bd0abedc4133..3aa33f754118 100644
--- a/test/CodeGen/ARM/debug-segmented-stacks.ll
+++ b/test/CodeGen/ARM/debug-segmented-stacks.ll
@@ -39,11 +39,10 @@ define void @test_basic() #0 !dbg !4 {
; ARM-linux .cfi_same_value r5
}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5 ", isOptimized: false, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "var.c", directory: "/tmp")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test_basic", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, scopeLine: 5, file: !1, scope: !5, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "test_basic", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 5, file: !1, scope: !5, type: !6, variables: !2)
!5 = !DIFile(filename: "var.c", directory: "/tmp")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
diff --git a/test/CodeGen/ARM/debugtrap.ll b/test/CodeGen/ARM/debugtrap.ll
index 3d8cdea6cdae..5064a4ec2ca9 100644
--- a/test/CodeGen/ARM/debugtrap.ll
+++ b/test/CodeGen/ARM/debugtrap.ll
@@ -10,7 +10,7 @@ define void @test() nounwind {
entry:
; CHECK: bl foo
; CHECK-NEXT: pop
- ; CHECK-NEXT: trap
+ ; CHECK-NEXT: .inst 0xe7ffdefe
call void @foo()
call void @llvm.debugtrap()
ret void
diff --git a/test/CodeGen/ARM/default-float-abi.ll b/test/CodeGen/ARM/default-float-abi.ll
index 1b26bbdd9259..78d79def82cf 100644
--- a/test/CodeGen/ARM/default-float-abi.ll
+++ b/test/CodeGen/ARM/default-float-abi.ll
@@ -1,7 +1,10 @@
; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s --check-prefix=CHECK-HARD
+; RUN: llc -mtriple=armv7-linux-musleabihf %s -o - | FileCheck %s --check-prefix=CHECK-HARD
; RUN: llc -mtriple=armv7-linux-eabihf %s -o - | FileCheck %s --check-prefix=CHECK-HARD
; RUN: llc -mtriple=armv7-linux-gnueabihf -float-abi=soft %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
+; RUN: llc -mtriple=armv7-linux-musleabihf -float-abi=soft %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
; RUN: llc -mtriple=armv7-linux-gnueabi %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
+; RUN: llc -mtriple=armv7-linux-musleabi %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
; RUN: llc -mtriple=armv7-linux-eabi -float-abi=hard %s -o - | FileCheck %s --check-prefix=CHECK-HARD
; RUN: llc -mtriple=thumbv7-apple-ios6.0 %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
diff --git a/test/CodeGen/ARM/default-reloc.ll b/test/CodeGen/ARM/default-reloc.ll
new file mode 100644
index 000000000000..0b80b73061bc
--- /dev/null
+++ b/test/CodeGen/ARM/default-reloc.ll
@@ -0,0 +1,5 @@
+; RUN: llc -mtriple=armv7-linux-gnu -O0 < %s
+@a = external global i32
+define i32* @get() {
+ ret i32* @a
+}
diff --git a/test/CodeGen/ARM/divmod-eabi.ll b/test/CodeGen/ARM/divmod-eabi.ll
index 4178af397e66..1e4c8fc336da 100644
--- a/test/CodeGen/ARM/divmod-eabi.ll
+++ b/test/CodeGen/ARM/divmod-eabi.ll
@@ -1,16 +1,33 @@
+; We run the tests with both the default optimization level and O0, to make sure
+; we don't have any ABI differences between them. In principle, the ABI checks
+; should be the same for both optimization levels (there could be exceptions
+; to this when a div and a mod with the same operands are not coalesced into
+; the same divmod, but luckily this doesn't occur in practice even at O0).
+; Sometimes the checks verifying that the correct registers are used after the
+; libcalls differ between optimization levels, so we have to separate them.
; RUN: llc -mtriple armv7-none-eabi %s -o - | FileCheck %s --check-prefix=EABI
+; RUN: llc -mtriple armv7-none-eabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI
; RUN: llc -mtriple armv7-none-eabihf %s -o - | FileCheck %s --check-prefix=EABI
-; Both "none-eabi" and "androideabi" must lower SREM/UREM to __aeabi_{u,i}divmod
+; RUN: llc -mtriple armv7-none-eabihf %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI
+; All "eabi" (Bare, GNU and Android) must lower SREM/UREM to __aeabi_{u,i}divmod
; RUN: llc -mtriple armv7-linux-androideabi %s -o - | FileCheck %s --check-prefix=EABI
-; RUN: llc -mtriple armv7-linux-gnueabi %s -o - | FileCheck %s --check-prefix=GNU
-; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefix=DARWIN
+; RUN: llc -mtriple armv7-linux-androideabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI
+; RUN: llc -mtriple armv7-linux-gnueabi %s -o - | FileCheck %s --check-prefix=EABI
+; RUN: llc -mtriple armv7-linux-gnueabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI
+; RUN: llc -mtriple armv7-linux-musleabi %s -o - | FileCheck %s --check-prefix=EABI
+; RUN: llc -mtriple armv7-linux-musleabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI
+; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT
+; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0
; FIXME: long-term, we will use "-apple-macho" and won't need this exception:
-; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - | FileCheck %s --check-prefix=DARWIN
+; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT
+; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0
+; RUN: llc -mtriple thumbv7-windows %s -o - | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-DEFAULT
+; RUN: llc -mtriple thumbv7-windows %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-O0
define signext i16 @f16(i16 signext %a, i16 signext %b) {
; EABI-LABEL: f16:
-; GNU-LABEL: f16:
; DARWIN-LABEL: f16:
+; WINDOWS-LABEL: f16:
entry:
%conv = sext i16 %a to i32
%conv1 = sext i16 %b to i32
@@ -19,189 +36,238 @@ entry:
; EABI: __aeabi_idivmod
; EABI: mov [[div:r[0-9]+]], r0
; EABI: mov [[rem:r[0-9]+]], r1
-; GNU: __aeabi_idiv
-; GNU: mov [[sum:r[0-9]+]], r0
-; GNU: __modsi3
-; GNU: add [[sum]]{{.*}}r0
; DARWIN: ___divsi3
-; DARWIN: mov [[sum:r[0-9]+]], r0
+; DARWIN: mov [[div:r[0-9]+]], r0
; DARWIN: __modsi3
-; DARWIN: add [[sum]]{{.*}}r0
+; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]]
+; DARWIN-O0: mov [[rem:r[0-9]+]], r0
+; WINDOWS: __rt_sdiv
+; WINDOWS-DEFAULT: mls [[rem:r[0-9]+]], r0,
+; WINDOWS-DEFAULT: adds [[sum:r[0-9]+]], [[rem]], r0
+; WINDOWS-O0: mov [[div:r[0-9]+]], r0
+; WINDOWS-O0: mls [[rem:r[0-9]+]], [[div]],
%rem8 = srem i32 %conv1, %conv
; EABI: __aeabi_idivmod
-; GNU: __modsi3
; DARWIN: __modsi3
+; WINDOWS: __rt_sdiv
+; WINDOWS: mls [[rem1:r[0-9]+]], r0,
%add = add nsw i32 %rem, %div
%add13 = add nsw i32 %add, %rem8
%conv14 = trunc i32 %add13 to i16
; EABI: add r0{{.*}}r1
; EABI: sxth r0, r0
-; GNU: add r0{{.*}}[[sum]]
-; GNU: sxth r0, r0
-; DARWIN: add r0{{.*}}[[sum]]
-; DARWIN: sxth r0, r0
+; DARWIN-DEFAULT: add [[res:r[0-9]+]], [[sum]], r0
+; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]]
+; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0
+; DARWIN: sxth r0, [[res]]
+; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], [[div]]
+; WINDOWS: add [[rem1]], [[sum]]
+; WINDOWS: sxth [[res:r[0-9]+]], [[rem1]]
ret i16 %conv14
}
define i32 @f32(i32 %a, i32 %b) {
; EABI-LABEL: f32:
-; GNU-LABEL: f32:
; DARWIN-LABEL: f32:
+; WINDOWS-LABEL: f32:
entry:
%div = sdiv i32 %a, %b
%rem = srem i32 %a, %b
; EABI: __aeabi_idivmod
; EABI: mov [[div:r[0-9]+]], r0
; EABI: mov [[rem:r[0-9]+]], r1
-; GNU: __aeabi_idiv
-; GNU: mov [[sum:r[0-9]+]], r0
-; GNU: __modsi3
-; GNU: add [[sum]]{{.*}}r0
; DARWIN: ___divsi3
-; DARWIN: mov [[sum:r[0-9]+]], r0
+; DARWIN: mov [[div:r[0-9]+]], r0
; DARWIN: __modsi3
-; DARWIN: add [[sum]]{{.*}}r0
+; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]]
+; DARWIN-O0: mov [[rem:r[0-9]+]], r0
+; WINDOWS: __rt_sdiv
+; WINDOWS: mov [[div:r[0-9]+]], r0
+; WINDOWS: __rt_sdiv
+; WINDOWS: mls [[rem:r[0-9]+]], r0,
+; WINDOWS-DEFAULT: add [[div]], [[rem]]
%rem1 = srem i32 %b, %a
; EABI: __aeabi_idivmod
-; GNU: __modsi3
; DARWIN: __modsi3
+; WINDOWS: __rt_sdiv
+; WINDOWS: mls [[rem1:r[0-9]+]], r0,
%add = add nsw i32 %rem, %div
%add2 = add nsw i32 %add, %rem1
; EABI: add r0{{.*}}r1
-; GNU: add r0{{.*}}[[sum]]
-; DARWIN: add r0{{.*}}[[sum]]
+; DARWIN-DEFAULT: add r0, [[sum]], r0
+; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]]
+; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0
+; WINDOWS-DEFAULT: add [[rem1]], [[div]]
+; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], [[div]]
+; WINDOWS-O0: add [[rem1]], [[sum]]
ret i32 %add2
}
define i32 @uf(i32 %a, i32 %b) {
; EABI-LABEL: uf:
-; GNU-LABEL: uf:
; DARWIN-LABEL: uf:
+; WINDOWS-LABEL: uf:
entry:
%div = udiv i32 %a, %b
%rem = urem i32 %a, %b
; EABI: __aeabi_uidivmod
-; GNU: __aeabi_uidiv
-; GNU: mov [[sum:r[0-9]+]], r0
-; GNU: __umodsi3
-; GNU: add [[sum]]{{.*}}r0
; DARWIN: ___udivsi3
-; DARWIN: mov [[sum:r[0-9]+]], r0
+; DARWIN: mov [[div:r[0-9]+]], r0
; DARWIN: __umodsi3
-; DARWIN: add [[sum]]{{.*}}r0
+; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]]
+; DARWIN-O0: mov [[rem:r[0-9]+]], r0
+; WINDOWS: __rt_udiv
+; WINDOWS: mov [[div:r[0-9]+]], r0
+; WINDOWS: __rt_udiv
+; WINDOWS: mls [[rem:r[0-9]+]], r0,
+; WINDOWS-DEFAULT: add [[div]], [[rem]]
%rem1 = urem i32 %b, %a
; EABI: __aeabi_uidivmod
-; GNU: __umodsi3
; DARWIN: __umodsi3
+; WINDOWS: __rt_udiv
+; WINDOWS: mls [[rem1:r[0-9]+]], r0,
%add = add nuw i32 %rem, %div
%add2 = add nuw i32 %add, %rem1
; EABI: add r0{{.*}}r1
-; GNU: add r0{{.*}}[[sum]]
-; DARWIN: add r0{{.*}}[[sum]]
+; DARWIN-DEFAULT: add r0, [[sum]], r0
+; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]]
+; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0
+; WINDOWS-DEFAULT: add [[rem1]], [[div]]
+; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], [[div]]
+; WINDOWS-O0: add [[rem1]], [[sum]]
ret i32 %add2
}
-; FIXME: AEABI is not lowering long u/srem into u/ldivmod
define i64 @longf(i64 %a, i64 %b) {
; EABI-LABEL: longf:
-; GNU-LABEL: longf:
; DARWIN-LABEL: longf:
+; WINDOWS-LABEL: longf:
entry:
%div = sdiv i64 %a, %b
%rem = srem i64 %a, %b
; EABI: __aeabi_ldivmod
-; GNU: __aeabi_ldivmod
-; GNU: mov [[div1:r[0-9]+]], r0
-; GNU: mov [[div2:r[0-9]+]], r1
+; EABI-NEXT: adds r0
+; EABI-NEXT: adc r1
+; EABI-NOT: __aeabi_ldivmod
; DARWIN: ___divdi3
; DARWIN: mov [[div1:r[0-9]+]], r0
; DARWIN: mov [[div2:r[0-9]+]], r1
; DARWIN: __moddi3
+; WINDOWS: __rt_sdiv64
+; WINDOWS: mov [[div1:r[0-9]+]], r0
+; WINDOWS: mov [[div2:r[0-9]+]], r1
+; WINDOWS: __moddi3
%add = add nsw i64 %rem, %div
-; GNU: adds r0{{.*}}[[div1]]
-; GNU: adc r1{{.*}}[[div2]]
; DARWIN: adds r0{{.*}}[[div1]]
; DARWIN: adc r1{{.*}}[[div2]]
+; WINDOWS: adds.w r0, r0, [[div1]]
+; WINDOWS: adc.w r1, r1, [[div2]]
ret i64 %add
}
+define i16 @shortf(i16 %a, i16 %b) {
+; EABI-LABEL: shortf:
+; DARWIN-LABEL: shortf:
+; WINDOWS-LABEL: shortf:
+entry:
+ %div = sdiv i16 %a, %b
+ %rem = srem i16 %a, %b
+; EABI: __aeabi_idivmod
+; DARWIN: ___divsi3
+; DARWIN: mov [[div1:r[0-9]+]], r0
+; DARWIN: __modsi3
+; WINDOWS: __rt_sdiv
+; WINDOWS: mov [[div:r[0-9]+]], r0
+; WINDOWS: __rt_sdiv
+; WINDOWS: mls [[rem:r[0-9]+]], r0,
+ %add = add nsw i16 %rem, %div
+; EABI: add r0, r1
+; DARWIN: add r0{{.*}}[[div1]]
+; WINDOWS: add [[rem]], [[div]]
+ ret i16 %add
+}
+
define i32 @g1(i32 %a, i32 %b) {
; EABI-LABEL: g1:
-; GNU-LABEL: g1:
; DARWIN-LABEL: g1:
+; WINDOWS-LABEL: g1:
entry:
%div = sdiv i32 %a, %b
%rem = srem i32 %a, %b
; EABI: __aeabi_idivmod
-; GNU: __aeabi_idiv
-; GNU: mov [[sum:r[0-9]+]], r0
-; GNU: __modsi3
; DARWIN: ___divsi3
; DARWIN: mov [[sum:r[0-9]+]], r0
; DARWIN: __modsi3
+; WINDOWS: __rt_sdiv
+; WINDOWS: mov [[div:r[0-9]+]], r0
+; WINDOWS: __rt_sdiv
+; WINDOWS: mls [[rem:r[0-9]+]], r0,
%add = add nsw i32 %rem, %div
; EABI: add r0{{.*}}r1
-; GNU: add r0{{.*}}[[sum]]
; DARWIN: add r0{{.*}}[[sum]]
+; WINDOWS: add [[rem]], [[div]]
ret i32 %add
}
; On both Darwin and Gnu, this is just a call to __modsi3
define i32 @g2(i32 %a, i32 %b) {
; EABI-LABEL: g2:
-; GNU-LABEL: g2:
; DARWIN-LABEL: g2:
+; WINDOWS-LABEL: g2:
entry:
%rem = srem i32 %a, %b
; EABI: __aeabi_idivmod
-; GNU: __modsi3
; DARWIN: __modsi3
+; WINDOWS: __rt_sdiv
ret i32 %rem
; EABI: mov r0, r1
+; WINDOWS: mls r0, r0,
}
define i32 @g3(i32 %a, i32 %b) {
; EABI-LABEL: g3:
-; GNU-LABEL: g3:
; DARWIN-LABEL: g3:
+; WINDOWS-LABEL: g3:
entry:
%rem = srem i32 %a, %b
; EABI: __aeabi_idivmod
; EABI: mov [[mod:r[0-9]+]], r1
-; GNU: __modsi3
-; GNU: mov [[sum:r[0-9]+]], r0
; DARWIN: __modsi3
; DARWIN: mov [[sum:r[0-9]+]], r0
+; WINDOWS: __rt_sdiv
+; WINDOWS: mls [[rem:r[0-9]+]], r0,
%rem1 = srem i32 %b, %rem
; EABI: __aeabi_idivmod
-; GNU: __modsi3
; DARWIN: __modsi3
+; WINDOWS: __rt_sdiv
+; WINDOWS: mls [[rem1:r[0-9]+]], r0,
%add = add nsw i32 %rem1, %rem
; EABI: add r0, r1, [[mod]]
-; GNU: add r0{{.*}}[[sum]]
; DARWIN: add r0{{.*}}[[sum]]
+; WINDOWS: add [[rem1]], [[rem]]
ret i32 %add
}
define i32 @g4(i32 %a, i32 %b) {
; EABI-LABEL: g4:
-; GNU-LABEL: g4:
; DARWIN-LABEL: g4:
+; WINDOWS-LABEL: g4:
entry:
%div = sdiv i32 %a, %b
; EABI: __aeabi_idiv{{$}}
; EABI: mov [[div:r[0-9]+]], r0
-; GNU: __aeabi_idiv
-; GNU: mov [[sum:r[0-9]+]], r0
; DARWIN: ___divsi3
; DARWIN: mov [[sum:r[0-9]+]], r0
+; WINDOWS: __rt_sdiv
+; WINDOWS: mov [[div:r[0-9]+]], r0
%rem = srem i32 %b, %div
; EABI: __aeabi_idivmod
-; GNU: __modsi3
; DARWIN: __modsi3
+; WINDOWS: __rt_sdiv
+; WINDOWS: mls [[rem:r[0-9]+]], r0,
%add = add nsw i32 %rem, %div
; EABI: add r0, r1, [[div]]
-; GNU: add r0{{.*}}[[sum]]
; DARWIN: add r0{{.*}}[[sum]]
+; WINDOWS: add [[rem]], [[div]]
ret i32 %add
}
diff --git a/test/CodeGen/ARM/eh-resume-darwin.ll b/test/CodeGen/ARM/eh-resume-darwin.ll
index d3a8481275f3..6c2716bffa6d 100644
--- a/test/CodeGen/ARM/eh-resume-darwin.ll
+++ b/test/CodeGen/ARM/eh-resume-darwin.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=IOS
-; RUN: llc < %s -mtriple=armv7k-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=IOS
-; RUN: llc < %s -mtriple=armv7k-apple-watchos -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=WATCHOS
+; RUN: llc < %s -mtriple=armv7-apple-watchos -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=IOS
+; RUN: llc < %s -mtriple=armv7k-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=WATCHABI
+; RUN: llc < %s -mtriple=armv7k-apple-watchos -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=WATCHABI
declare void @func()
@@ -21,4 +22,4 @@ lpad:
}
; IOS: __Unwind_SjLj_Resume
-; WATCHOS: __Unwind_Resume
+; WATCHABI: __Unwind_Resume
diff --git a/test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll b/test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll
index 3d380bf8f22a..517d5597bb28 100644
--- a/test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll
+++ b/test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll
@@ -42,7 +42,7 @@ try.cont:
}
; CHECK: .globl test1
-; CHECK: .align 2
+; CHECK: .p2align 2
; CHECK: .type test1,%function
; CHECK-LABEL: test1:
; CHECK: .fnstart
@@ -51,7 +51,7 @@ try.cont:
; CHECK: .personality __gxx_personality_v0
; CHECK: .handlerdata
-; CHECK: .align 2
+; CHECK: .p2align 2
; CHECK-LABEL: GCC_except_table0:
; CHECK-LABEL: .Lexception0:
; CHECK: .byte 255 @ @LPStart Encoding = omit
diff --git a/test/CodeGen/ARM/ehabi-handlerdata.ll b/test/CodeGen/ARM/ehabi-handlerdata.ll
index c53b36ffe18f..ecb23c3424e2 100644
--- a/test/CodeGen/ARM/ehabi-handlerdata.ll
+++ b/test/CodeGen/ARM/ehabi-handlerdata.ll
@@ -40,13 +40,13 @@ try.cont:
}
; CHECK: .globl test1
-; CHECK: .align 2
+; CHECK: .p2align 2
; CHECK: .type test1,%function
; CHECK-LABEL: test1:
; CHECK: .fnstart
; CHECK: .personality __gxx_personality_v0
; CHECK: .handlerdata
-; CHECK: .align 2
+; CHECK: .p2align 2
; CHECK-LABEL: GCC_except_table0:
; CHECK-LABEL: .Lexception0:
; CHECK: .byte 255 @ @LPStart Encoding = omit
diff --git a/test/CodeGen/ARM/ehabi.ll b/test/CodeGen/ARM/ehabi.ll
index 923cffcf6532..e2984de9ce4d 100644
--- a/test/CodeGen/ARM/ehabi.ll
+++ b/test/CodeGen/ARM/ehabi.ll
@@ -34,6 +34,22 @@
; RUN: -filetype=asm -o - %s \
; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP-ELIM
+; RUN: llc -mtriple arm-unknown-linux-musleabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-FP
+
+; RUN: llc -mtriple arm-unknown-linux-musleabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-FP-ELIM
+
+; RUN: llc -mtriple armv7-unknown-linux-musleabi \
+; RUN: -disable-fp-elim -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP
+
+; RUN: llc -mtriple armv7-unknown-linux-musleabi \
+; RUN: -filetype=asm -o - %s \
+; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP-ELIM
+
; RUN: llc -mtriple arm-unknown-linux-androideabi \
; RUN: -disable-fp-elim -filetype=asm -o - %s \
; RUN: | FileCheck %s --check-prefix=CHECK-FP
diff --git a/test/CodeGen/ARM/emutls.ll b/test/CodeGen/ARM/emutls.ll
index 7ba50dd249bb..e66d93ebcb89 100644
--- a/test/CodeGen/ARM/emutls.ll
+++ b/test/CodeGen/ARM/emutls.ll
@@ -11,8 +11,9 @@ define i32 @my_get_xyz() {
; ARM32-LABEL: my_get_xyz:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl my_emutls_get_address(PLT)
+; ARM32-NEXT: bl my_emutls_get_address
; ARM32-NEXT: ldr r0, [r0]
+; ARM32: .long my_emutls_v_xyz(GOT_PREL)
entry:
%call = call i8* @my_emutls_get_address(i8* bitcast (i8** @my_emutls_v_xyz to i8*))
@@ -33,8 +34,9 @@ define i32 @f1() {
; ARM32-LABEL: f1:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldr r0, [r0]
+; ARM32: .long __emutls_v.i1(GOT_PREL)
entry:
%tmp1 = load i32, i32* @i1
@@ -45,8 +47,9 @@ define i32* @f2() {
; ARM32-LABEL: f2:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: pop
+; ARM32: .long __emutls_v.i1(GOT_PREL)
entry:
ret i32* @i1
@@ -56,8 +59,9 @@ define i32 @f3() nounwind {
; ARM32-LABEL: f3:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldr r0, [r0]
+; ARM32: .long __emutls_v.i2(GOT_PREL)
entry:
%tmp1 = load i32, i32* @i2
@@ -68,8 +72,9 @@ define i32* @f4() {
; ARM32-LABEL: f4:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: pop
+; ARM32: .long __emutls_v.i2(GOT_PREL)
entry:
ret i32* @i2
@@ -78,9 +83,10 @@ entry:
define i32 @f5() nounwind {
; ARM32-LABEL: f5:
; ARM32: ldr r0,
-; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32: add r0, pc, r0
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldr r0, [r0]
+; ARM32: .long __emutls_v.i3-
entry:
%tmp1 = load i32, i32* @i3
@@ -90,9 +96,10 @@ entry:
define i32* @f6() {
; ARM32-LABEL: f6:
; ARM32: ldr r0,
-; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32: add r0, pc, r0
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: pop
+; ARM32: .long __emutls_v.i3-
entry:
ret i32* @i3
@@ -101,9 +108,10 @@ entry:
define i32 @f7() {
; ARM32-LABEL: f7:
; ARM32: ldr r0,
-; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32: add r0, pc, r0
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldr r0, [r0]
+; ARM32: .long __emutls_v.i4-(.LPC
entry:
%tmp1 = load i32, i32* @i4
@@ -113,9 +121,10 @@ entry:
define i32* @f8() {
; ARM32-LABEL: f8:
; ARM32: ldr r0,
-; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32: add r0, pc, r0
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: pop
+; ARM32: .long __emutls_v.i4-(.LPC
entry:
ret i32* @i4
@@ -124,8 +133,8 @@ entry:
define i32 @f9() {
; ARM32-LABEL: f9:
; ARM32: ldr r0,
-; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32: add r0, pc, r0
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldr r0, [r0]
entry:
@@ -136,8 +145,8 @@ entry:
define i32* @f10() {
; ARM32-LABEL: f10:
; ARM32: ldr r0,
-; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32: add r0, pc, r0
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: pop
entry:
@@ -148,7 +157,7 @@ define i16 @f11() {
; ARM32-LABEL: f11:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldrh r0, [r0]
entry:
@@ -160,7 +169,7 @@ define i32 @f12() {
; ARM32-LABEL: f12:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldrsh r0, [r0]
entry:
@@ -173,7 +182,7 @@ define i8 @f13() {
; ARM32-LABEL: f13:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldrb r0, [r0]
; ARM32-NEXT: pop
@@ -186,7 +195,7 @@ define i32 @f14() {
; ARM32-LABEL: f14:
; ARM32: ldr r0,
; ARM32: ldr r0, [pc, r0]
-; ARM32-NEXT: bl __emutls_get_address(PLT)
+; ARM32-NEXT: bl __emutls_get_address
; ARM32-NEXT: ldrsb r0, [r0]
; ARM32-NEXT: pop
@@ -198,46 +207,50 @@ entry:
;;;;;;;;;;;;;; 32-bit __emutls_v. and __emutls_t.
-; ARM32 .section .data.rel.local,
+; ARM32: .data{{$}}
+; ARM32: .globl __emutls_v.i1
; ARM32-LABEL: __emutls_v.i1:
; ARM32-NEXT: .long 4
; ARM32-NEXT: .long 4
; ARM32-NEXT: .long 0
; ARM32-NEXT: .long __emutls_t.i1
-; ARM32 .section .rodata,
+; ARM32: .section .rodata,
; ARM32-LABEL: __emutls_t.i1:
; ARM32-NEXT: .long 15
; ARM32-NOT: __emutls_v.i2
-; ARM32 .section .data.rel.local,
+; ARM32: .data{{$}}
+; ARM32-NOT: .globl
; ARM32-LABEL: __emutls_v.i3:
; ARM32-NEXT: .long 4
; ARM32-NEXT: .long 4
; ARM32-NEXT: .long 0
; ARM32-NEXT: .long __emutls_t.i3
-; ARM32 .section .rodata,
+; ARM32: .section .rodata,
; ARM32-LABEL: __emutls_t.i3:
; ARM32-NEXT: .long 15
-; ARM32 .section .data.rel.local,
+; ARM32: .data{{$}}
+; ARM32: .globl __emutls_v.i4
; ARM32-LABEL: __emutls_v.i4:
; ARM32-NEXT: .long 4
; ARM32-NEXT: .long 4
; ARM32-NEXT: .long 0
; ARM32-NEXT: .long __emutls_t.i4
-; ARM32 .section .rodata,
+; ARM32: .section .rodata,
; ARM32-LABEL: __emutls_t.i4:
; ARM32-NEXT: .long 15
; ARM32-NOT: __emutls_v.i5:
-; ARM32 .hidden __emutls_v.i5
+; ARM32: .hidden __emutls_v.i5
; ARM32-NOT: __emutls_v.i5:
-; ARM32 .section .data.rel.local,
+; ARM32: .data{{$}}
+; ARM32: .globl __emutls_v.s1
; ARM32-LABEL: __emutls_v.s1:
; ARM32-NEXT: .long 2
; ARM32-NEXT: .long 2
@@ -248,7 +261,8 @@ entry:
; ARM32-LABEL: __emutls_t.s1:
; ARM32-NEXT: .short 15
-; ARM32 .section .data.rel.local,
+; ARM32: .data{{$}}
+; ARM32: .globl __emutls_v.b1
; ARM32-LABEL: __emutls_v.b1:
; ARM32-NEXT: .long 1
; ARM32-NEXT: .long 1
diff --git a/test/CodeGen/ARM/emutls_generic.ll b/test/CodeGen/ARM/emutls_generic.ll
index 0fada88fb5d9..f5633dc23bcd 100644
--- a/test/CodeGen/ARM/emutls_generic.ll
+++ b/test/CodeGen/ARM/emutls_generic.ll
@@ -35,12 +35,13 @@ entry:
; ARM_32: bl __emutls_get_address
; ARM_32: .long __emutls_v.external_y
; ARM_32-LABEL: get_internal_y:
-; ARM_32: bl __emutls_get_address
-; ARM_32: .long __emutls_v.internal_y
-; ARM_32-NOT: __emutls_t.external_x
-; ARM_32-NOT: __emutls_v.external_x:
-; ARM_32: .data
-; ARM_32: .align 2
+; ARM_32: bl __emutls_get_address
+; ARM_32: .long __emutls_v.internal_y
+; ARM_32-NOT: __emutls_t.external_x
+; ARM_32-NOT: __emutls_v.external_x:
+; ARM_32: .data{{$}}
+; ARM_32: .globl __emutls_v.external_y
+; ARM_32: .p2align 2
; ARM_32-LABEL: __emutls_v.external_y:
; ARM_32-NEXT: .long 1
; ARM_32-NEXT: .long 2
@@ -49,8 +50,9 @@ entry:
; ARM_32: .section .rodata,
; ARM_32-LABEL: __emutls_t.external_y:
; ARM_32-NEXT: .byte 7
-; ARM_32: .data
-; ARM_32: .align 2
+; ARM_32: .data{{$}}
+; ARM_32-NOT: .globl
+; ARM_32: .p2align 2
; ARM_32-LABEL: __emutls_v.internal_y:
; ARM_32-NEXT: .long 8
; ARM_32-NEXT: .long 16
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index e382e78a9950..1122867356b3 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG --check-prefix=ARM-LONG-MACHO
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG --check-prefix=ARM-LONG-ELF
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=THUMB-LONG
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
@@ -111,9 +111,14 @@ entry:
; ARM: str [[R4]], [sp, #4]
; ARM: bl {{_?}}bar
; ARM-LONG: @t10
-; ARM-LONG: {{(movw)|(ldr)}} [[R:l?r[0-9]*]], {{(:lower16:L_bar\$non_lazy_ptr)|(.LCPI)}}
-; ARM-LONG: {{(movt [[R]], :upper16:L_bar\$non_lazy_ptr)?}}
-; ARM-LONG: ldr [[R]], {{\[}}[[R]]{{\]}}
+
+; ARM-LONG-MACHO: {{(movw)|(ldr)}} [[R:l?r[0-9]*]], {{(:lower16:L_bar\$non_lazy_ptr)|(.LCPI)}}
+; ARM-LONG-MACHO: {{(movt [[R]], :upper16:L_bar\$non_lazy_ptr)?}}
+; ARM-LONG-MACHO: ldr [[R]], {{\[}}[[R]]{{\]}}
+
+; ARM-LONG-ELF: movw [[R:l?r[0-9]*]], :lower16:bar
+; ARM-LONG-ELF: {{(movt [[R]], :upper16:L_bar\$non_lazy_ptr)?}}
+
; ARM-LONG: blx [[R]]
; THUMB: @t10
; THUMB: movs [[R0:l?r[0-9]*]], #0
@@ -167,9 +172,14 @@ entry:
; ARM: LibCall
; ARM: bl {{___udivsi3|__aeabi_uidiv}}
; ARM-LONG: LibCall
-; ARM-LONG: {{(movw r2, :lower16:L___udivsi3\$non_lazy_ptr)|(ldr r2, .LCPI)}}
-; ARM-LONG: {{(movt r2, :upper16:L___udivsi3\$non_lazy_ptr)?}}
-; ARM-LONG: ldr r2, [r2]
+
+; ARM-LONG-MACHO: {{(movw r2, :lower16:L___udivsi3\$non_lazy_ptr)|(ldr r2, .LCPI)}}
+; ARM-LONG-MACHO: {{(movt r2, :upper16:L___udivsi3\$non_lazy_ptr)?}}
+; ARM-LONG-MACHO: ldr r2, [r2]
+
+; ARM-LONG-ELF: movw r2, :lower16:__aeabi_uidiv
+; ARM-LONG-ELF: movt r2, :upper16:__aeabi_uidiv
+
; ARM-LONG: blx r2
; THUMB: LibCall
; THUMB: bl {{___udivsi3|__aeabi_uidiv}}
diff --git a/test/CodeGen/ARM/fast-isel-deadcode.ll b/test/CodeGen/ARM/fast-isel-deadcode.ll
index e584c54b48a2..d66a81c7cdb2 100644
--- a/test/CodeGen/ARM/fast-isel-deadcode.ll
+++ b/test/CodeGen/ARM/fast-isel-deadcode.ll
@@ -7,7 +7,7 @@ define i32 @main(i32 %argc, i8** %argv) nounwind {
entry:
; THUMB: main
call void @printArgsNoRet(i32 1, float 0x4000CCCCC0000000, i8 signext 99, double 4.100000e+00)
-; THUMB: blx _printArgsNoRet
+; THUMB: bl _printArgsNoRet
; THUMB-NOT: ldr
; THUMB-NOT: vldr
; THUMB-NOT: vmov
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index 1c7ff6879386..277461aa566b 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM --check-prefix=ARM-MACHO
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM --check-prefix=ARM-ELF
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG --check-prefix=ARM-LONG-MACHO
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG --check-prefix=ARM-LONG-ELF
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=+long-calls -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-LONG
; Note that some of these tests assume that relocations are either
@@ -22,9 +22,14 @@ define void @t1() nounwind ssp {
; ARM: and r1, r1, #255
; ARM: bl {{_?}}memset
; ARM-LONG-LABEL: t1:
-; ARM-LONG: {{(movw r3, :lower16:L_memset\$non_lazy_ptr)|(ldr r3, .LCPI)}}
-; ARM-LONG: {{(movt r3, :upper16:L_memset\$non_lazy_ptr)?}}
-; ARM-LONG: ldr r3, [r3]
+
+; ARM-LONG-MACHO: {{(movw r3, :lower16:L_memset\$non_lazy_ptr)|(ldr r3, .LCPI)}}
+; ARM-LONG-MACHO: {{(movt r3, :upper16:L_memset\$non_lazy_ptr)?}}
+; ARM-LONG-MACHO: ldr r3, [r3]
+
+; ARM-LONG-ELF: movw r3, :lower16:memset
+; ARM-LONG-ELF: movt r3, :upper16:memset
+
; ARM-LONG: blx r3
; THUMB-LABEL: t1:
; THUMB: {{(movw r0, :lower16:_?message1)|(ldr.n r0, .LCPI)}}
@@ -47,9 +52,14 @@ declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
define void @t2() nounwind ssp {
; ARM-LABEL: t2:
-; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
-; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
-; ARM: ldr r0, [r0]
+
+; ARM-MACHO: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
+; ARM-MACHO: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
+; ARM-MACHO: ldr r0, [r0]
+
+; ARM-ELF: movw r0, :lower16:temp
+; ARM-ELF: movt r0, :upper16:temp
+
; ARM: add r1, r0, #4
; ARM: add r0, r0, #16
; ARM: movw r2, #17
@@ -58,9 +68,14 @@ define void @t2() nounwind ssp {
; ARM: ldr r1, [sp[[SLOT]]] @ 4-byte Reload
; ARM: bl {{_?}}memcpy
; ARM-LONG-LABEL: t2:
-; ARM-LONG: {{(movw r3, :lower16:L_memcpy\$non_lazy_ptr)|(ldr r3, .LCPI)}}
-; ARM-LONG: {{(movt r3, :upper16:L_memcpy\$non_lazy_ptr)?}}
-; ARM-LONG: ldr r3, [r3]
+
+; ARM-LONG-MACHO: {{(movw r3, :lower16:L_memcpy\$non_lazy_ptr)|(ldr r3, .LCPI)}}
+; ARM-LONG-MACHO: {{(movt r3, :upper16:L_memcpy\$non_lazy_ptr)?}}
+; ARM-LONG-MACHO: ldr r3, [r3]
+
+; ARM-LONG-ELF: movw r3, :lower16:memcpy
+; ARM-LONG-ELF: movt r3, :upper16:memcpy
+
; ARM-LONG: blx r3
; THUMB-LABEL: t2:
; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
@@ -86,18 +101,29 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32,
define void @t3() nounwind ssp {
; ARM-LABEL: t3:
-; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
-; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
-; ARM: ldr r0, [r0]
+
+; ARM-MACHO: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
+; ARM-MACHO: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
+; ARM-MACHO: ldr r0, [r0]
+
+; ARM-ELF: movw r0, :lower16:temp
+; ARM-ELF: movt r0, :upper16:temp
+
+
; ARM: add r1, r0, #4
; ARM: add r0, r0, #16
; ARM: movw r2, #10
; ARM: mov r0, r1
; ARM: bl {{_?}}memmove
; ARM-LONG-LABEL: t3:
-; ARM-LONG: {{(movw r3, :lower16:L_memmove\$non_lazy_ptr)|(ldr r3, .LCPI)}}
-; ARM-LONG: {{(movt r3, :upper16:L_memmove\$non_lazy_ptr)?}}
-; ARM-LONG: ldr r3, [r3]
+
+; ARM-LONG-MACHO: {{(movw r3, :lower16:L_memmove\$non_lazy_ptr)|(ldr r3, .LCPI)}}
+; ARM-LONG-MACHO: {{(movt r3, :upper16:L_memmove\$non_lazy_ptr)?}}
+; ARM-LONG-MACHO: ldr r3, [r3]
+
+; ARM-LONG-ELF: movw r3, :lower16:memmove
+; ARM-LONG-ELF: movt r3, :upper16:memmove
+
; ARM-LONG: blx r3
; THUMB-LABEL: t3:
; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
@@ -121,9 +147,14 @@ define void @t3() nounwind ssp {
define void @t4() nounwind ssp {
; ARM-LABEL: t4:
-; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
-; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
-; ARM: ldr r0, [r0]
+
+; ARM-MACHO: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
+; ARM-MACHO: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
+; ARM-MACHO: ldr r0, [r0]
+
+; ARM-ELF: movw r0, :lower16:temp
+; ARM-ELF: movt r0, :upper16:temp
+
; ARM: ldr r1, [r0, #16]
; ARM: str r1, [r0, #4]
; ARM: ldr r1, [r0, #20]
@@ -150,9 +181,14 @@ declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32,
define void @t5() nounwind ssp {
; ARM-LABEL: t5:
-; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
-; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
-; ARM: ldr r0, [r0]
+
+; ARM-MACHO: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
+; ARM-MACHO: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
+; ARM-MACHO: ldr r0, [r0]
+
+; ARM-ELF: movw r0, :lower16:temp
+; ARM-ELF: movt r0, :upper16:temp
+
; ARM: ldrh r1, [r0, #16]
; ARM: strh r1, [r0, #4]
; ARM: ldrh r1, [r0, #18]
@@ -185,9 +221,14 @@ define void @t5() nounwind ssp {
define void @t6() nounwind ssp {
; ARM-LABEL: t6:
-; ARM: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
-; ARM: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
-; ARM: ldr r0, [r0]
+
+; ARM-MACHO: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}}
+; ARM-MACHO: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}}
+; ARM-MACHO: ldr r0, [r0]
+
+; ARM-ELF: movw r0, :lower16:temp
+; ARM-ELF: movt r0, :upper16:temp
+
; ARM: ldrb r1, [r0, #16]
; ARM: strb r1, [r0, #4]
; ARM: ldrb r1, [r0, #17]
diff --git a/test/CodeGen/ARM/fast-isel-pie.ll b/test/CodeGen/ARM/fast-isel-pie.ll
new file mode 100644
index 000000000000..23a88bdaa22e
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-pie.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=pic -mtriple=armv7-pc-linux-gnueabi | FileCheck %s
+
+@var = global i32 42
+
+define i32* @foo() {
+; CHECK: foo:
+; CHECK: ldr r0, .L[[POOL:.*]]
+; CHECK-NEXT: .L[[ADDR:.*]]:
+; CHECK-NEXT: add r0, pc, r0
+; CHECK-NEXT: bx lr
+
+; CHECK: .L[[POOL]]:
+; CHECK-NEXT: .long var-(.L[[ADDR]]+8)
+
+ ret i32* @var
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"PIE Level", i32 2}
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 49460220c47c..502285e85dfd 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM --check-prefix=ARM-MACHO
+; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM --check-prefix=ARM-ELF
; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
; Very basic fast-isel functionality.
@@ -154,9 +154,13 @@ define void @test4() {
; THUMB: adds r1, #1
; THUMB: str r1, [r0]
-; ARM: {{(movw r0, :lower16:L_test4g\$non_lazy_ptr)|(ldr r0, .LCPI)}}
-; ARM: {{(movt r0, :upper16:L_test4g\$non_lazy_ptr)?}}
-; ARM: ldr r0, [r0]
+; ARM-MACHO: {{(movw r0, :lower16:L_test4g\$non_lazy_ptr)|(ldr r0, .LCPI)}}
+; ARM-MACHO: {{(movt r0, :upper16:L_test4g\$non_lazy_ptr)?}}
+; ARM-MACHO: ldr r0, [r0]
+
+; ARM-ELF: movw r0, :lower16:test4g
+; ARM-ELF: movt r0, :upper16:test4g
+
; ARM: ldr r1, [r0]
; ARM: add r1, r1, #1
; ARM: str r1, [r0]
diff --git a/test/CodeGen/ARM/fast-tail-call.ll b/test/CodeGen/ARM/fast-tail-call.ll
index 6472016c0572..c93028bad50c 100644
--- a/test/CodeGen/ARM/fast-tail-call.ll
+++ b/test/CodeGen/ARM/fast-tail-call.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=thumbv7-linux-gnueabi -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8m.base-arm-none-eabi -filetype=obj < %s
; Primarily a non-crash test: Thumbv7 Linux does not have FastISel support,
; which led (via a convoluted route) to DAG nodes after a TC_RETURN that
diff --git a/test/CodeGen/ARM/fp16-promote.ll b/test/CodeGen/ARM/fp16-promote.ll
index 2a2eb8d2b6ba..ebc5934df022 100644
--- a/test/CodeGen/ARM/fp16-promote.ll
+++ b/test/CodeGen/ARM/fp16-promote.ll
@@ -642,6 +642,40 @@ define void @test_maxnum(half* %p, half* %q) #0 {
ret void
}
+; CHECK-ALL-LABEL: test_minnan:
+; CHECK-FP16: vcvtb.f32.f16
+; CHECK-FP16: vcvtb.f32.f16
+; CHECK-LIBCALL: bl __aeabi_h2f
+; CHECK-LIBCALL: bl __aeabi_h2f
+; CHECK-VFP: vmin.f32
+; CHECK-NOVFP: bl __aeabi_fcmpge
+; CHECK-FP16: vcvtb.f16.f32
+; CHECK-LIBCALL: bl __aeabi_f2h
+define void @test_minnan(half* %p) #0 {
+ %a = load half, half* %p, align 2
+ %c = fcmp ult half %a, 1.0
+ %r = select i1 %c, half %a, half 1.0
+ store half %r, half* %p
+ ret void
+}
+
+; CHECK-ALL-LABEL: test_maxnan:
+; CHECK-FP16: vcvtb.f32.f16
+; CHECK-FP16: vcvtb.f32.f16
+; CHECK-LIBCALL: bl __aeabi_h2f
+; CHECK-LIBCALL: bl __aeabi_h2f
+; CHECK-VFP: vmax.f32
+; CHECK-NOVFP: bl __aeabi_fcmple
+; CHECK-FP16: vcvtb.f16.f32
+; CHECK-LIBCALL: bl __aeabi_f2h
+define void @test_maxnan(half* %p) #0 {
+ %a = load half, half* %p, align 2
+ %c = fcmp ugt half %a, 1.0
+ %r = select i1 %c, half %a, half 1.0
+ store half %r, half* %p
+ ret void
+}
+
; CHECK-FP16-LABEL: test_copysign:
; CHECK-FP16: vcvtb.f32.f16
; CHECK-FP16: vcvtb.f32.f16
@@ -889,4 +923,44 @@ define half @test_struct_arg(%struct.dummy %p) {
ret half %a
}
+; CHECK-LABEL: test_uitofp_i32_fadd:
+; CHECK-VFP-DAG: vcvt.f32.u32
+; CHECK-NOVFP-DAG: bl __aeabi_ui2f
+
+; CHECK-FP16-DAG: vcvtb.f16.f32
+; CHECK-FP16-DAG: vcvtb.f32.f16
+; CHECK-LIBCALL-DAG: bl __aeabi_h2f
+; CHECK-LIBCALL-DAG: bl __aeabi_h2f
+
+; CHECK-VFP-DAG: vadd.f32
+; CHECK-NOVFP-DAG: bl __aeabi_fadd
+
+; CHECK-FP16-DAG: vcvtb.f16.f32
+; CHECK-LIBCALL-DAG: bl __aeabi_f2h
+define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = uitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32_fadd:
+; CHECK-VFP-DAG: vcvt.f32.s32
+; CHECK-NOVFP-DAG: bl __aeabi_i2f
+
+; CHECK-FP16-DAG: vcvtb.f16.f32
+; CHECK-FP16-DAG: vcvtb.f32.f16
+; CHECK-LIBCALL-DAG: bl __aeabi_h2f
+; CHECK-LIBCALL-DAG: bl __aeabi_h2f
+
+; CHECK-VFP-DAG: vadd.f32
+; CHECK-NOVFP-DAG: bl __aeabi_fadd
+
+; CHECK-FP16-DAG: vcvtb.f16.f32
+; CHECK-LIBCALL-DAG: bl __aeabi_f2h
+define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = sitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/ARM/fp16-v3.ll b/test/CodeGen/ARM/fp16-v3.ll
index 6ed9c9d22c9d..e26455e61e7f 100644
--- a/test/CodeGen/ARM/fp16-v3.ll
+++ b/test/CodeGen/ARM/fp16-v3.ll
@@ -1,14 +1,16 @@
-; RUN: llc -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mattr=+fp16 < %s | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "armv7a--none-eabi"
; CHECK-LABEL: test_vec3:
-; CHECK: vcvtb.f32.f16
-; CHECK: vcvt.f32.s32
-; CHECK: vadd.f32
-; CHECK-NEXT: vcvtb.f16.f32 [[SREG:s[0-9]+]], {{.*}}
-; CHECK-NEXT: vmov [[RREG1:r[0-9]+]], [[SREG]]
+; CHECK-DAG: vcvtb.f32.f16 [[SREG1:s[0-9]+]],
+; CHECK-DAG: vcvt.f32.s32 [[SREG2:s[0-9]+]],
+; CHECK-DAG: vcvtb.f16.f32 [[SREG3:s[0-9]+]], [[SREG2]]
+; CHECK-DAG: vcvtb.f32.f16 [[SREG4:s[0-9]+]], [[SREG3]]
+; CHECK: vadd.f32 [[SREG5:s[0-9]+]], [[SREG4]], [[SREG1]]
+; CHECK-NEXT: vcvtb.f16.f32 [[SREG6:s[0-9]+]], [[SREG5]]
+; CHECK-NEXT: vmov [[RREG1:r[0-9]+]], [[SREG6]]
; CHECK-NEXT: uxth [[RREG2:r[0-9]+]], [[RREG1]]
; CHECK-NEXT: pkhbt [[RREG3:r[0-9]+]], [[RREG1]], [[RREG1]], lsl #16
; CHECK-DAG: strh [[RREG1]], [r0, #4]
@@ -25,4 +27,16 @@ define void @test_vec3(<3 x half>* %arr, i32 %i) #0 {
ret void
}
+; CHECK-LABEL: test_bitcast:
+; CHECK: vcvtb.f16.f32
+; CHECK: vcvtb.f16.f32
+; CHECK: vcvtb.f16.f32
+; CHECK: pkhbt
+; CHECK: uxth
+define void @test_bitcast(<3 x half> %inp, <3 x i16>* %arr) #0 {
+ %bc = bitcast <3 x half> %inp to <3 x i16>
+ store <3 x i16> %bc, <3 x i16>* %arr, align 8
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/ARM/fp16.ll b/test/CodeGen/ARM/fp16.ll
index 73d5c36a9c2f..b2cccd832198 100644
--- a/test/CodeGen/ARM/fp16.ll
+++ b/test/CodeGen/ARM/fp16.ll
@@ -1,9 +1,17 @@
; RUN: llc -mtriple=armv7a--none-eabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-HARDFLOAT-EABI %s
; RUN: llc -mtriple=armv7a--none-gnueabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-HARDFLOAT-GNU %s
-; RUN: llc -mattr=+vfp3,+fp16 < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-FP16 %s
+; RUN: llc -mtriple=armv7a--none-musleabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-HARDFLOAT-GNU %s
; RUN: llc -mtriple=armv8-eabihf < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARMV8 %s
; RUN: llc -mtriple=thumbv7m-eabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SOFTFLOAT-EABI %s
; RUN: llc -mtriple=thumbv7m-gnueabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SOFTFLOAT-GNU %s
+; RUN: llc -mtriple=thumbv7m-musleabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SOFTFLOAT-GNU %s
+
+;; +fp16 is special: it has f32->f16 (unlike v7), but not f64->f16 (unlike v8).
+;; This exposes unsafe-fp-math optimization opportunities; test that.
+; RUN: llc -mattr=+vfp3,+fp16 < %s |\
+; RUN: FileCheck --check-prefix=CHECK --check-prefix=CHECK-FP16 --check-prefix=CHECK-FP16-SAFE %s
+; RUN: llc -mattr=+vfp3,+fp16 < %s -enable-unsafe-fp-math |\
+; RUN: FileCheck --check-prefix=CHECK --check-prefix=CHECK-FP16 --check-prefix=CHECK-FP16-UNSAFE %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32"
target triple = "armv7---eabihf"
@@ -76,7 +84,10 @@ define i16 @test_to_fp16(double %in) {
; CHECK-HARDFLOAT-GNU: bl __aeabi_d2h
-; CHECK-FP16: bl __aeabi_d2h
+; CHECK-FP16-SAFE: bl __aeabi_d2h
+
+; CHECK-FP16-UNSAFE: vcvt.f32.f64 s0, d0
+; CHECK-FP16-UNSAFE-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-ARMV8: vcvtb.f16.f64 [[TMP:s[0-9]+]], d0
; CHECK-ARMV8: vmov r0, [[TMP]]
diff --git a/test/CodeGen/ARM/globals.ll b/test/CodeGen/ARM/globals.ll
index e6aa2db744d5..399d5208ae2c 100644
--- a/test/CodeGen/ARM/globals.ll
+++ b/test/CodeGen/ARM/globals.ll
@@ -15,7 +15,7 @@ define i32 @test1() {
; DarwinStatic: ldr r0, [r0]
; DarwinStatic: bx lr
-; DarwinStatic: .align 2
+; DarwinStatic: .p2align 2
; DarwinStatic: LCPI0_0:
; DarwinStatic: .long {{_G$}}
@@ -26,12 +26,12 @@ define i32 @test1() {
; DarwinDynamic: ldr r0, [r0]
; DarwinDynamic: bx lr
-; DarwinDynamic: .align 2
+; DarwinDynamic: .p2align 2
; DarwinDynamic: LCPI0_0:
; DarwinDynamic: .long L_G$non_lazy_ptr
; DarwinDynamic: .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
-; DarwinDynamic: .align 2
+; DarwinDynamic: .p2align 2
; DarwinDynamic: L_G$non_lazy_ptr:
; DarwinDynamic: .indirect_symbol _G
; DarwinDynamic: .long 0
@@ -46,12 +46,12 @@ define i32 @test1() {
; DarwinPIC-NOT: ldr
; DarwinPIC: bx lr
-; DarwinPIC: .align 2
+; DarwinPIC: .p2align 2
; DarwinPIC: LCPI0_0:
; DarwinPIC: .long L_G$non_lazy_ptr-(LPC0_0+8)
; DarwinPIC: .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
-; DarwinPIC: .align 2
+; DarwinPIC: .p2align 2
; DarwinPIC: L_G$non_lazy_ptr:
; DarwinPIC: .indirect_symbol _G
; DarwinPIC: .long 0
@@ -66,7 +66,7 @@ define i32 @test1() {
; LinuxPIC: ldr r0, [r0]
; LinuxPIC: bx lr
-; LinuxPIC: .align 2
+; LinuxPIC: .p2align 2
; LinuxPIC: .LCPI0_0:
; LinuxPIC: .Ltmp0:
; LinuxPIC: .long G(GOT_PREL)-((.LPC0_0+8)-.Ltmp0)
diff --git a/test/CodeGen/ARM/half.ll b/test/CodeGen/ARM/half.ll
index b40eaf638519..ad039b9d6865 100644
--- a/test/CodeGen/ARM/half.ll
+++ b/test/CodeGen/ARM/half.ll
@@ -41,7 +41,7 @@ define float @test_extend32(half* %addr) {
define double @test_extend64(half* %addr) {
; CHECK-LABEL: test_extend64:
-; CHECK-OLD: blx ___extendhfsf2
+; CHECK-OLD: bl ___extendhfsf2
; CHECK-OLD: vcvt.f64.f32
; CHECK-F16: vcvtb.f32.f16
; CHECK-F16: vcvt.f64.f32
@@ -54,7 +54,7 @@ define double @test_extend64(half* %addr) {
define void @test_trunc32(float %in, half* %addr) {
; CHECK-LABEL: test_trunc32:
-; CHECK-OLD: blx ___truncsfhf2
+; CHECK-OLD: bl ___truncsfhf2
; CHECK-F16: vcvtb.f16.f32
; CHECK-V8: vcvtb.f16.f32
%val16 = fptrunc float %in to half
@@ -65,8 +65,8 @@ define void @test_trunc32(float %in, half* %addr) {
define void @test_trunc64(double %in, half* %addr) {
; CHECK-LABEL: test_trunc64:
-; CHECK-OLD: blx ___truncdfhf2
-; CHECK-F16: blx ___truncdfhf2
+; CHECK-OLD: bl ___truncdfhf2
+; CHECK-F16: bl ___truncdfhf2
; CHECK-V8: vcvtb.f16.f64
%val16 = fptrunc double %in to half
store half %val16, half* %addr
diff --git a/test/CodeGen/ARM/hello.ll b/test/CodeGen/ARM/hello.ll
index 08e6104bbcf6..b03a60ab60c9 100644
--- a/test/CodeGen/ARM/hello.ll
+++ b/test/CodeGen/ARM/hello.ll
@@ -16,9 +16,11 @@ define i32 @main() {
declare i32 @puts(i8*)
+; CHECK-LABEL: main
; CHECK: mov
; CHECK-NOT: mov
+; CHECK-FP-ELIM-LABEL: main
; CHECK-FP-ELIM: mov
; CHECK-FP-ELIM: mov
; CHECK-FP-ELIM-NOT: mov
diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
index a96b6e8a1e83..967d6ebce277 100644
--- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll
+++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
@@ -27,7 +27,7 @@ declare i8* @bar(i32, i8*, i8*)
; CHECK-NEXT: LBB{{[0-9_]+}}:
; CHECK-NEXT: movw r0, #4567
; CHECK-NEXT: [[FOOCALL]]:
-; CHECK-NEXT: blx _foo
+; CHECK-NEXT: bl _foo
;
; CHECK-PROB: BB#0:
; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
diff --git a/test/CodeGen/ARM/inlineasm-X-allocation.ll b/test/CodeGen/ARM/inlineasm-X-allocation.ll
new file mode 100644
index 000000000000..e88d668f5ccf
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-X-allocation.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=armv7-none-eabi -mattr=-neon,-vfpv2 %s -o - | FileCheck %s -check-prefix=novfp
+; RUN: llc -mtriple=armv7-none-eabi -mattr=+neon %s -float-abi=hard -o - | FileCheck %s -check-prefix=vfp
+
+; vfp-LABEL: f1
+; vfp: vadd.f32 s0, s0, s0
+
+; In the novfp case, the compiler is forced to assign a core register.
+; Although this register class can't be used with the vadd.f32 instruction,
+; the compiler behaves as expected, since it is allowed to emit anything.
+
+; novfp-LABEL: f1
+; novfp: vadd.f32 r0, r0, r0
+
+; This can be generated by a function such as:
+; void f1(float f) {asm volatile ("vadd.f32 $0, $0, $0" : : "X" (f));}
+
+define arm_aapcs_vfpcc void @f1(float %f) {
+entry:
+ call void asm sideeffect "vadd.f32 $0, $0, $0", "X" (float %f) nounwind
+ ret void
+}
diff --git a/test/CodeGen/ARM/inlineasm-X-constraint.ll b/test/CodeGen/ARM/inlineasm-X-constraint.ll
new file mode 100644
index 000000000000..d3d53df11b56
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-X-constraint.ll
@@ -0,0 +1,157 @@
+; RUN: llc -mtriple=armv7-none-eabi -mattr=+neon < %s -o - | FileCheck %s
+
+; The following functions test the use case where an X constraint is used to
+; add a dependency between an assembly instruction (vmsr in this case) and
+; another instruction. In each function, we use a different type for the
+; X constraint argument.
+;
+; We can get something similar from the following C code:
+; double f1(double f, int pscr_value) {
+; asm volatile("vmsr fpscr,%0" : "=X" ((f)): "r" (pscr_value));
+; return f+f;
+; }
+
+; CHECK-LABEL: f1
+; CHECK: vmsr fpscr
+; CHECK: vadd.f64
+
+define arm_aapcs_vfpcc double @f1(double %f, i32 %pscr_value) {
+entry:
+ %f.addr = alloca double, align 8
+ store double %f, double* %f.addr, align 8
+ call void asm sideeffect "vmsr fpscr,$1", "=*X,r"(double* nonnull %f.addr, i32 %pscr_value) nounwind
+ %0 = load double, double* %f.addr, align 8
+ %add = fadd double %0, %0
+ ret double %add
+}
+
+; int f2(int f, int pscr_value) {
+; asm volatile("vmsr fpscr,%0" : "=X" ((f)): "r" (pscr_value));
+; return f+f;
+; }
+
+; CHECK-LABEL: f2
+; CHECK: vmsr fpscr
+; CHECK: mul
+define arm_aapcs_vfpcc i32 @f2(i32 %f, i32 %pscr_value) {
+entry:
+ %f.addr = alloca i32, align 4
+ store i32 %f, i32* %f.addr, align 4
+ call void asm sideeffect "vmsr fpscr,$1", "=*X,r"(i32* nonnull %f.addr, i32 %pscr_value) nounwind
+ %0 = load i32, i32* %f.addr, align 4
+ %mul = mul i32 %0, %0
+ ret i32 %mul
+}
+
+
+; int f3(int f, int pscr_value) {
+; asm volatile("vmsr fpscr,%0" : "=X" ((f)): "r" (pscr_value));
+; return f+f;
+; }
+
+; typedef signed char int8_t;
+; typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+; void f3 (void)
+; {
+; int8x8_t vector_res_int8x8;
+; unsigned int fpscr;
+; asm volatile ("vmsr fpscr,%1" : "=X" ((vector_res_int8x8)) : "r" (fpscr));
+; return vector_res_int8x8 * vector_res_int8x8;
+; }
+
+; CHECK-LABEL: f3
+; CHECK: vmsr fpscr
+; CHECK: vmul.i8
+define arm_aapcs_vfpcc <8 x i8> @f3() {
+entry:
+ %vector_res_int8x8 = alloca <8 x i8>, align 8
+ %0 = getelementptr inbounds <8 x i8>, <8 x i8>* %vector_res_int8x8, i32 0, i32 0
+ call void asm sideeffect "vmsr fpscr,$1", "=*X,r"(<8 x i8>* nonnull %vector_res_int8x8, i32 undef) nounwind
+ %1 = load <8 x i8>, <8 x i8>* %vector_res_int8x8, align 8
+ %mul = mul <8 x i8> %1, %1
+ ret <8 x i8> %mul
+}
+
+; We can emit integer constants.
+; We can get this from:
+; void f() {
+; int x = 2;
+; asm volatile ("add r0, r0, %0" : : "X" (x));
+; }
+;
+; CHECK-LABEL: f4
+; CHECK: add r0, r0, #2
+define void @f4() {
+entry:
+ tail call void asm sideeffect "add r0, r0, $0", "X"(i32 2)
+ ret void
+}
+
+; We can emit function labels. This is equivalent to the following C code:
+; void f(void) {
+; void (*x)(void) = &foo;
+; asm volatile ("bl %0" : : "X" (x));
+; }
+; CHECK-LABEL: f5
+; CHECK: bl f4
+define void @f5() {
+entry:
+ tail call void asm sideeffect "bl $0", "X"(void ()* nonnull @f4)
+ ret void
+}
+
+declare void @foo(...)
+
+; This tests the behavior of the X constraint when used on function pointers,
+; or functions with a cast. In the first asm call we figure out that this
+; is a function pointer and emit the label. However, in the second asm call
+; we can't see through the bitcast and we end up having to lower this constraint
+; to something else. This is not ideal, but it is correct behaviour according
+; to the definition of the X constraint.
+;
+; In this case (and other cases where we could have emitted something else),
+; what we're doing with the X constraint is not particularly useful either,
+; since the user could have used "r" in this situation for the same effect.
+
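+; A rough C equivalent of f6 (hypothetical, for illustration only; the test
+; itself is written directly in IR) might look like:
+; void f6(void) {
+;   asm volatile ("bl %0" : : "X" (foo));
+;   asm volatile ("bl %0" : : "X" ((void (*)(void))&f4));
+; }
+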
+; CHECK-LABEL: f6
+; CHECK: bl foo
+; CHECK: bl r
+
+define void @f6() nounwind {
+entry:
+ tail call void asm sideeffect "bl $0", "X"(void (...)* @foo) nounwind
+ tail call void asm sideeffect "bl $0", "X"(void (...)* bitcast (void ()* @f4 to void (...)*)) nounwind
+ ret void
+}
+
+; The following IR can be generated from C code with a function like:
+; void a() {
+; void* a = &&A;
+; asm volatile ("bl %0" : : "X" (a));
+; A:
+; return;
+; }
+;
+; Ideally this would give the block address of bb, but it requires us to see
+; through blockaddress, which we can't do at the moment. This might break some
+; existing use cases where a user would expect to get a block label and instead
+; gets the block address in a register. However, note that according to the
+; "no constraints" definition this behaviour is correct (although not very nice).
+
+; CHECK-LABEL: f7
+; CHECK: bl
+define void @f7() {
+ call void asm sideeffect "bl $0", "X"( i8* blockaddress(@f7, %bb) )
+ br label %bb
+bb:
+ ret void
+}
+
+; If we use a constraint "=*X", we should get a store back to *%x (in r0).
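+; A rough C analogue (hypothetical, for illustration only) would be something
+; like:
+; void f8(int *x) { asm volatile ("add %0, r0, r0" : "=X" (*x)); }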
+; CHECK-LABEL: f8
+; CHECK: str r{{.*}}, [r0]
+define void @f8(i32 *%x) {
+entry:
+ tail call void asm sideeffect "add $0, r0, r0", "=*X"(i32 *%x)
+ ret void
+}
diff --git a/test/CodeGen/ARM/inlineasm-ldr-pseudo.ll b/test/CodeGen/ARM/inlineasm-ldr-pseudo.ll
index f63e4b0b3a17..98665f056a21 100644
--- a/test/CodeGen/ARM/inlineasm-ldr-pseudo.ll
+++ b/test/CodeGen/ARM/inlineasm-ldr-pseudo.ll
@@ -9,9 +9,9 @@
; CHECK: 0: 00 00 9f e5 ldr r0, [pc]
; CHECK: 4: 0e f0 a0 e1 mov pc, lr
; Make sure the constant pool entry comes after the return
-; CHECK: 8: 01 00 00 00
+; CHECK: 8: 78 56 34 12
define i32 @foo() nounwind {
entry:
- %0 = tail call i32 asm sideeffect "ldr $0,=1", "=r"() nounwind
+ %0 = tail call i32 asm sideeffect "ldr $0,=0x12345678", "=r"() nounwind
ret i32 %0
}
diff --git a/test/CodeGen/ARM/inlineasm3.ll b/test/CodeGen/ARM/inlineasm3.ll
index eb7ba59b69bf..f725f534c73d 100644
--- a/test/CodeGen/ARM/inlineasm3.ll
+++ b/test/CodeGen/ARM/inlineasm3.ll
@@ -6,6 +6,7 @@
define void @t() nounwind {
entry:
+; CHECK-LABEL: t
; CHECK: vmov.I64 q15, #0
; CHECK: vmov.32 d30[0],
; CHECK: vmov q8, q15
@@ -19,6 +20,7 @@ entry:
define void @t2() nounwind {
entry:
+; CHECK-LABEL: t2
; CHECK: vmov d30, d16
; CHECK: vmov.32 r0, d30[0]
%asmtmp2 = tail call i32 asm sideeffect "vmov d30, $1\0Avmov.32 $0, d30[0]\0A", "=r,w,~{d30}"(<2 x i32> undef) nounwind
@@ -64,7 +66,7 @@ ret i32 0
define float @t6(float %y) nounwind {
entry:
-; CHECK: t6
+; CHECK-LABEL: t6
; CHECK: flds s15, s0
%0 = tail call float asm "flds s15, $0", "=x"() nounwind
ret float %0
@@ -74,7 +76,7 @@ entry:
define double @t7(double %y) nounwind {
entry:
-; CHECK: t7
+; CHECK-LABEL: t7
; CHECK: flds s15, d0
%0 = tail call double asm "flds s15, $0", "=x"() nounwind
ret double %0
@@ -84,7 +86,7 @@ entry:
define float @t8(float %y) nounwind {
entry:
-; CHECK: t8
+; CHECK-LABEL: t8
; CHECK: flds s15, s0
%0 = tail call float asm "flds s15, $0", "=t"() nounwind
ret float %0
@@ -94,7 +96,7 @@ entry:
define i32 @t9(i32 %r0) nounwind {
entry:
-; CHECK: t9
+; CHECK-LABEL: t9
; CHECK: movw r0, #27182
%0 = tail call i32 asm "movw $0, $1", "=r,j"(i32 27182) nounwind
ret i32 %0
@@ -104,7 +106,7 @@ entry:
define void @t10(i8* %f, i32 %g) nounwind {
entry:
-; CHECK: t10
+; CHECK-LABEL: t10
; CHECK: str r1, [r0]
%f.addr = alloca i8*, align 4
store i8* %f, i8** %f.addr, align 4
@@ -116,8 +118,19 @@ entry:
define <4 x i32> @t11(i32* %p) nounwind {
entry:
-; CHECK: t11
+; CHECK-LABEL: t11
; CHECK: vld1.s32 {d16[], d17[]}, [r0]
%0 = tail call <4 x i32> asm "vld1.s32 {${0:e}[], ${0:f}[]}, [$1]", "=w,r"(i32* %p) nounwind
ret <4 x i32> %0
}
+
+; Bugzilla PR26038
+
+define i32 @fn1() local_unnamed_addr nounwind {
+; CHECK-LABEL: fn1
+entry:
+; CHECK: mov [[addr:r[0-9]+]], #5
+; CHECK: ldrh {{.*}}[[addr]]
+ %0 = tail call i32 asm "ldrh $0, $1", "=r,*Q"(i8* inttoptr (i32 5 to i8*)) nounwind
+ ret i32 %0
+}
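+
+; The "*Q" constraint asks for a memory operand whose address is a single base
+; register with no offset, so the constant address 5 has to be materialised
+; into a register first; the two CHECK lines above verify exactly that.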
diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
index 43bd5815a558..794f672534dc 100644
--- a/test/CodeGen/ARM/interrupt-attr.ll
+++ b/test/CodeGen/ARM/interrupt-attr.ll
@@ -35,15 +35,15 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
; Normal AAPCS function (r0-r3 pushed onto stack by hardware, lr set to
; appropriate sentinel so no special return needed).
; CHECK-M-LABEL: irq_fn:
-; CHECK-M: push.w {r4, r7, r11, lr}
-; CHECK-M: add.w r11, sp, #8
+; CHECK-M: push {r4, r6, r7, lr}
+; CHECK-M: add r7, sp, #8
; CHECK-M: mov r4, sp
; CHECK-M: bfc r4, #0, #3
; CHECK-M: mov sp, r4
; CHECK-M: bl _bar
-; CHECK-M: sub.w r4, r11, #8
+; CHECK-M: sub.w r4, r7, #8
; CHECK-M: mov sp, r4
-; CHECK-M: pop.w {r4, r7, r11, pc}
+; CHECK-M: pop {r4, r6, r7, pc}
call arm_aapcscc void @bar()
ret void
diff --git a/test/CodeGen/ARM/interval-update-remat.ll b/test/CodeGen/ARM/interval-update-remat.ll
new file mode 100644
index 000000000000..6391d4c29604
--- /dev/null
+++ b/test/CodeGen/ARM/interval-update-remat.ll
@@ -0,0 +1,162 @@
+; RUN: llc -verify-regalloc < %s
+; PR27275: When enabling remat for a vreg defined by PHIs, make sure the update
+; of the live range removes the dead PHI. Otherwise, we may end up with PHIs
+; with incorrect operands, which will trigger assertions or verifier failures
+; in later passes.
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7-apple-ios9.0.0"
+
+%class.SOCKSClientSocketPoolTest_AsyncSOCKSConnectError_Test.1.226.276.1301.2326 = type { %class.MockTransportClientSocketPool.0.225.275.1300.2325, i32 }
+%class.MockTransportClientSocketPool.0.225.275.1300.2325 = type { i8 }
+%class.StaticSocketDataProvider.6.231.281.1306.2331 = type { i8, %struct.MockConnect.5.230.280.1305.2330 }
+%struct.MockConnect.5.230.280.1305.2330 = type { %class.IPEndPoint.4.229.279.1304.2329 }
+%class.IPEndPoint.4.229.279.1304.2329 = type { %class.IPAddress.3.228.278.1303.2328 }
+%class.IPAddress.3.228.278.1303.2328 = type { %"class.(anonymous namespace)::vector.2.227.277.1302.2327" }
+%"class.(anonymous namespace)::vector.2.227.277.1302.2327" = type { i8 }
+%class.TestCompletionCallback.9.234.284.1309.2334 = type { %class.TestCompletionCallbackTemplate.8.233.283.1308.2333, i32 }
+%class.TestCompletionCallbackTemplate.8.233.283.1308.2333 = type { i32 }
+%class.AssertionResult.24.249.299.1324.2349 = type { i8, %class.scoped_ptr.23.248.298.1323.2348 }
+%class.scoped_ptr.23.248.298.1323.2348 = type { %class.Trans_NS___1_basic_string.18.243.293.1318.2343* }
+%class.Trans_NS___1_basic_string.18.243.293.1318.2343 = type { %class.Trans_NS___1___libcpp_compressed_pair_imp.17.242.292.1317.2342 }
+%class.Trans_NS___1___libcpp_compressed_pair_imp.17.242.292.1317.2342 = type { %"struct.Trans_NS___1_basic_string<char, int, int>::__rep.16.241.291.1316.2341" }
+%"struct.Trans_NS___1_basic_string<char, int, int>::__rep.16.241.291.1316.2341" = type { %"struct.Trans_NS___1_basic_string<char, int, int>::__long.15.240.290.1315.2340" }
+%"struct.Trans_NS___1_basic_string<char, int, int>::__long.15.240.290.1315.2340" = type { i64, i32 }
+%class.AssertHelper.10.235.285.1310.2335 = type { i8 }
+%class.Message.13.238.288.1313.2338 = type { %class.scoped_ptr.0.12.237.287.1312.2337 }
+%class.scoped_ptr.0.12.237.287.1312.2337 = type { %"class.(anonymous namespace)::basic_stringstream.11.236.286.1311.2336"* }
+%"class.(anonymous namespace)::basic_stringstream.11.236.286.1311.2336" = type { i8 }
+%class.scoped_refptr.19.244.294.1319.2344 = type { i8 }
+%class.BoundNetLog.20.245.295.1320.2345 = type { i32 }
+%struct.MockReadWrite.7.232.282.1307.2332 = type { i32 }
+%"class.(anonymous namespace)::basic_iostream.22.247.297.1322.2347" = type { i8 }
+%class.ClientSocketHandle.14.239.289.1314.2339 = type { i8 }
+%"class.(anonymous namespace)::__vector_base.21.246.296.1321.2346" = type { i8 }
+
+@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+
+define void @_ZN53SOCKSClientSocketPoolTest_AsyncSOCKSConnectError_Test6m_fn10Ev(%class.SOCKSClientSocketPoolTest_AsyncSOCKSConnectError_Test.1.226.276.1301.2326* %this) align 2 {
+entry:
+ %socket_data = alloca %class.StaticSocketDataProvider.6.231.281.1306.2331, align 1
+ %agg.tmp = alloca %struct.MockConnect.5.230.280.1305.2330, align 1
+ %callback = alloca %class.TestCompletionCallback.9.234.284.1309.2334, align 4
+ %gtest_ar = alloca %class.AssertionResult.24.249.299.1324.2349, align 4
+ %temp.lvalue = alloca %class.AssertHelper.10.235.285.1310.2335, align 1
+ %agg.tmp10 = alloca %class.Message.13.238.288.1313.2338, align 4
+ %ref.tmp = alloca %class.Trans_NS___1_basic_string.18.243.293.1318.2343, align 4
+ %agg.tmp16 = alloca %class.scoped_refptr.19.244.294.1319.2344, align 1
+ %agg.tmp18 = alloca %class.BoundNetLog.20.245.295.1320.2345, align 4
+ %call2 = call %class.StaticSocketDataProvider.6.231.281.1306.2331* @_ZN24StaticSocketDataProviderC1EP13MockReadWritejS1_j(%class.StaticSocketDataProvider.6.231.281.1306.2331* nonnull %socket_data, %struct.MockReadWrite.7.232.282.1307.2332* undef, i32 1, %struct.MockReadWrite.7.232.282.1307.2332* null, i32 0)
+ %call3 = call %struct.MockConnect.5.230.280.1305.2330* @_ZN11MockConnectC1Ev(%struct.MockConnect.5.230.280.1305.2330* nonnull %agg.tmp)
+ call void @_ZN24StaticSocketDataProvider5m_fn8E11MockConnect(%class.StaticSocketDataProvider.6.231.281.1306.2331* nonnull %socket_data, %struct.MockConnect.5.230.280.1305.2330* nonnull %agg.tmp)
+ %call5 = call %class.TestCompletionCallback.9.234.284.1309.2334* @_ZN22TestCompletionCallbackC1Ev(%class.TestCompletionCallback.9.234.284.1309.2334* nonnull %callback)
+ %transport_socket_pool_ = getelementptr inbounds %class.SOCKSClientSocketPoolTest_AsyncSOCKSConnectError_Test.1.226.276.1301.2326, %class.SOCKSClientSocketPoolTest_AsyncSOCKSConnectError_Test.1.226.276.1301.2326* %this, i32 0, i32 0
+ %call6 = call i32 @_ZN29MockTransportClientSocketPool5m_fn9Ev(%class.MockTransportClientSocketPool.0.225.275.1300.2325* %transport_socket_pool_)
+ call void @_Z11CmpHelperEQPcS_xx(%class.AssertionResult.24.249.299.1324.2349* nonnull sret %gtest_ar, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i64 0, i64 undef)
+ %tmp = load i8, i8* undef, align 4
+ %tobool.i = icmp eq i8 %tmp, 0
+ br i1 %tobool.i, label %if.else, label %if.end
+
+if.else: ; preds = %entry
+ br i1 undef, label %_ZN15AssertionResult5m_fn6Ev.exit, label %cond.true.i
+
+cond.true.i: ; preds = %if.else
+ %call4.i = call i8* @_ZN25Trans_NS___1_basic_stringIciiE5m_fn1Ev(%class.Trans_NS___1_basic_string.18.243.293.1318.2343* nonnull undef)
+ br label %_ZN15AssertionResult5m_fn6Ev.exit
+
+_ZN15AssertionResult5m_fn6Ev.exit: ; preds = %cond.true.i, %if.else
+ %cond.i = phi i8* [ %call4.i, %cond.true.i ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), %if.else ]
+ %call9 = call %class.AssertHelper.10.235.285.1310.2335* @_ZN12AssertHelperC1EPKc(%class.AssertHelper.10.235.285.1310.2335* nonnull %temp.lvalue, i8* %cond.i)
+ %call11 = call %class.Message.13.238.288.1313.2338* @_ZN7MessageC1Ev(%class.Message.13.238.288.1313.2338* nonnull %agg.tmp10)
+ call void @_ZN12AssertHelperaSE7Message(%class.AssertHelper.10.235.285.1310.2335* nonnull %temp.lvalue, %class.Message.13.238.288.1313.2338* nonnull %agg.tmp10)
+ %call.i.i.i.i27 = call zeroext i1 @_Z6IsTruev()
+ %brmerge = or i1 false, undef
+ br i1 %brmerge, label %_ZN7MessageD1Ev.exit33, label %delete.notnull.i.i.i.i32
+
+delete.notnull.i.i.i.i32: ; preds = %_ZN15AssertionResult5m_fn6Ev.exit
+ %call.i.i.i.i.i.i31 = call %"class.(anonymous namespace)::basic_iostream.22.247.297.1322.2347"* @_ZN12_GLOBAL__N_114basic_iostreamD2Ev(%"class.(anonymous namespace)::basic_iostream.22.247.297.1322.2347"* undef)
+ call void @_ZdlPv(i8* undef)
+ br label %_ZN7MessageD1Ev.exit33
+
+_ZN7MessageD1Ev.exit33: ; preds = %delete.notnull.i.i.i.i32, %_ZN15AssertionResult5m_fn6Ev.exit
+ %call13 = call %class.AssertHelper.10.235.285.1310.2335* @_ZN12AssertHelperD1Ev(%class.AssertHelper.10.235.285.1310.2335* nonnull %temp.lvalue)
+ br label %if.end
+
+if.end: ; preds = %_ZN7MessageD1Ev.exit33, %entry
+ %message_.i.i = getelementptr inbounds %class.AssertionResult.24.249.299.1324.2349, %class.AssertionResult.24.249.299.1324.2349* %gtest_ar, i32 0, i32 1
+ %call.i.i.i = call %class.scoped_ptr.23.248.298.1323.2348* @_ZN10scoped_ptrI25Trans_NS___1_basic_stringIciiEED2Ev(%class.scoped_ptr.23.248.298.1323.2348* %message_.i.i)
+ call void @llvm.memset.p0i8.i32(i8* null, i8 0, i32 12, i32 4, i1 false)
+ call void @_ZN25Trans_NS___1_basic_stringIciiE5m_fn2Ev(%class.Trans_NS___1_basic_string.18.243.293.1318.2343* nonnull %ref.tmp)
+ call void @_Z19CreateSOCKSv5Paramsv(%class.scoped_refptr.19.244.294.1319.2344* nonnull sret %agg.tmp16)
+ %callback_.i = getelementptr inbounds %class.TestCompletionCallback.9.234.284.1309.2334, %class.TestCompletionCallback.9.234.284.1309.2334* %callback, i32 0, i32 1
+ %pool_ = getelementptr inbounds %class.SOCKSClientSocketPoolTest_AsyncSOCKSConnectError_Test.1.226.276.1301.2326, %class.SOCKSClientSocketPoolTest_AsyncSOCKSConnectError_Test.1.226.276.1301.2326* %this, i32 0, i32 1
+ %tmp1 = getelementptr inbounds %class.BoundNetLog.20.245.295.1320.2345, %class.BoundNetLog.20.245.295.1320.2345* %agg.tmp18, i32 0, i32 0
+ store i32 0, i32* %tmp1, align 4
+ call void @_ZN18ClientSocketHandle5m_fn3IPiEEvRK25Trans_NS___1_basic_stringIciiE13scoped_refptr15RequestPriorityN16ClientSocketPool13RespectLimitsERiT_11BoundNetLog(%class.ClientSocketHandle.14.239.289.1314.2339* nonnull undef, %class.Trans_NS___1_basic_string.18.243.293.1318.2343* nonnull dereferenceable(12) %ref.tmp, %class.scoped_refptr.19.244.294.1319.2344* nonnull %agg.tmp16, i32 0, i32 1, i32* nonnull dereferenceable(4) %callback_.i, i32* %pool_, %class.BoundNetLog.20.245.295.1320.2345* nonnull %agg.tmp18)
+ %call19 = call %class.BoundNetLog.20.245.295.1320.2345* @_ZN11BoundNetLogD1Ev(%class.BoundNetLog.20.245.295.1320.2345* nonnull %agg.tmp18)
+ call void @_Z11CmpHelperEQPcS_xx(%class.AssertionResult.24.249.299.1324.2349* nonnull sret undef, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i64 -1, i64 0)
+ br i1 undef, label %if.then.i.i.i.i, label %_ZN7MessageD1Ev.exit
+
+if.then.i.i.i.i: ; preds = %if.end
+ %tmp2 = load %"class.(anonymous namespace)::basic_stringstream.11.236.286.1311.2336"*, %"class.(anonymous namespace)::basic_stringstream.11.236.286.1311.2336"** undef, align 4
+ br label %_ZN7MessageD1Ev.exit
+
+_ZN7MessageD1Ev.exit: ; preds = %if.then.i.i.i.i, %if.end
+ %connect_.i.i = getelementptr inbounds %class.StaticSocketDataProvider.6.231.281.1306.2331, %class.StaticSocketDataProvider.6.231.281.1306.2331* %socket_data, i32 0, i32 1
+ %tmp3 = bitcast %struct.MockConnect.5.230.280.1305.2330* %connect_.i.i to %"class.(anonymous namespace)::__vector_base.21.246.296.1321.2346"*
+ %call.i.i.i.i.i.i.i.i.i.i = call %"class.(anonymous namespace)::__vector_base.21.246.296.1321.2346"* @_ZN12_GLOBAL__N_113__vector_baseD2Ev(%"class.(anonymous namespace)::__vector_base.21.246.296.1321.2346"* %tmp3)
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+declare %class.StaticSocketDataProvider.6.231.281.1306.2331* @_ZN24StaticSocketDataProviderC1EP13MockReadWritejS1_j(%class.StaticSocketDataProvider.6.231.281.1306.2331* returned, %struct.MockReadWrite.7.232.282.1307.2332*, i32, %struct.MockReadWrite.7.232.282.1307.2332*, i32) unnamed_addr
+
+declare void @_ZN24StaticSocketDataProvider5m_fn8E11MockConnect(%class.StaticSocketDataProvider.6.231.281.1306.2331*, %struct.MockConnect.5.230.280.1305.2330*)
+
+declare %struct.MockConnect.5.230.280.1305.2330* @_ZN11MockConnectC1Ev(%struct.MockConnect.5.230.280.1305.2330* returned) unnamed_addr
+
+declare %class.TestCompletionCallback.9.234.284.1309.2334* @_ZN22TestCompletionCallbackC1Ev(%class.TestCompletionCallback.9.234.284.1309.2334* returned) unnamed_addr
+
+declare i32 @_ZN29MockTransportClientSocketPool5m_fn9Ev(%class.MockTransportClientSocketPool.0.225.275.1300.2325*)
+
+declare %class.AssertHelper.10.235.285.1310.2335* @_ZN12AssertHelperC1EPKc(%class.AssertHelper.10.235.285.1310.2335* returned, i8*) unnamed_addr
+
+declare void @_ZN12AssertHelperaSE7Message(%class.AssertHelper.10.235.285.1310.2335*, %class.Message.13.238.288.1313.2338*)
+
+declare %class.Message.13.238.288.1313.2338* @_ZN7MessageC1Ev(%class.Message.13.238.288.1313.2338* returned) unnamed_addr
+
+declare %class.AssertHelper.10.235.285.1310.2335* @_ZN12AssertHelperD1Ev(%class.AssertHelper.10.235.285.1310.2335* returned) unnamed_addr
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+declare void @_ZN18ClientSocketHandle5m_fn3IPiEEvRK25Trans_NS___1_basic_stringIciiE13scoped_refptr15RequestPriorityN16ClientSocketPool13RespectLimitsERiT_11BoundNetLog(%class.ClientSocketHandle.14.239.289.1314.2339*, %class.Trans_NS___1_basic_string.18.243.293.1318.2343* dereferenceable(12), %class.scoped_refptr.19.244.294.1319.2344*, i32, i32, i32* dereferenceable(4), i32*, %class.BoundNetLog.20.245.295.1320.2345*)
+
+declare void @_Z19CreateSOCKSv5Paramsv(%class.scoped_refptr.19.244.294.1319.2344* sret)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+declare %class.BoundNetLog.20.245.295.1320.2345* @_ZN11BoundNetLogD1Ev(%class.BoundNetLog.20.245.295.1320.2345* returned) unnamed_addr
+
+declare %class.scoped_refptr.19.244.294.1319.2344* @_ZN13scoped_refptrD1Ev(%class.scoped_refptr.19.244.294.1319.2344* returned) unnamed_addr
+
+declare %"class.(anonymous namespace)::__vector_base.21.246.296.1321.2346"* @_ZN12_GLOBAL__N_113__vector_baseD2Ev(%"class.(anonymous namespace)::__vector_base.21.246.296.1321.2346"* returned) unnamed_addr
+
+declare i8* @_ZN25Trans_NS___1_basic_stringIciiE5m_fn1Ev(%class.Trans_NS___1_basic_string.18.243.293.1318.2343*)
+
+declare zeroext i1 @_Z6IsTruev()
+
+declare void @_ZdlPv(i8*)
+
+declare %"class.(anonymous namespace)::basic_iostream.22.247.297.1322.2347"* @_ZN12_GLOBAL__N_114basic_iostreamD2Ev(%"class.(anonymous namespace)::basic_iostream.22.247.297.1322.2347"* returned) unnamed_addr
+
+declare %class.scoped_ptr.23.248.298.1323.2348* @_ZN10scoped_ptrI25Trans_NS___1_basic_stringIciiEED2Ev(%class.scoped_ptr.23.248.298.1323.2348* readonly returned) unnamed_addr align 2
+
+declare void @_Z11CmpHelperEQPcS_xx(%class.AssertionResult.24.249.299.1324.2349* sret, i8*, i8*, i64, i64)
+
+declare void @_ZN25Trans_NS___1_basic_stringIciiE5m_fn2Ev(%class.Trans_NS___1_basic_string.18.243.293.1318.2343*)
+
+attributes #0 = { argmemonly nounwind }
diff --git a/test/CodeGen/ARM/intrinsics-coprocessor.ll b/test/CodeGen/ARM/intrinsics-coprocessor.ll
new file mode 100644
index 000000000000..8fea49b39fb6
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics-coprocessor.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 | FileCheck %s
+
+define void @coproc(i8* %i) nounwind {
+entry:
+ ; CHECK: mrc p7, #1, r{{[0-9]+}}, c1, c1, #4
+ %0 = tail call i32 @llvm.arm.mrc(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
+ ; CHECK: mcr p7, #1, r{{[0-9]+}}, c1, c1, #4
+ tail call void @llvm.arm.mcr(i32 7, i32 1, i32 %0, i32 1, i32 1, i32 4) nounwind
+ ; CHECK: mrc2 p7, #1, r{{[0-9]+}}, c1, c1, #4
+ %1 = tail call i32 @llvm.arm.mrc2(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
+ ; CHECK: mcr2 p7, #1, r{{[0-9]+}}, c1, c1, #4
+ tail call void @llvm.arm.mcr2(i32 7, i32 1, i32 %1, i32 1, i32 1, i32 4) nounwind
+ ; CHECK: mcrr p7, #1, r{{[0-9]+}}, r{{[0-9]+}}, c1
+ tail call void @llvm.arm.mcrr(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
+ ; CHECK: mcrr2 p7, #1, r{{[0-9]+}}, r{{[0-9]+}}, c1
+ tail call void @llvm.arm.mcrr2(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
+ ; CHECK: cdp p7, #3, c1, c1, c1, #5
+ tail call void @llvm.arm.cdp(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
+ ; CHECK: cdp2 p7, #3, c1, c1, c1, #5
+ tail call void @llvm.arm.cdp2(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
+ ; CHECK: ldc p7, c3, [r{{[0-9]+}}]
+ tail call void @llvm.arm.ldc(i32 7, i32 3, i8* %i) nounwind
+ ; CHECK: ldcl p7, c3, [r{{[0-9]+}}]
+ tail call void @llvm.arm.ldcl(i32 7, i32 3, i8* %i) nounwind
+ ; CHECK: ldc2 p7, c3, [r{{[0-9]+}}]
+ tail call void @llvm.arm.ldc2(i32 7, i32 3, i8* %i) nounwind
+ ; CHECK: ldc2l p7, c3, [r{{[0-9]+}}]
+ tail call void @llvm.arm.ldc2l(i32 7, i32 3, i8* %i) nounwind
+ ; CHECK: stc p7, c3, [r{{[0-9]+}}]
+ tail call void @llvm.arm.stc(i32 7, i32 3, i8* %i) nounwind
+ ; CHECK: stcl p7, c3, [r{{[0-9]+}}]
+ tail call void @llvm.arm.stcl(i32 7, i32 3, i8* %i) nounwind
+ ; CHECK: stc2 p7, c3, [r{{[0-9]+}}]
+ tail call void @llvm.arm.stc2(i32 7, i32 3, i8* %i) nounwind
+ ; CHECK: stc2l p7, c3, [r{{[0-9]+}}]
+ tail call void @llvm.arm.stc2l(i32 7, i32 3, i8* %i) nounwind
+ ; CHECK: mrrc p1, #2, r{{[0-9]+}}, r{{[0-9]+}}, c3
+ %2 = tail call { i32, i32 } @llvm.arm.mrrc(i32 1, i32 2, i32 3) nounwind
+ ; CHECK: mrrc2 p1, #2, r{{[0-9]+}}, r{{[0-9]+}}, c3
+ %3 = tail call { i32, i32 } @llvm.arm.mrrc2(i32 1, i32 2, i32 3) nounwind
+ ret void
+}
+
+declare void @llvm.arm.ldc(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.ldcl(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.ldc2(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.ldc2l(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.stc(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.stcl(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.stc2(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.stc2l(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.cdp2(i32, i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.cdp(i32, i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcrr2(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcrr(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcr2(i32, i32, i32, i32, i32, i32) nounwind
+
+declare i32 @llvm.arm.mrc2(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcr(i32, i32, i32, i32, i32, i32) nounwind
+
+declare i32 @llvm.arm.mrc(i32, i32, i32, i32, i32) nounwind
+
+declare { i32, i32 } @llvm.arm.mrrc(i32, i32, i32) nounwind
+
+declare { i32, i32 } @llvm.arm.mrrc2(i32, i32, i32) nounwind
diff --git a/test/CodeGen/ARM/intrinsics.ll b/test/CodeGen/ARM/intrinsics.ll
deleted file mode 100644
index 54cc3e0a027c..000000000000
--- a/test/CodeGen/ARM/intrinsics.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 | FileCheck %s
-; RUN: llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 | FileCheck %s
-
-define void @coproc() nounwind {
-entry:
- ; CHECK: mrc
- %0 = tail call i32 @llvm.arm.mrc(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
- ; CHECK: mcr
- tail call void @llvm.arm.mcr(i32 7, i32 1, i32 %0, i32 1, i32 1, i32 4) nounwind
- ; CHECK: mrc2
- %1 = tail call i32 @llvm.arm.mrc2(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
- ; CHECK: mcr2
- tail call void @llvm.arm.mcr2(i32 7, i32 1, i32 %1, i32 1, i32 1, i32 4) nounwind
- ; CHECK: mcrr
- tail call void @llvm.arm.mcrr(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
- ; CHECK: mcrr2
- tail call void @llvm.arm.mcrr2(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
- ; CHECK: cdp
- tail call void @llvm.arm.cdp(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
- ; CHECK: cdp2
- tail call void @llvm.arm.cdp2(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
- ret void
-}
-
-declare void @llvm.arm.cdp2(i32, i32, i32, i32, i32, i32) nounwind
-
-declare void @llvm.arm.cdp(i32, i32, i32, i32, i32, i32) nounwind
-
-declare void @llvm.arm.mcrr2(i32, i32, i32, i32, i32) nounwind
-
-declare void @llvm.arm.mcrr(i32, i32, i32, i32, i32) nounwind
-
-declare void @llvm.arm.mcr2(i32, i32, i32, i32, i32, i32) nounwind
-
-declare i32 @llvm.arm.mrc2(i32, i32, i32, i32, i32) nounwind
-
-declare void @llvm.arm.mcr(i32, i32, i32, i32, i32, i32) nounwind
-
-declare i32 @llvm.arm.mrc(i32, i32, i32, i32, i32) nounwind
diff --git a/test/CodeGen/ARM/invalidated-save-point.ll b/test/CodeGen/ARM/invalidated-save-point.ll
new file mode 100644
index 000000000000..0ff153b6799d
--- /dev/null
+++ b/test/CodeGen/ARM/invalidated-save-point.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple thumbv7 -stop-after=if-converter < %s 2>&1 | FileCheck %s
+
+; Make sure the save point and restore point are dropped from MFI at
+; this point. Notably, if they aren't, they will be invalid and reference
+; a deleted block (%bb.-1.if.end).
+
+; CHECK-NOT: savePoint:
+; CHECK-NOT: restorePoint:
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7"
+
+define i32 @f(i32 %n) {
+entry:
+ %cmp = icmp ult i32 %n, 4
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ tail call void @g(i32 %n)
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 0, %if.end ], [ -1, %entry ]
+ ret i32 %retval.0
+}
+
+declare void @g(i32)
diff --git a/test/CodeGen/ARM/ldc2l.ll b/test/CodeGen/ARM/ldc2l.ll
new file mode 100644
index 000000000000..58d9509b1672
--- /dev/null
+++ b/test/CodeGen/ARM/ldc2l.ll
@@ -0,0 +1,11 @@
+; RUN: not llc < %s -mtriple=armv8-eabi 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=thumbv8-eabi 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.arm.ldc2l
+define void @ldc2l(i8* %i) nounwind {
+entry:
+ call void @llvm.arm.ldc2l(i32 1, i32 2, i8* %i) nounwind
+ ret void
+}
+
+declare void @llvm.arm.ldc2l(i32, i32, i8*) nounwind
diff --git a/test/CodeGen/ARM/ldm-base-writeback.ll b/test/CodeGen/ARM/ldm-base-writeback.ll
new file mode 100644
index 000000000000..375f58a24a19
--- /dev/null
+++ b/test/CodeGen/ARM/ldm-base-writeback.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O3 < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "armv7--linux-gnu"
+
+@a = global i32 0, align 4
+@b = global i32 0, align 4
+@c = global i32 0, align 4
+
+; CHECK-LABEL: bar:
+; CHECK: ldm r{{[0-9]}}!, {r0, r{{[0-9]}}, r{{[0-9]}}}
+define void @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize {
+ %1 = load i32, i32* @a, align 4
+ %2 = load i32, i32* @b, align 4
+ %3 = load i32, i32* @c, align 4
+ %4 = tail call i32 @baz(i32 %1, i32 %3) minsize optsize
+ %5 = tail call i32 @baz(i32 %2, i32 %3) minsize optsize
+ ret void
+}
+
+declare i32 @baz(i32,i32) minsize optsize
diff --git a/test/CodeGen/ARM/ldr_frame.ll b/test/CodeGen/ARM/ldr_frame.ll
index 01b18bccc337..24c10b42a445 100644
--- a/test/CodeGen/ARM/ldr_frame.ll
+++ b/test/CodeGen/ARM/ldr_frame.ll
@@ -1,5 +1,7 @@
; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
+; CHECK-LABEL: f1
+; CHECK-NOT: mov
define i32 @f1() {
%buf = alloca [32 x i32], align 4
%tmp = getelementptr [32 x i32], [32 x i32]* %buf, i32 0, i32 0
@@ -7,6 +9,8 @@ define i32 @f1() {
ret i32 %tmp1
}
+; CHECK-LABEL: f2
+; CHECK-NOT: mov
define i32 @f2() {
%buf = alloca [32 x i8], align 4
%tmp = getelementptr [32 x i8], [32 x i8]* %buf, i32 0, i32 0
@@ -15,6 +19,8 @@ define i32 @f2() {
ret i32 %tmp2
}
+; CHECK-LABEL: f3
+; CHECK-NOT: mov
define i32 @f3() {
%buf = alloca [32 x i32], align 4
%tmp = getelementptr [32 x i32], [32 x i32]* %buf, i32 0, i32 32
@@ -22,6 +28,8 @@ define i32 @f3() {
ret i32 %tmp1
}
+; CHECK-LABEL: f4
+; CHECK-NOT: mov
define i32 @f4() {
%buf = alloca [32 x i8], align 4
%tmp = getelementptr [32 x i8], [32 x i8]* %buf, i32 0, i32 2
@@ -29,6 +37,3 @@ define i32 @f4() {
%tmp2 = zext i8 %tmp1 to i32
ret i32 %tmp2
}
-
-; CHECK-NOT: mov
-
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index b2596346bfa1..dd97fbfd6404 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK -check-prefix=NORMAL
; rdar://6949835
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK -check-prefix=NORMAL
+
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=CHECK -check-prefix=CONSERVATIVE
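+;
+; With -arm-assume-misaligned-load-store the backend may not assume that the
+; loads and stores below are naturally aligned, so the CONSERVATIVE prefix
+; checks that no ldrd/strd pairs are formed for them.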
; Magic ARM pair hints work best with linearscan / fast.
@@ -15,12 +17,13 @@ declare void @use_i64(i64 %v)
define void @test_ldrd(i64 %a) nounwind readonly {
; CHECK-LABEL: test_ldrd:
-; CHECK: bl{{x?}} _get_ptr
+; NORMAL: bl{{x?}} _get_ptr
; A8: ldrd r0, r1, [r0]
; Cortex-M3 errata 602117: LDRD with base in list may result in incorrect base
; register when interrupted or faulted.
; M3-NOT: ldrd r[[REGNUM:[0-9]+]], {{r[0-9]+}}, [r[[REGNUM]]]
-; CHECK: bl{{x?}} _use_i64
+; CONSERVATIVE-NOT: ldrd
+; NORMAL: bl{{x?}} _use_i64
%ptr = call i64* @get_ptr()
%v = load i64, i64* %ptr, align 8
call void @use_i64(i64 %v)
@@ -39,11 +42,10 @@ define void @test_ldrd(i64 %a) nounwind readonly {
; evict another live range or use callee saved regs. Sorry if the test
; is sensitive to Regalloc changes, but it is an interesting case.
;
-; BASIC: @f
+; CHECK-LABEL: f:
; BASIC: %bb
; BASIC: ldrd
; BASIC: str
-; GREEDY: @f
; GREEDY: %bb
; GREEDY: ldrd
; GREEDY: str
@@ -76,14 +78,15 @@ return: ; preds = %bb, %entry
@TestVar = external global %struct.Test
+; CHECK-LABEL: Func1:
define void @Func1() nounwind ssp {
-; CHECK: @Func1
entry:
; A8: movw [[BASE:r[0-9]+]], :lower16:{{.*}}TestVar{{.*}}
; A8: movt [[BASE]], :upper16:{{.*}}TestVar{{.*}}
; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], {{\[}}[[BASE]], #4]
; A8-NEXT: add [[FIELD1]], [[FIELD2]]
; A8-NEXT: str [[FIELD1]], {{\[}}[[BASE]]{{\]}}
+; CONSERVATIVE-NOT: ldrd
%orig_blocks = alloca [256 x i16], align 2
%0 = bitcast [256 x i16]* %orig_blocks to i8*
call void @llvm.lifetime.start(i64 512, i8* %0) nounwind
%tmp1 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 1), align 4
@@ -97,8 +100,9 @@ entry:
declare void @extfunc(i32, i32, i32, i32)
; CHECK-LABEL: Func2:
+; CONSERVATIVE-NOT: ldrd
; A8: ldrd
-; A8: blx
+; CHECK: bl{{x?}} _extfunc
; A8: pop
define void @Func2(i32* %p) {
entry:
@@ -116,12 +120,14 @@ entry:
; M3: strd r1, r0, [sp, #-8]!
; BASIC: strd r1, r0, [sp, #-8]!
; GREEDY: strd r0, r1, [sp, #-8]!
-; CHECK: @ InlineAsm Start
-; CHECK: @ InlineAsm End
+; CONSERVATIVE: strd r0, r1, [sp, #-8]!
+; NORMAL: @ InlineAsm Start
+; NORMAL: @ InlineAsm End
; A8: ldrd r2, r1, [sp]
; M3: ldrd r2, r1, [sp]
; BASIC: ldrd r2, r1, [sp]
; GREEDY: ldrd r1, r2, [sp]
+; CONSERVATIVE: ldrd r1, r2, [sp]
; CHECK: bl{{x?}} _extfunc
define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
; force %v0 and %v1 to be spilled
@@ -134,8 +140,9 @@ define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
declare void @extfunc2(i32*, i32, i32)
; CHECK-LABEL: ldrd_postupdate_dec:
-; CHECK: ldrd r1, r2, [r0], #-8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; NORMAL: ldrd r1, r2, [r0], #-8
+; CONSERVATIVE-NOT: ldrd
+; CHECK: bl{{x?}} _extfunc
define void @ldrd_postupdate_dec(i32* %p0) {
%p0.1 = getelementptr i32, i32* %p0, i32 1
%v0 = load i32, i32* %p0
@@ -146,8 +153,9 @@ define void @ldrd_postupdate_dec(i32* %p0) {
}
; CHECK-LABEL: ldrd_postupdate_inc:
-; CHECK: ldrd r1, r2, [r0], #8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; NORMAL: ldrd r1, r2, [r0], #8
+; CONSERVATIVE-NOT: ldrd
+; CHECK: bl{{x?}} _extfunc
define void @ldrd_postupdate_inc(i32* %p0) {
%p0.1 = getelementptr i32, i32* %p0, i32 1
%v0 = load i32, i32* %p0
@@ -158,8 +166,9 @@ define void @ldrd_postupdate_inc(i32* %p0) {
}
; CHECK-LABEL: strd_postupdate_dec:
-; CHECK: strd r1, r2, [r0], #-8
-; CHECK-NEXT: bx lr
+; NORMAL: strd r1, r2, [r0], #-8
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
%p0.1 = getelementptr i32, i32* %p0, i32 1
store i32 %v0, i32* %p0
@@ -169,8 +178,9 @@ define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
}
; CHECK-LABEL: strd_postupdate_inc:
-; CHECK: strd r1, r2, [r0], #8
-; CHECK-NEXT: bx lr
+; NORMAL: strd r1, r2, [r0], #8
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) {
%p0.1 = getelementptr i32, i32* %p0, i32 1
store i32 %v0, i32* %p0
diff --git a/test/CodeGen/ARM/ldstrex-m.ll b/test/CodeGen/ARM/ldstrex-m.ll
index 3d83a9d78e35..5b717f7f1ae9 100644
--- a/test/CodeGen/ARM/ldstrex-m.ll
+++ b/test/CodeGen/ARM/ldstrex-m.ll
@@ -1,4 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8m.main-none-eabi | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8m.base-none-eabi | FileCheck %s
; CHECK-LABEL: f0:
; CHECK-NOT: ldrexd
diff --git a/test/CodeGen/ARM/legalize-unaligned-load.ll b/test/CodeGen/ARM/legalize-unaligned-load.ll
index fa5b21aa4a23..eb4e942f0742 100644
--- a/test/CodeGen/ARM/legalize-unaligned-load.ll
+++ b/test/CodeGen/ARM/legalize-unaligned-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -code-model=default -relocation-model=default -mtriple=armv7l-unknown-linux-gnueabihf -mcpu=generic %s -o - | FileCheck %s
+; RUN: llc -O3 -code-model=default -mtriple=armv7l-unknown-linux-gnueabihf -mcpu=generic %s -o - | FileCheck %s
; Check that we respect the existing chain between loads and stores when we
; legalize unaligned loads.
; Test case from PR24669.
diff --git a/test/CodeGen/ARM/litpool-licm.ll b/test/CodeGen/ARM/litpool-licm.ll
new file mode 100644
index 000000000000..dc6b37feaf05
--- /dev/null
+++ b/test/CodeGen/ARM/litpool-licm.ll
@@ -0,0 +1,46 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf -relocation-model=pic %s -o - | FileCheck %s
+
+@var = thread_local global i32 0, align 4
+
+define void @func(i32 %n) {
+; CHECK-LABEL: func:
+; CHECK: ldr [[REF1:r[0-9]+]], [[CP1:.LCPI[0-9]+_[0-9]+]]
+; CHECK: ldr [[REF2:r[0-9]+]], [[CP2:.LCPI[0-9]+_[0-9]+]]
+
+; CHECK: [[PCPOS1:.LPC[0-9]+_[0-9]+]]:
+; CHECK-NEXT: add [[REF1]], pc
+
+; CHECK: [[PCPOS2:.LPC[0-9]+_[0-9]+]]:
+; CHECK-NEXT: add [[REF2]], pc
+
+; CHECK: [[CP1]]:
+; CHECK-NEXT: [[CP1_TMP:.Ltmp[0-9]+]]:
+; CHECK-NEXT: .long var(TLSGD)-(([[PCPOS1]]+4)-[[CP1_TMP]])
+
+; CHECK: [[CP2]]:
+; CHECK-NEXT: [[CP2_TMP:.Ltmp[0-9]+]]:
+; CHECK-NEXT: .long var(TLSGD)-(([[PCPOS2]]+4)-[[CP2_TMP]])
+
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ %inc, %next ], [ 0, %entry ]
+ %val = load i32, i32* @var
+ %tst = icmp eq i32 %val, 0
+ br i1 %tst, label %next, label %call
+
+call:
+ tail call void @foo(i32* nonnull @var) #2
+ br label %next
+
+next:
+ %inc = add i32 %i, 1
+ %stop = icmp eq i32 %inc, %n
+ br i1 %stop, label %done, label %loop
+
+done:
+ ret void
+}
+
+declare void @foo(i32*) \ No newline at end of file
diff --git a/test/CodeGen/ARM/local-call.ll b/test/CodeGen/ARM/local-call.ll
new file mode 100644
index 000000000000..a38df62ff905
--- /dev/null
+++ b/test/CodeGen/ARM/local-call.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -filetype=obj %s -o %t
+; RUN: llvm-objdump -macho -d %t | FileCheck %s
+
+; This function just messes up the offsets enough to make the libcall in
+; test_local_call unencodable with a blx.
+define void @thing() {
+ ret void
+}
+
+define i64 @__udivdi3(i64 %a, i64 %b) {
+ ret i64 %b
+}
+
+define i64 @test_local_call(i64 %a, i64 %b) {
+; CHECK-LABEL: test_local_call:
+; CHECK: bl ___udivdi3
+
+%res = udiv i64 %a, %b
+ ret i64 %res
+} \ No newline at end of file
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
index 3f30fd40b7e7..cc2d745aae8e 100644
--- a/test/CodeGen/ARM/longMAC.ll
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -116,3 +116,32 @@ define i64 @MACLongTest8(i64 %acc, i32 %lhs, i32 %rhs) {
ret i64 %add
}
+define i64 @MACLongTest9(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) {
+;CHECK-LABEL: MACLongTest9:
+;CHECK-V7-LE:umaal
+;CHECK-V7-BE:umaal
+;CHECK-NOT:umaal
+ %conv = zext i32 %lhs to i64
+ %conv1 = zext i32 %rhs to i64
+ %mul = mul nuw i64 %conv1, %conv
+ %conv2 = zext i32 %lo to i64
+ %add = add i64 %mul, %conv2
+ %conv3 = zext i32 %hi to i64
+ %add2 = add i64 %add, %conv3
+ ret i64 %add2
+}
+
+define i64 @MACLongTest10(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) {
+;CHECK-LABEL: MACLongTest10:
+;CHECK-V7-LE:umaal
+;CHECK-V7-BE:umaal
+;CHECK-NOT:umaal
+ %conv = zext i32 %lhs to i64
+ %conv1 = zext i32 %rhs to i64
+ %mul = mul nuw i64 %conv1, %conv
+ %conv2 = zext i32 %lo to i64
+ %conv3 = zext i32 %hi to i64
+ %add = add i64 %conv2, %conv3
+ %add2 = add i64 %add, %mul
+ ret i64 %add2
+}
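+
+; Both tests compute (lhs * rhs) + lo + hi and differ only in the order of the
+; final additions; on targets that have it, this should fold to a single UMAAL,
+; which computes RdHi:RdLo = Rn * Rm + RdHi + RdLo, as the CHECK-V7 lines
+; expect.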
diff --git a/test/CodeGen/ARM/lsr-code-insertion.ll b/test/CodeGen/ARM/lsr-code-insertion.ll
index aa2b2d26d121..766710fd1d64 100644
--- a/test/CodeGen/ARM/lsr-code-insertion.ll
+++ b/test/CodeGen/ARM/lsr-code-insertion.ll
@@ -9,8 +9,8 @@
;
; CHECK: ldr [[R6:r[0-9*]+]], LCP
; CHECK: cmp {{.*}}, [[R6]]
-; CHECK: ldrle
-; CHECK-NEXT: strle
+; CHECK-NOT: lt
+; CHECK: strlt
target triple = "arm-apple-darwin8"
diff --git a/test/CodeGen/ARM/macho-frame-offset.ll b/test/CodeGen/ARM/macho-frame-offset.ll
new file mode 100644
index 000000000000..f3dacf66b6c3
--- /dev/null
+++ b/test/CodeGen/ARM/macho-frame-offset.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple thumbv7m-apple-macho -disable-fp-elim -o - %s | FileCheck %s
+
+define void @func() {
+; CHECK-LABEL: func:
+; CHECK: push {r6, r7, lr}
+; CHECK: add r7, sp, #4
+ call void @bar()
+ call void asm sideeffect "", "~{r11}"()
+ ret void
+}
+
+declare void @bar()
diff --git a/test/CodeGen/ARM/memcpy-no-inline.ll b/test/CodeGen/ARM/memcpy-no-inline.ll
new file mode 100644
index 000000000000..126546095e1f
--- /dev/null
+++ b/test/CodeGen/ARM/memcpy-no-inline.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=thumbv7m-arm-none-eabi -mcpu=cortex-m3 < %s | FileCheck %s
+
+%struct.mystruct = type { [31 x i8] }
+
+@.str = private unnamed_addr constant [31 x i8] c"012345678901234567890123456789\00", align 1
+@.str.1 = private unnamed_addr constant [21 x i8] c"01234567890123456789\00", align 1
+
+@myglobal = common global %struct.mystruct zeroinitializer, align 1
+
+define void @foo() #0 {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: __aeabi_memcpy
+; CHECK-NOT: ldm
+ %mystring = alloca [31 x i8], align 1
+ %0 = getelementptr inbounds [31 x i8], [31 x i8]* %mystring, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str, i32 0, i32 0), i32 31, i32 1, i1 false)
+ ret void
+}
+
+define void @bar() #0 {
+entry:
+; CHECK-LABEL: bar:
+; CHECK-NOT: __aeabi_memcpy
+ %mystring = alloca [31 x i8], align 1
+ %0 = getelementptr inbounds [31 x i8], [31 x i8]* %mystring, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.1, i32 0, i32 0), i32 21, i32 1, i1 false)
+ ret void
+}
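+
+; foo copies 31 bytes while bar copies only 21; with minsize the larger copy
+; stays as an __aeabi_memcpy library call, whereas the smaller one is not
+; turned into a call, which is what the CHECK lines above verify.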
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
+
+attributes #0 = { minsize noinline nounwind optsize }
diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll
index 46fef7629cc4..bc60d8e4f0ec 100644
--- a/test/CodeGen/ARM/memfunc.ll
+++ b/test/CodeGen/ARM/memfunc.ll
@@ -5,6 +5,8 @@
; RUN: llc < %s -mtriple=arm-none-androideabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-EABI --check-prefix=CHECK
; RUN: llc < %s -mtriple=arm-none-gnueabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK
; RUN: llc < %s -mtriple=arm-none-gnueabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK
+; RUN: llc < %s -mtriple=arm-none-musleabi -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK
+; RUN: llc < %s -mtriple=arm-none-musleabihf -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-GNUEABI --check-prefix=CHECK
define void @f1(i8* %dest, i8* %src) {
entry:
@@ -384,6 +386,8 @@ entry:
@arr5 = weak global [7 x i8] c"\01\02\03\04\05\06\07", align 1
@arr6 = weak_odr global [7 x i8] c"\01\02\03\04\05\06\07", align 1
@arr7 = external global [7 x i8], align 1
+@arr8 = internal global [128 x i8] undef
+@arr9 = weak_odr global [128 x i8] undef
define void @f9(i8* %dest, i32 %n) {
entry:
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr1, i32 0, i32 0), i32 %n, i32 1, i1 false)
@@ -393,28 +397,35 @@ entry:
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr5, i32 0, i32 0), i32 %n, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr6, i32 0, i32 0), i32 %n, i32 1, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr7, i32 0, i32 0), i32 %n, i32 1, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([128 x i8], [128 x i8]* @arr8, i32 0, i32 0), i32 %n, i32 1, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([128 x i8], [128 x i8]* @arr9, i32 0, i32 0), i32 %n, i32 1, i1 false)
unreachable
}
; CHECK: {{\.data|\.section.+data}}
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: arr1:
-; CHECK-IOS: .align 3
-; CHECK-DARWIN: .align 2
-; CHECK-EABI-NOT: .align
-; CHECK-GNUEABI-NOT: .align
+; CHECK-IOS: .p2align 3
+; CHECK-DARWIN: .p2align 2
+; CHECK-EABI-NOT: .p2align
+; CHECK-GNUEABI-NOT: .p2align
; CHECK: arr2:
; CHECK: {{\.section.+foo,bar}}
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: arr3:
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: arr4:
; CHECK: {{\.data|\.section.+data}}
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: arr5:
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: arr6:
+; CHECK: .p2align 4
+; CHECK: arr8:
+; CHECK: .p2align 4
+; CHECK: arr9:
+
; CHECK-NOT: arr7:
declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/ARM/minsize-call-cse.ll b/test/CodeGen/ARM/minsize-call-cse.ll
new file mode 100644
index 000000000000..072b76f03ba3
--- /dev/null
+++ b/test/CodeGen/ARM/minsize-call-cse.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; CHECK-LABEL: f:
+; CHECK: blx r
+; CHECK: blx r
+; CHECK: blx r
+define void @f() minsize optsize {
+entry:
+ call void @g(i32 45, i32 66)
+ call void @g(i32 88, i32 32)
+ call void @g(i32 55, i32 33)
+ ret void
+}
+
+; CHECK-LABEL: h:
+; CHECK: bl g
+; CHECK: bl g
+define void @h() minsize optsize {
+entry:
+ call void @g(i32 45, i32 66)
+ call void @g(i32 88, i32 32)
+ ret void
+}
+
+declare void @g(i32,i32)
diff --git a/test/CodeGen/ARM/movt.ll b/test/CodeGen/ARM/movt.ll
index 94c022ee2712..da9b698f2099 100644
--- a/test/CodeGen/ARM/movt.ll
+++ b/test/CodeGen/ARM/movt.ll
@@ -1,9 +1,11 @@
; RUN: llc -mtriple=arm-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
; rdar://7317664
+; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s
+
define i32 @t(i32 %X) nounwind {
; CHECK-LABEL: t:
-; CHECK: movt r0, #65535
+; CHECK: movt r{{[0-9]}}, #65535
entry:
%0 = or i32 %X, -65536
ret i32 %0
@@ -11,7 +13,7 @@ entry:
define i32 @t2(i32 %X) nounwind {
; CHECK-LABEL: t2:
-; CHECK: movt r0, #65534
+; CHECK: movt r{{[0-9]}}, #65534
entry:
%0 = or i32 %X, -131072
%1 = and i32 %0, -65537
diff --git a/test/CodeGen/ARM/msr-it-block.ll b/test/CodeGen/ARM/msr-it-block.ll
new file mode 100644
index 000000000000..0f9ff6b29d79
--- /dev/null
+++ b/test/CodeGen/ARM/msr-it-block.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -mtriple=thumbv6m-none-eabi | FileCheck %s --check-prefix=V6M --check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7m-none-eabi | FileCheck %s --check-prefix=V7M --check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7a-none-eabi | FileCheck %s --check-prefix=V7A --check-prefix=CHECK
+; RUN: llc < %s -mtriple=armv7a-none-eabi | FileCheck %s --check-prefix=V7A --check-prefix=CHECK
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7a-arm-none-eabi"
+
+define void @test_const(i32 %val) {
+; CHECK-LABEL: test_const:
+entry:
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %write_reg, label %exit
+
+write_reg:
+ tail call void @llvm.write_register.i32(metadata !0, i32 0)
+ tail call void @llvm.write_register.i32(metadata !0, i32 0)
+; V6M: msr apsr, {{r[0-9]+}}
+; V6M: msr apsr, {{r[0-9]+}}
+; V7M: msr apsr_nzcvq, {{r[0-9]+}}
+; V7M: msr apsr_nzcvq, {{r[0-9]+}}
+; V7A: msr APSR_nzcvqg, {{r[0-9]+}}
+; V7A: msr APSR_nzcvqg, {{r[0-9]+}}
+ br label %exit
+
+exit:
+ ret void
+}
+
+define void @test_var(i32 %val, i32 %apsr) {
+; CHECK-LABEL: test_var:
+entry:
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %write_reg, label %exit
+
+write_reg:
+ tail call void @llvm.write_register.i32(metadata !0, i32 %apsr)
+ tail call void @llvm.write_register.i32(metadata !0, i32 %apsr)
+; V6M: msr apsr, {{r[0-9]+}}
+; V6M: msr apsr, {{r[0-9]+}}
+; V7M: msr apsr_nzcvq, {{r[0-9]+}}
+; V7M: msr apsr_nzcvq, {{r[0-9]+}}
+; V7A: msr APSR_nzcvqg, {{r[0-9]+}}
+; V7A: msr APSR_nzcvqg, {{r[0-9]+}}
+ br label %exit
+
+exit:
+ ret void
+}
+
+
+declare void @llvm.write_register.i32(metadata, i32)
+
+!0 = !{!"apsr"}
diff --git a/test/CodeGen/ARM/none-macho.ll b/test/CodeGen/ARM/none-macho.ll
index 733ba4ba2d2c..fee459f4f5e1 100644
--- a/test/CodeGen/ARM/none-macho.ll
+++ b/test/CodeGen/ARM/none-macho.ll
@@ -43,8 +43,8 @@ define i32 @test_frame_ptr() {
; CHECK-LABEL: test_frame_ptr:
call void @test_trap()
- ; Frame pointer is r11.
-; CHECK: mov r11, sp
+ ; Frame pointer is r7.
+; CHECK: mov r7, sp
ret i32 42
}
@@ -58,9 +58,11 @@ define void @test_two_areas(%big_arr* %addr) {
; This goes with the choice of r7 as FP (largely). FP and LR have to be stored
; consecutively on the stack for the frame record to be valid, which means we
; need the 2 register-save areas employed by iOS.
-; CHECK-NON-FAST: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NON-FAST: push {r4, r5, r6, r7, lr}
+; CHECK-NON-FAST: push.w {r8, r9, r10, r11}
; ...
-; CHECK-NON-FAST: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NON-FAST: pop.w {r8, r9, r10, r11}
+; CHECK-NON-FAST: pop {r4, r5, r6, r7, pc}
ret void
}
diff --git a/test/CodeGen/ARM/pic.ll b/test/CodeGen/ARM/pic.ll
index 9fc7a63bd687..d12addb69d77 100644
--- a/test/CodeGen/ARM/pic.ll
+++ b/test/CodeGen/ARM/pic.ll
@@ -11,10 +11,10 @@ define void @test() {
entry:
%0 = call i32 @get()
-; CHECK: bl get(PLT)
+; CHECK: bl get
call void @put(i32 %0)
-; CHECK: bl put(PLT)
+; CHECK: bl put
ret void
}
diff --git a/test/CodeGen/ARM/pie.ll b/test/CodeGen/ARM/pie.ll
new file mode 100644
index 000000000000..1b1e6e62fda5
--- /dev/null
+++ b/test/CodeGen/ARM/pie.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=armv7-pc-linux-gnueabi -relocation-model=pic < %s | FileCheck %s
+
+@foo = global i32 42
+
+define i32* @get_foo() {
+ ret i32* @foo
+}
+
+; Test that we only use one load. Even that is only needed because there
+; don't seem to be pc-relative relocations for movw/movt.
+; CHECK: ldr r0, .LCPI0_0
+; CHECK-NEXT: .L{{.*}}:
+; CHECK-NEXT: add r0, pc, r0
+; CHECK-NEXT: bx lr
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"PIE Level", i32 2}
diff --git a/test/CodeGen/ARM/plt-relative-reloc.ll b/test/CodeGen/ARM/plt-relative-reloc.ll
new file mode 100644
index 000000000000..08dcfdf1298f
--- /dev/null
+++ b/test/CodeGen/ARM/plt-relative-reloc.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=armv7-unknown-linux -o - %s | FileCheck %s
+
+@vtable = constant [4 x i32] [i32 0,
+ i32 sub (i32 ptrtoint (void ()* @fn1 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32)),
+ i32 sub (i32 ptrtoint (void ()* @fn2 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32)),
+ i32 sub (i32 ptrtoint (void ()* @fn3 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32))
+]
+
+declare void @fn1() unnamed_addr
+declare void @fn2() unnamed_addr
+declare void @fn3()
+
+; CHECK: .long 0
+; CHECK-NEXT: .long (fn1(prel31)-vtable)-4
+; CHECK-NEXT: .long (fn2(prel31)-vtable)-4
+; CHECK-NEXT: .long (fn3-vtable)-4
diff --git a/test/CodeGen/ARM/popcnt.ll b/test/CodeGen/ARM/popcnt.ll
index 74f90640ca6e..fd61811f49cf 100644
--- a/test/CodeGen/ARM/popcnt.ll
+++ b/test/CodeGen/ARM/popcnt.ll
@@ -71,12 +71,28 @@ define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
ret <4 x i32> %tmp2
}
+define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
+; CHECK-LABEL: vcnt64:
+ %tmp1 = load <1 x i64>, <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %tmp1)
+ ret <1 x i64> %tmp2
+}
+
+define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind {
+; CHECK-LABEL: vcntQ64:
+ %tmp1 = load <2 x i64>, <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
+declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclz8:
diff --git a/test/CodeGen/ARM/pr26669.ll b/test/CodeGen/ARM/pr26669.ll
new file mode 100644
index 000000000000..6c28ddd2d848
--- /dev/null
+++ b/test/CodeGen/ARM/pr26669.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -globaldce -sjljehprepare < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7--ios5.0.0"
+
+define void @g() personality i32 (...)* @__gxx_personality_sj0 {
+entry:
+ %exn.slot = alloca i8*
+ %ehselector.slot = alloca i32
+ invoke void @f()
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ cleanup
+ br label %try.cont
+
+try.cont: ; preds = %catch, %invoke.cont
+ ret void
+}
+
+declare void @f()
+
+declare i32 @__gxx_personality_sj0(...)
+
+; CHECK-LABEL: define void @g(
+; CHECK: call void @llvm.eh.sjlj.callsite(
+; CHECK: call void @_Unwind_SjLj_Register(
+; CHECK: invoke void @f(
+; CHECK: landingpad
+; CHECK-NEXT: cleanup
+; CHECK: call void @_Unwind_SjLj_Unregister(
diff --git a/test/CodeGen/ARM/preferred-align.ll b/test/CodeGen/ARM/preferred-align.ll
index 8cd4ef615468..a9a17229e064 100644
--- a/test/CodeGen/ARM/preferred-align.ll
+++ b/test/CodeGen/ARM/preferred-align.ll
@@ -3,19 +3,19 @@
@var_agg = global {i8, i8} zeroinitializer
; CHECK: .globl var_agg
-; CHECK-NEXT: .align 2
+; CHECK-NEXT: .p2align 2
@var1 = global i1 zeroinitializer
; CHECK: .globl var1
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
@var8 = global i8 zeroinitializer
; CHECK: .globl var8
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
@var16 = global i16 zeroinitializer
; CHECK: .globl var16
-; CHECK-NEXT: .align 1 \ No newline at end of file
+; CHECK-NEXT: .p2align 1 \ No newline at end of file
diff --git a/test/CodeGen/ARM/rem_crash.ll b/test/CodeGen/ARM/rem_crash.ll
new file mode 100644
index 000000000000..bce597c030d2
--- /dev/null
+++ b/test/CodeGen/ARM/rem_crash.ll
@@ -0,0 +1,257 @@
+; RUN: llc < %s -mtriple=arm-unknown-unknown
+
+define i8 @test_minsize_uu8(i8 %x) minsize optsize {
+entry:
+ %0 = udiv i8 %x, 10
+ %1 = urem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_minsize_ss8(i8 %x) minsize optsize {
+entry:
+ %0 = sdiv i8 %x, 10
+ %1 = srem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_minsize_us8(i8 %x) minsize optsize {
+entry:
+ %0 = udiv i8 %x, 10
+ %1 = srem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_minsize_su8(i8 %x) minsize optsize {
+entry:
+ %0 = sdiv i8 %x, 10
+ %1 = urem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i16 @test_minsize_uu16(i16 %x) minsize optsize {
+entry:
+ %0 = udiv i16 %x, 10
+ %1 = urem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_minsize_ss16(i16 %x) minsize optsize {
+entry:
+ %0 = sdiv i16 %x, 10
+ %1 = srem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_minsize_us16(i16 %x) minsize optsize {
+entry:
+ %0 = udiv i16 %x, 10
+ %1 = srem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_minsize_su16(i16 %x) minsize optsize {
+entry:
+ %0 = sdiv i16 %x, 10
+ %1 = urem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i32 @test_minsize_uu32(i32 %x) minsize optsize {
+entry:
+ %0 = udiv i32 %x, 10
+ %1 = urem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_minsize_ss32(i32 %x) minsize optsize {
+entry:
+ %0 = sdiv i32 %x, 10
+ %1 = srem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_minsize_us32(i32 %x) minsize optsize {
+entry:
+ %0 = udiv i32 %x, 10
+ %1 = srem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_minsize_su32(i32 %x) minsize optsize {
+entry:
+ %0 = sdiv i32 %x, 10
+ %1 = urem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i64 @test_minsize_uu64(i64 %x) minsize optsize {
+entry:
+ %0 = udiv i64 %x, 10
+ %1 = urem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_minsize_ss64(i64 %x) minsize optsize {
+entry:
+ %0 = sdiv i64 %x, 10
+ %1 = srem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_minsize_us64(i64 %x) minsize optsize {
+entry:
+ %0 = udiv i64 %x, 10
+ %1 = srem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_minsize_su64(i64 %x) minsize optsize {
+entry:
+ %0 = sdiv i64 %x, 10
+ %1 = urem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i8 @test_uu8(i8 %x) optsize {
+entry:
+ %0 = udiv i8 %x, 10
+ %1 = urem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_ss8(i8 %x) optsize {
+entry:
+ %0 = sdiv i8 %x, 10
+ %1 = srem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_us8(i8 %x) optsize {
+entry:
+ %0 = udiv i8 %x, 10
+ %1 = srem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i8 @test_su8(i8 %x) optsize {
+entry:
+ %0 = sdiv i8 %x, 10
+ %1 = urem i8 %x, 10
+ %res = add i8 %0, %1
+ ret i8 %res
+}
+
+define i16 @test_uu16(i16 %x) optsize {
+entry:
+ %0 = udiv i16 %x, 10
+ %1 = urem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_ss16(i16 %x) optsize {
+entry:
+ %0 = sdiv i16 %x, 10
+ %1 = srem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_us16(i16 %x) optsize {
+entry:
+ %0 = udiv i16 %x, 10
+ %1 = srem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i16 @test_su16(i16 %x) optsize {
+entry:
+ %0 = sdiv i16 %x, 10
+ %1 = urem i16 %x, 10
+ %res = add i16 %0, %1
+ ret i16 %res
+}
+
+define i32 @test_uu32(i32 %x) optsize {
+entry:
+ %0 = udiv i32 %x, 10
+ %1 = urem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_ss32(i32 %x) optsize {
+entry:
+ %0 = sdiv i32 %x, 10
+ %1 = srem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_us32(i32 %x) optsize {
+entry:
+ %0 = udiv i32 %x, 10
+ %1 = srem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i32 @test_su32(i32 %x) optsize {
+entry:
+ %0 = sdiv i32 %x, 10
+ %1 = urem i32 %x, 10
+ %res = add i32 %0, %1
+ ret i32 %res
+}
+
+define i64 @test_uu64(i64 %x) optsize {
+entry:
+ %0 = udiv i64 %x, 10
+ %1 = urem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_ss64(i64 %x) optsize {
+entry:
+ %0 = sdiv i64 %x, 10
+ %1 = srem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_us64(i64 %x) optsize {
+entry:
+ %0 = udiv i64 %x, 10
+ %1 = srem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
+
+define i64 @test_su64(i64 %x) optsize {
+entry:
+ %0 = sdiv i64 %x, 10
+ %1 = urem i64 %x, 10
+ %res = add i64 %0, %1
+ ret i64 %res
+}
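Note: the new rem_crash.ll only verifies that llc does not crash; every function pairs a division and a remainder of the same operands, the shape that ARM targets without a hardware divider lower through EABI runtime helpers (for example __aeabi_uidivmod, named here only as an illustration), where the pair can share a single call. A condensed sketch of the pattern being exercised:

; Minimal sketch, assuming the __aeabi_uidivmod lowering described above;
; the udiv/urem pair on the same operands is what the test stresses.
define i32 @div_plus_rem(i32 %x) {
entry:
  %q = udiv i32 %x, 10
  %r = urem i32 %x, 10
  %res = add i32 %q, %r
  ret i32 %res
}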
diff --git a/test/CodeGen/ARM/returned-ext.ll b/test/CodeGen/ARM/returned-ext.ll
index 925e9e729f44..f592d0aec5f7 100644
--- a/test/CodeGen/ARM/returned-ext.ll
+++ b/test/CodeGen/ARM/returned-ext.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi | FileCheck %s -check-prefix=CHECKELF
-; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 | FileCheck %s -check-prefix=CHECKT2D
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -arm-this-return-forwarding | FileCheck %s -check-prefix=CHECKELF
+; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 -arm-this-return-forwarding | FileCheck %s -check-prefix=CHECKT2D
declare i16 @identity16(i16 returned %x)
declare i32 @identity32(i32 returned %x)
@@ -18,9 +18,9 @@ entry:
; CHECKELF: mov r0, [[SAVEX]]
; CHECKT2D-LABEL: test_identity:
; CHECKT2D: mov [[SAVEX:r[0-9]+]], r0
-; CHECKT2D: blx _identity16
+; CHECKT2D: bl _identity16
; CHECKT2D: uxth r0, r0
-; CHECKT2D: blx _identity32
+; CHECKT2D: bl _identity32
; CHECKT2D: mov r0, [[SAVEX]]
%call = tail call i16 @identity16(i16 %x)
%b = zext i16 %call to i32
@@ -49,9 +49,9 @@ entry:
; This shouldn't be required
; CHECKT2D: mov [[SAVEX:r[0-9]+]], r0
-; CHECKT2D: blx _retzext16
+; CHECKT2D: bl _retzext16
; CHECKT2D-NOT: uxth r0, {{r[0-9]+}}
-; CHECKT2D: blx _identity32
+; CHECKT2D: bl _identity32
; This shouldn't be required
; CHECKT2D: mov r0, [[SAVEX]]
@@ -72,9 +72,9 @@ entry:
; CHECKELF: mov r0, [[SAVEX]]
; CHECKT2D-LABEL: test_mismatched_ret:
; CHECKT2D: mov [[SAVEX:r[0-9]+]], r0
-; CHECKT2D: blx _retzext16
+; CHECKT2D: bl _retzext16
; CHECKT2D: sxth r0, {{r[0-9]+}}
-; CHECKT2D: blx _identity32
+; CHECKT2D: bl _identity32
; CHECKT2D: mov r0, [[SAVEX]]
%call = tail call i16 @retzext16(i16 %x)
%b = sext i16 %call to i32
@@ -92,9 +92,9 @@ entry:
; CHECKELF: b paramzext16
; CHECKT2D-LABEL: test_matched_paramext:
; CHECKT2D: uxth r0, r0
-; CHECKT2D: blx _paramzext16
+; CHECKT2D: bl _paramzext16
; CHECKT2D: uxth r0, r0
-; CHECKT2D: blx _identity32
+; CHECKT2D: bl _identity32
; CHECKT2D: b.w _paramzext16
%call = tail call i16 @paramzext16(i16 %x)
%b = zext i16 %call to i32
@@ -118,8 +118,8 @@ entry:
; CHECKELF: bl identity32
; CHECKELF: b paramzext16
; CHECKT2D-LABEL: test_matched_paramext2:
-; CHECKT2D: blx _paramzext16
-; CHECKT2D: blx _identity32
+; CHECKT2D: bl _paramzext16
+; CHECKT2D: bl _identity32
; CHECKT2D: b.w _paramzext16
%call = tail call i16 @paramzext16(i16 %x)
@@ -143,11 +143,11 @@ entry:
; CHECKT2D-LABEL: test_matched_bothext:
; CHECKT2D: uxth r0, r0
-; CHECKT2D: blx _bothzext16
+; CHECKT2D: bl _bothzext16
; CHECKT2D-NOT: uxth r0, r0
; FIXME: Tail call should be OK here
-; CHECKT2D: blx _identity32
+; CHECKT2D: bl _identity32
%call = tail call i16 @bothzext16(i16 %x)
%b = zext i16 %x to i32
@@ -167,9 +167,9 @@ entry:
; CHECKT2D-LABEL: test_mismatched_bothext:
; CHECKT2D: mov [[SAVEX:r[0-9]+]], r0
; CHECKT2D: uxth r0, {{r[0-9]+}}
-; CHECKT2D: blx _bothzext16
+; CHECKT2D: bl _bothzext16
; CHECKT2D: sxth r0, [[SAVEX]]
-; CHECKT2D: blx _identity32
+; CHECKT2D: bl _identity32
; CHECKT2D: mov r0, [[SAVEX]]
%call = tail call i16 @bothzext16(i16 %x)
%b = sext i16 %x to i32
diff --git a/test/CodeGen/ARM/sincos.ll b/test/CodeGen/ARM/sincos.ll
index 30b2664e3726..5be0044ddbd3 100644
--- a/test/CodeGen/ARM/sincos.ll
+++ b/test/CodeGen/ARM/sincos.ll
@@ -1,5 +1,8 @@
; RUN: llc < %s -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT
; RUN: llc < %s -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS
+; RUN: llc < %s -mtriple=armv7-linux-gnu -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT-GNU
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 \
+; RUN: --enable-unsafe-fp-math | FileCheck %s --check-prefix=SINCOS-GNU
; Combine sin / cos into a single call.
; rdar://12856873
@@ -9,9 +12,17 @@ entry:
; SINCOS-LABEL: test1:
; SINCOS: bl ___sincosf_stret
+; SINCOS-GNU-LABEL: test1:
+; SINCOS-GNU: bl sincosf
+
; NOOPT-LABEL: test1:
; NOOPT: bl _sinf
; NOOPT: bl _cosf
+
+; NOOPT-GNU-LABEL: test1:
+; NOOPT-GNU: bl sinf
+; NOOPT-GNU: bl cosf
+
%call = tail call float @sinf(float %x) nounwind readnone
%call1 = tail call float @cosf(float %x) nounwind readnone
%add = fadd float %call, %call1
@@ -23,9 +34,16 @@ entry:
; SINCOS-LABEL: test2:
; SINCOS: bl ___sincos_stret
+; SINCOS-GNU-LABEL: test2:
+; SINCOS-GNU: bl sincos
+
; NOOPT-LABEL: test2:
; NOOPT: bl _sin
; NOOPT: bl _cos
+
+; NOOPT-GNU-LABEL: test2:
+; NOOPT-GNU: bl sin
+; NOOPT-GNU: bl cos
%call = tail call double @sin(double %x) nounwind readnone
%call1 = tail call double @cos(double %x) nounwind readnone
%add = fadd double %call, %call1
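Note: the new GNU RUN lines exercise the sin/cos merge on Linux triples: with --enable-unsafe-fp-math the two libcalls on the same argument may be combined into one sincosf/sincos call (SINCOS-GNU), while the plain GNU and older iOS configurations keep separate calls (NOOPT-GNU/NOOPT). A condensed sketch of the IR shape the combine looks for, distilled from test1; the function name is illustrative:

declare float @sinf(float) readnone
declare float @cosf(float) readnone

define float @sin_plus_cos(float %x) {
entry:
  ; Both calls take the same %x, so the backend may fold them into a single
  ; combined sincosf call when the target and fast-math settings allow it.
  %s = tail call float @sinf(float %x)
  %c = tail call float @cosf(float %x)
  %sum = fadd float %s, %c
  ret float %sum
}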
diff --git a/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll b/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll
index a1abef9605ca..17393e44b12e 100644
--- a/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll
+++ b/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll
@@ -74,8 +74,8 @@ declare void @terminatev()
; %do.body.i.i.i.
; CHECK-LABEL: __Z4foo1c:
-; CHECK: blx __Znwm
-; CHECK: {{.*}}@ %entry.do.body.i.i.i_crit_edge
+; CHECK: bl __Znwm
+; CHECK: {{.*}}@ %do.body.i.i.i.preheader
; CHECK: str r0, [sp, [[OFFSET:#[0-9]+]]]
; CHECK: {{.*}}@ %do.body.i.i.i
; CHECK: ldr [[R0:r[0-9]+]], [sp, [[OFFSET]]]
diff --git a/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll b/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll
index b44b447b3dff..323d5037138e 100644
--- a/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll
+++ b/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll
@@ -2,7 +2,8 @@
; RUN: llc -mtriple=armv7-apple-ios -O1 < %s | FileCheck %s
; RUN: llc -mtriple=armv7-apple-ios -O2 < %s | FileCheck %s
; RUN: llc -mtriple=armv7-apple-ios -O3 < %s | FileCheck %s
-; RUN: llc -mtriple=armv7k-apple-ios < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-watchos -O3 < %s | FileCheck %s
+; RUN: llc -mtriple=armv7k-apple-ios < %s | FileCheck %s --check-prefix=CHECK-WATCH
; SjLjEHPrepare shouldn't crash when lowering empty structs.
;
@@ -16,6 +17,9 @@ entry:
; CHECK: bl __Unwind_SjLj_Register
; CHECK-NEXT: {{[A-Z][a-zA-Z0-9]*}}:
; CHECK-NEXT: bl _bar
+
+; CHECK-WATCH-NOT: bl __Unwind_SjLj_Register
+
invoke void @bar ()
to label %unreachable unwind label %handler
diff --git a/test/CodeGen/ARM/smul.ll b/test/CodeGen/ARM/smul.ll
index 13873f511e1f..4e6f7d5c67b0 100644
--- a/test/CodeGen/ARM/smul.ll
+++ b/test/CodeGen/ARM/smul.ll
@@ -1,11 +1,12 @@
; RUN: llc -mtriple=arm-eabi -mcpu=generic %s -o /dev/null
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
@x = weak global i16 0 ; <i16*> [#uses=1]
@y = weak global i16 0 ; <i16*> [#uses=0]
define i32 @f1(i32 %y) {
-; CHECK: f1
+; CHECK-LABEL: f1:
; CHECK: smulbt
%tmp = load i16, i16* @x ; <i16> [#uses=1]
%tmp1 = add i16 %tmp, 2 ; <i16> [#uses=1]
@@ -16,7 +17,7 @@ define i32 @f1(i32 %y) {
}
define i32 @f2(i32 %x, i32 %y) {
-; CHECK: f2
+; CHECK-LABEL: f2:
; CHECK: smultt
%tmp1 = ashr i32 %x, 16 ; <i32> [#uses=1]
%tmp3 = ashr i32 %y, 16 ; <i32> [#uses=1]
@@ -25,7 +26,7 @@ define i32 @f2(i32 %x, i32 %y) {
}
define i32 @f3(i32 %a, i16 %x, i32 %y) {
-; CHECK: f3
+; CHECK-LABEL: f3:
; CHECK: smlabt
%tmp = sext i16 %x to i32 ; <i32> [#uses=1]
%tmp2 = ashr i32 %y, 16 ; <i32> [#uses=1]
@@ -34,3 +35,107 @@ define i32 @f3(i32 %a, i16 %x, i32 %y) {
ret i32 %tmp5
}
+define i32 @f4(i32 %a, i32 %x, i32 %y) {
+; CHECK-LABEL: f4:
+; CHECK: smlatt
+ %tmp1 = ashr i32 %x, 16
+ %tmp3 = ashr i32 %y, 16
+ %tmp4 = mul i32 %tmp3, %tmp1
+ %tmp5 = add i32 %tmp4, %a
+ ret i32 %tmp5
+}
+
+define i32 @f5(i32 %a, i16 %x, i16 %y) {
+; CHECK-LABEL: f5:
+; CHECK: smlabb
+ %tmp1 = sext i16 %x to i32
+ %tmp3 = sext i16 %y to i32
+ %tmp4 = mul i32 %tmp3, %tmp1
+ %tmp5 = add i32 %tmp4, %a
+ ret i32 %tmp5
+}
+
+define i32 @f6(i32 %a, i16 %x, i32 %y) {
+; CHECK-LABEL: f6:
+; CHECK: smlabt
+ %tmp1 = sext i16 %x to i32
+ %tmp3 = ashr i32 %y, 16
+ %tmp4 = mul i32 %tmp3, %tmp1
+ %tmp5 = add i32 %tmp4, %a
+ ret i32 %tmp5
+}
+
+define i32 @f7(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f7:
+; CHECK: smlawb
+ %shl = shl i32 %b, 16
+ %shr = ashr exact i32 %shl, 16
+ %conv = sext i32 %a to i64
+ %conv2 = sext i32 %shr to i64
+ %mul = mul nsw i64 %conv2, %conv
+ %shr49 = lshr i64 %mul, 16
+ %conv5 = trunc i64 %shr49 to i32
+ %add = add nsw i32 %conv5, %c
+ ret i32 %add
+}
+
+define i32 @f8(i32 %a, i16 signext %b, i32 %c) {
+; CHECK-LABEL: f8:
+; CHECK: smlawb
+ %conv = sext i32 %a to i64
+ %conv1 = sext i16 %b to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %shr5 = lshr i64 %mul, 16
+ %conv2 = trunc i64 %shr5 to i32
+ %add = add nsw i32 %conv2, %c
+ ret i32 %add
+}
+
+define i32 @f9(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f9:
+; CHECK: smlawt
+ %conv = sext i32 %a to i64
+ %shr = ashr i32 %b, 16
+ %conv1 = sext i32 %shr to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %shr26 = lshr i64 %mul, 16
+ %conv3 = trunc i64 %shr26 to i32
+ %add = add nsw i32 %conv3, %c
+ ret i32 %add
+}
+
+define i32 @f10(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f10:
+; CHECK: smulwb
+ %shl = shl i32 %b, 16
+ %shr = ashr exact i32 %shl, 16
+ %conv = sext i32 %a to i64
+ %conv2 = sext i32 %shr to i64
+ %mul = mul nsw i64 %conv2, %conv
+ %shr37 = lshr i64 %mul, 16
+ %conv4 = trunc i64 %shr37 to i32
+ ret i32 %conv4
+}
+
+define i32 @f11(i32 %a, i16 signext %b, i32 %c) {
+; CHECK-LABEL: f11:
+; CHECK: smulwb
+ %conv = sext i32 %a to i64
+ %conv1 = sext i16 %b to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %shr4 = lshr i64 %mul, 16
+ %conv2 = trunc i64 %shr4 to i32
+ ret i32 %conv2
+}
+
+define i32 @f12(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: f12:
+; CHECK: smulwt
+ %conv = sext i32 %a to i64
+ %shr = ashr i32 %b, 16
+ %conv1 = sext i32 %shr to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %shr25 = lshr i64 %mul, 16
+ %conv3 = trunc i64 %shr25 to i32
+ ret i32 %conv3
+}
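Note: the added f4-f12 functions cover the halfword-multiply selection patterns: smulxy/smlaxy take 16-bit halves of both operands, while smulw/smlaw multiply a 32-bit value by a 16-bit value and keep bits [47:16] of the product. A condensed sketch of the smulwb shape checked in f10/f11; the function name is illustrative:

define i32 @smulwb_shape(i32 %a, i16 signext %b) {
entry:
  ; 32 x 16 signed multiply; taking bits [47:16] of the 64-bit product is
  ; what lets the backend select smulwb instead of a full long multiply.
  %wa  = sext i32 %a to i64
  %wb  = sext i16 %b to i64
  %mul = mul nsw i64 %wb, %wa
  %hi  = lshr i64 %mul, 16
  %res = trunc i64 %hi to i32
  ret i32 %res
}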
diff --git a/test/CodeGen/ARM/smulw.ll b/test/CodeGen/ARM/smulw.ll
deleted file mode 100644
index 8653903eee53..000000000000
--- a/test/CodeGen/ARM/smulw.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc -mtriple=arm--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumb--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
-
-; We cannot codegen the smulw[bt] or smlaw[bt] instructions for these functions,
-; as the top 16 bits of the result would differ
-
-define i32 @f1(i32 %a, i16 %b) {
-; CHECK-LABEL: f1:
-; CHECK: mul
-; CHECK: asr
- %tmp1 = sext i16 %b to i32
- %tmp2 = mul i32 %a, %tmp1
- %tmp3 = ashr i32 %tmp2, 16
- ret i32 %tmp3
-}
-
-define i32 @f2(i32 %a, i16 %b, i32 %c) {
-; CHECK-LABEL: f2:
-; CHECK: mul
-; CHECK: add{{.*}}, asr #16
- %tmp1 = sext i16 %b to i32
- %tmp2 = mul i32 %a, %tmp1
- %tmp3 = ashr i32 %tmp2, 16
- %tmp4 = add i32 %tmp3, %c
- ret i32 %tmp4
-}
diff --git a/test/CodeGen/ARM/special-reg-v8m-base.ll b/test/CodeGen/ARM/special-reg-v8m-base.ll
new file mode 100644
index 000000000000..20284daa0463
--- /dev/null
+++ b/test/CodeGen/ARM/special-reg-v8m-base.ll
@@ -0,0 +1,142 @@
+; RUN: not llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=V7M
+; RUN: llc < %s -mtriple=thumbv8m.base-none-eabi 2>&1 | FileCheck %s
+
+; V7M: LLVM ERROR: Invalid register name "sp_ns".
+
+define i32 @read_mclass_registers() nounwind {
+entry:
+ ; CHECK-LABEL: read_mclass_registers:
+ ; CHECK: mrs r0, apsr
+ ; CHECK: mrs r1, iapsr
+ ; CHECK: mrs r1, eapsr
+ ; CHECK: mrs r1, xpsr
+ ; CHECK: mrs r1, ipsr
+ ; CHECK: mrs r1, epsr
+ ; CHECK: mrs r1, iepsr
+ ; CHECK: mrs r1, msp
+ ; CHECK: mrs r1, psp
+ ; CHECK: mrs r1, primask
+ ; CHECK: mrs r1, control
+ ; CHECK: mrs r1, msplim
+ ; CHECK: mrs r1, psplim
+ ; CHECK: mrs r1, msp_ns
+ ; CHECK: mrs r1, psp_ns
+ ; CHECK: mrs r1, primask_ns
+ ; CHECK: mrs r1, control_ns
+ ; CHECK: mrs r1, sp_ns
+
+ %0 = call i32 @llvm.read_register.i32(metadata !0)
+ %1 = call i32 @llvm.read_register.i32(metadata !4)
+ %add1 = add i32 %1, %0
+ %2 = call i32 @llvm.read_register.i32(metadata !8)
+ %add2 = add i32 %add1, %2
+ %3 = call i32 @llvm.read_register.i32(metadata !12)
+ %add3 = add i32 %add2, %3
+ %4 = call i32 @llvm.read_register.i32(metadata !16)
+ %add4 = add i32 %add3, %4
+ %5 = call i32 @llvm.read_register.i32(metadata !17)
+ %add5 = add i32 %add4, %5
+ %6 = call i32 @llvm.read_register.i32(metadata !18)
+ %add6 = add i32 %add5, %6
+ %7 = call i32 @llvm.read_register.i32(metadata !19)
+ %add7 = add i32 %add6, %7
+ %8 = call i32 @llvm.read_register.i32(metadata !20)
+ %add8 = add i32 %add7, %8
+ %9 = call i32 @llvm.read_register.i32(metadata !21)
+ %add9 = add i32 %add8, %9
+ %10 = call i32 @llvm.read_register.i32(metadata !25)
+ %add10 = add i32 %add9, %10
+ %11 = call i32 @llvm.read_register.i32(metadata !26)
+ %add11 = add i32 %add10, %11
+ %12 = call i32 @llvm.read_register.i32(metadata !27)
+ %add12 = add i32 %add11, %12
+ %13 = call i32 @llvm.read_register.i32(metadata !28)
+ %add13 = add i32 %add12, %13
+ %14 = call i32 @llvm.read_register.i32(metadata !29)
+ %add14 = add i32 %add13, %14
+ %15 = call i32 @llvm.read_register.i32(metadata !32)
+ %add15 = add i32 %add14, %15
+ %16 = call i32 @llvm.read_register.i32(metadata !35)
+ %add16 = add i32 %add15, %16
+ %17 = call i32 @llvm.read_register.i32(metadata !36)
+ %add17 = add i32 %add16, %17
+ ret i32 %add10
+}
+
+define void @write_mclass_registers(i32 %x) nounwind {
+entry:
+ ; CHECK-LABEL: write_mclass_registers:
+ ; CHECK: msr apsr, r0
+ ; CHECK: msr apsr, r0
+ ; CHECK: msr iapsr, r0
+ ; CHECK: msr iapsr, r0
+ ; CHECK: msr eapsr, r0
+ ; CHECK: msr eapsr, r0
+ ; CHECK: msr xpsr, r0
+ ; CHECK: msr xpsr, r0
+ ; CHECK: msr ipsr, r0
+ ; CHECK: msr epsr, r0
+ ; CHECK: msr iepsr, r0
+ ; CHECK: msr msp, r0
+ ; CHECK: msr psp, r0
+ ; CHECK: msr primask, r0
+ ; CHECK: msr control, r0
+ ; CHECK: msr msplim, r0
+ ; CHECK: msr psplim, r0
+ ; CHECK: msr msp_ns, r0
+ ; CHECK: msr psp_ns, r0
+ ; CHECK: msr primask_ns, r0
+ ; CHECK: msr control_ns, r0
+ ; CHECK: msr sp_ns, r0
+
+ call void @llvm.write_register.i32(metadata !0, i32 %x)
+ call void @llvm.write_register.i32(metadata !1, i32 %x)
+ call void @llvm.write_register.i32(metadata !4, i32 %x)
+ call void @llvm.write_register.i32(metadata !5, i32 %x)
+ call void @llvm.write_register.i32(metadata !8, i32 %x)
+ call void @llvm.write_register.i32(metadata !9, i32 %x)
+ call void @llvm.write_register.i32(metadata !12, i32 %x)
+ call void @llvm.write_register.i32(metadata !13, i32 %x)
+ call void @llvm.write_register.i32(metadata !16, i32 %x)
+ call void @llvm.write_register.i32(metadata !17, i32 %x)
+ call void @llvm.write_register.i32(metadata !18, i32 %x)
+ call void @llvm.write_register.i32(metadata !19, i32 %x)
+ call void @llvm.write_register.i32(metadata !20, i32 %x)
+ call void @llvm.write_register.i32(metadata !21, i32 %x)
+ call void @llvm.write_register.i32(metadata !25, i32 %x)
+ call void @llvm.write_register.i32(metadata !26, i32 %x)
+ call void @llvm.write_register.i32(metadata !27, i32 %x)
+ call void @llvm.write_register.i32(metadata !28, i32 %x)
+ call void @llvm.write_register.i32(metadata !29, i32 %x)
+ call void @llvm.write_register.i32(metadata !32, i32 %x)
+ call void @llvm.write_register.i32(metadata !35, i32 %x)
+ call void @llvm.write_register.i32(metadata !36, i32 %x)
+ ret void
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+declare void @llvm.write_register.i32(metadata, i32) nounwind
+
+!0 = !{!"apsr"}
+!1 = !{!"apsr_nzcvq"}
+!4 = !{!"iapsr"}
+!5 = !{!"iapsr_nzcvq"}
+!8 = !{!"eapsr"}
+!9 = !{!"eapsr_nzcvq"}
+!12 = !{!"xpsr"}
+!13 = !{!"xpsr_nzcvq"}
+!16 = !{!"ipsr"}
+!17 = !{!"epsr"}
+!18 = !{!"iepsr"}
+!19 = !{!"msp"}
+!20 = !{!"psp"}
+!21 = !{!"primask"}
+!25 = !{!"control"}
+!26 = !{!"msplim"}
+!27 = !{!"psplim"}
+!28 = !{!"msp_ns"}
+!29 = !{!"psp_ns"}
+!32 = !{!"primask_ns"}
+!35 = !{!"control_ns"}
+!36 = !{!"sp_ns"}
+
diff --git a/test/CodeGen/ARM/special-reg-v8m-main.ll b/test/CodeGen/ARM/special-reg-v8m-main.ll
new file mode 100644
index 000000000000..cde296c6b218
--- /dev/null
+++ b/test/CodeGen/ARM/special-reg-v8m-main.ll
@@ -0,0 +1,214 @@
+; RUN: not llc < %s -mtriple=thumbv8m.base-none-eabi 2>&1 | FileCheck %s --check-prefix=BASELINE
+; RUN: llc < %s -mtriple=thumbv8m.main-none-eabi -mattr=+dsp 2>&1 | FileCheck %s --check-prefix=MAINLINE
+
+; BASELINE: LLVM ERROR: Invalid register name "basepri_max_ns".
+
+define i32 @read_mclass_registers() nounwind {
+entry:
+ ; MAINLINE-LABEL: read_mclass_registers:
+ ; MAINLINE: mrs r0, apsr
+ ; MAINLINE: mrs r1, iapsr
+ ; MAINLINE: mrs r1, eapsr
+ ; MAINLINE: mrs r1, xpsr
+ ; MAINLINE: mrs r1, ipsr
+ ; MAINLINE: mrs r1, epsr
+ ; MAINLINE: mrs r1, iepsr
+ ; MAINLINE: mrs r1, msp
+ ; MAINLINE: mrs r1, psp
+ ; MAINLINE: mrs r1, primask
+ ; MAINLINE: mrs r1, basepri
+ ; MAINLINE: mrs r1, basepri_max
+ ; MAINLINE: mrs r1, faultmask
+ ; MAINLINE: mrs r1, control
+ ; MAINLINE: mrs r1, msplim
+ ; MAINLINE: mrs r1, psplim
+ ; MAINLINE: mrs r1, msp_ns
+ ; MAINLINE: mrs r1, psp_ns
+ ; MAINLINE: mrs r1, msplim_ns
+ ; MAINLINE: mrs r1, psplim_ns
+ ; MAINLINE: mrs r1, primask_ns
+ ; MAINLINE: mrs r1, basepri_ns
+ ; MAINLINE: mrs r1, faultmask_ns
+ ; MAINLINE: mrs r1, control_ns
+ ; MAINLINE: mrs r1, sp_ns
+ ; MAINLINE: mrs r1, basepri_max_ns
+
+ %0 = call i32 @llvm.read_register.i32(metadata !0)
+ %1 = call i32 @llvm.read_register.i32(metadata !4)
+ %add1 = add i32 %1, %0
+ %2 = call i32 @llvm.read_register.i32(metadata !8)
+ %add2 = add i32 %add1, %2
+ %3 = call i32 @llvm.read_register.i32(metadata !12)
+ %add3 = add i32 %add2, %3
+ %4 = call i32 @llvm.read_register.i32(metadata !16)
+ %add4 = add i32 %add3, %4
+ %5 = call i32 @llvm.read_register.i32(metadata !17)
+ %add5 = add i32 %add4, %5
+ %6 = call i32 @llvm.read_register.i32(metadata !18)
+ %add6 = add i32 %add5, %6
+ %7 = call i32 @llvm.read_register.i32(metadata !19)
+ %add7 = add i32 %add6, %7
+ %8 = call i32 @llvm.read_register.i32(metadata !20)
+ %add8 = add i32 %add7, %8
+ %9 = call i32 @llvm.read_register.i32(metadata !21)
+ %add9 = add i32 %add8, %9
+ %10 = call i32 @llvm.read_register.i32(metadata !22)
+ %add10 = add i32 %add9, %10
+ %11 = call i32 @llvm.read_register.i32(metadata !23)
+ %add11 = add i32 %add10, %11
+ %12 = call i32 @llvm.read_register.i32(metadata !24)
+ %add12 = add i32 %add11, %12
+ %13 = call i32 @llvm.read_register.i32(metadata !25)
+ %add13 = add i32 %add12, %13
+ %14 = call i32 @llvm.read_register.i32(metadata !26)
+ %add14 = add i32 %add13, %14
+ %15 = call i32 @llvm.read_register.i32(metadata !27)
+ %add15 = add i32 %add14, %15
+ %16 = call i32 @llvm.read_register.i32(metadata !28)
+ %add16 = add i32 %add15, %16
+ %17 = call i32 @llvm.read_register.i32(metadata !29)
+ %add17 = add i32 %add16, %17
+ %18 = call i32 @llvm.read_register.i32(metadata !30)
+ %add18 = add i32 %add17, %18
+ %19 = call i32 @llvm.read_register.i32(metadata !31)
+ %add19 = add i32 %add18, %19
+ %20 = call i32 @llvm.read_register.i32(metadata !32)
+ %add20 = add i32 %add19, %20
+ %21 = call i32 @llvm.read_register.i32(metadata !33)
+ %add21 = add i32 %add20, %21
+ %22 = call i32 @llvm.read_register.i32(metadata !34)
+ %add22 = add i32 %add21, %22
+ %23 = call i32 @llvm.read_register.i32(metadata !35)
+ %add23 = add i32 %add22, %23
+ %24 = call i32 @llvm.read_register.i32(metadata !36)
+ %add24 = add i32 %add23, %24
+ %25 = call i32 @llvm.read_register.i32(metadata !37)
+ %add25 = add i32 %add24, %25
+ ret i32 %add25
+}
+
+define void @write_mclass_registers(i32 %x) nounwind {
+entry:
+ ; MAINLINE-LABEL: write_mclass_registers:
+ ; MAINLINE: msr apsr_nzcvqg, r0
+ ; MAINLINE: msr apsr_nzcvq, r0
+ ; MAINLINE: msr apsr_g, r0
+ ; MAINLINE: msr apsr_nzcvqg, r0
+ ; MAINLINE: msr iapsr_nzcvqg, r0
+ ; MAINLINE: msr iapsr_nzcvq, r0
+ ; MAINLINE: msr iapsr_g, r0
+ ; MAINLINE: msr iapsr_nzcvqg, r0
+ ; MAINLINE: msr eapsr_nzcvqg, r0
+ ; MAINLINE: msr eapsr_nzcvq, r0
+ ; MAINLINE: msr eapsr_g, r0
+ ; MAINLINE: msr eapsr_nzcvqg, r0
+ ; MAINLINE: msr xpsr_nzcvqg, r0
+ ; MAINLINE: msr xpsr_nzcvq, r0
+ ; MAINLINE: msr xpsr_g, r0
+ ; MAINLINE: msr xpsr_nzcvqg, r0
+ ; MAINLINE: msr ipsr, r0
+ ; MAINLINE: msr epsr, r0
+ ; MAINLINE: msr iepsr, r0
+ ; MAINLINE: msr msp, r0
+ ; MAINLINE: msr psp, r0
+ ; MAINLINE: msr primask, r0
+ ; MAINLINE: msr basepri, r0
+ ; MAINLINE: msr basepri_max, r0
+ ; MAINLINE: msr faultmask, r0
+ ; MAINLINE: msr control, r0
+ ; MAINLINE: msr msplim, r0
+ ; MAINLINE: msr psplim, r0
+ ; MAINLINE: msr msp_ns, r0
+ ; MAINLINE: msr psp_ns, r0
+ ; MAINLINE: msr msplim_ns, r0
+ ; MAINLINE: msr psplim_ns, r0
+ ; MAINLINE: msr primask_ns, r0
+ ; MAINLINE: msr basepri_ns, r0
+ ; MAINLINE: msr faultmask_ns, r0
+ ; MAINLINE: msr control_ns, r0
+ ; MAINLINE: msr sp_ns, r0
+ ; MAINLINE: msr basepri_max_ns, r0
+
+ call void @llvm.write_register.i32(metadata !0, i32 %x)
+ call void @llvm.write_register.i32(metadata !1, i32 %x)
+ call void @llvm.write_register.i32(metadata !2, i32 %x)
+ call void @llvm.write_register.i32(metadata !3, i32 %x)
+ call void @llvm.write_register.i32(metadata !4, i32 %x)
+ call void @llvm.write_register.i32(metadata !5, i32 %x)
+ call void @llvm.write_register.i32(metadata !6, i32 %x)
+ call void @llvm.write_register.i32(metadata !7, i32 %x)
+ call void @llvm.write_register.i32(metadata !8, i32 %x)
+ call void @llvm.write_register.i32(metadata !9, i32 %x)
+ call void @llvm.write_register.i32(metadata !10, i32 %x)
+ call void @llvm.write_register.i32(metadata !11, i32 %x)
+ call void @llvm.write_register.i32(metadata !12, i32 %x)
+ call void @llvm.write_register.i32(metadata !13, i32 %x)
+ call void @llvm.write_register.i32(metadata !14, i32 %x)
+ call void @llvm.write_register.i32(metadata !15, i32 %x)
+ call void @llvm.write_register.i32(metadata !16, i32 %x)
+ call void @llvm.write_register.i32(metadata !17, i32 %x)
+ call void @llvm.write_register.i32(metadata !18, i32 %x)
+ call void @llvm.write_register.i32(metadata !19, i32 %x)
+ call void @llvm.write_register.i32(metadata !20, i32 %x)
+ call void @llvm.write_register.i32(metadata !21, i32 %x)
+ call void @llvm.write_register.i32(metadata !22, i32 %x)
+ call void @llvm.write_register.i32(metadata !23, i32 %x)
+ call void @llvm.write_register.i32(metadata !24, i32 %x)
+ call void @llvm.write_register.i32(metadata !25, i32 %x)
+ call void @llvm.write_register.i32(metadata !26, i32 %x)
+ call void @llvm.write_register.i32(metadata !27, i32 %x)
+ call void @llvm.write_register.i32(metadata !28, i32 %x)
+ call void @llvm.write_register.i32(metadata !29, i32 %x)
+ call void @llvm.write_register.i32(metadata !30, i32 %x)
+ call void @llvm.write_register.i32(metadata !31, i32 %x)
+ call void @llvm.write_register.i32(metadata !32, i32 %x)
+ call void @llvm.write_register.i32(metadata !33, i32 %x)
+ call void @llvm.write_register.i32(metadata !34, i32 %x)
+ call void @llvm.write_register.i32(metadata !35, i32 %x)
+ call void @llvm.write_register.i32(metadata !36, i32 %x)
+ call void @llvm.write_register.i32(metadata !37, i32 %x)
+ ret void
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+declare void @llvm.write_register.i32(metadata, i32) nounwind
+
+!0 = !{!"apsr"}
+!1 = !{!"apsr_nzcvq"}
+!2 = !{!"apsr_g"}
+!3 = !{!"apsr_nzcvqg"}
+!4 = !{!"iapsr"}
+!5 = !{!"iapsr_nzcvq"}
+!6 = !{!"iapsr_g"}
+!7 = !{!"iapsr_nzcvqg"}
+!8 = !{!"eapsr"}
+!9 = !{!"eapsr_nzcvq"}
+!10 = !{!"eapsr_g"}
+!11 = !{!"eapsr_nzcvqg"}
+!12 = !{!"xpsr"}
+!13 = !{!"xpsr_nzcvq"}
+!14 = !{!"xpsr_g"}
+!15 = !{!"xpsr_nzcvqg"}
+!16 = !{!"ipsr"}
+!17 = !{!"epsr"}
+!18 = !{!"iepsr"}
+!19 = !{!"msp"}
+!20 = !{!"psp"}
+!21 = !{!"primask"}
+!22 = !{!"basepri"}
+!23 = !{!"basepri_max"}
+!24 = !{!"faultmask"}
+!25 = !{!"control"}
+!26 = !{!"msplim"}
+!27 = !{!"psplim"}
+!28 = !{!"msp_ns"}
+!29 = !{!"psp_ns"}
+!30 = !{!"msplim_ns"}
+!31 = !{!"psplim_ns"}
+!32 = !{!"primask_ns"}
+!33 = !{!"basepri_ns"}
+!34 = !{!"faultmask_ns"}
+!35 = !{!"control_ns"}
+!36 = !{!"sp_ns"}
+!37 = !{!"basepri_max_ns"}
+
diff --git a/test/CodeGen/ARM/ssat.ll b/test/CodeGen/ARM/ssat.ll
new file mode 100644
index 000000000000..2b75bc410aa8
--- /dev/null
+++ b/test/CodeGen/ARM/ssat.ll
@@ -0,0 +1,215 @@
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+
+; Check for several conditions that should result in SSAT.
+; For example, the base test is equivalent to
+; x < -k ? -k : (x > k-1 ? k-1 : x) in C. All patterns that bound x
+; to the interval [-k, k-1], where k is a power of 2, can be
+; transformed into SSAT. At the end there are some tests
+; checking that conditionals are not transformed if they don't
+; match the right pattern.
+
+;
+; Base tests with different bit widths
+;
+
+; x < -k ? -k : (x > k ? k : x)
+; 32-bit base test
+define i32 @sat_base_32bit(i32 %x) #0 {
+; CHECK-LABEL: sat_base_32bit:
+; CHECK: ssat r0, #24, r0
+entry:
+ %cmpLow = icmp slt i32 %x, -8388608
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x
+ %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %saturateUp
+ ret i32 %saturateLow
+}
+
+; x < -k ? -k : (x > k ? k : x)
+; 16-bit base test
+define i16 @sat_base_16bit(i16 %x) #0 {
+; CHECK-LABEL: sat_base_16bit:
+; CHECK: ssat r0, #12, r0
+entry:
+ %cmpLow = icmp slt i16 %x, -2048
+ %cmpUp = icmp sgt i16 %x, 2047
+ %saturateUp = select i1 %cmpUp, i16 2047, i16 %x
+ %saturateLow = select i1 %cmpLow, i16 -2048, i16 %saturateUp
+ ret i16 %saturateLow
+}
+
+; x < -k ? -k : (x > k ? k : x)
+; 8-bit base test
+define i8 @sat_base_8bit(i8 %x) #0 {
+; CHECK-LABEL: sat_base_8bit:
+; CHECK: ssat r0, #6, r0
+entry:
+ %cmpLow = icmp slt i8 %x, -32
+ %cmpUp = icmp sgt i8 %x, 31
+ %saturateUp = select i1 %cmpUp, i8 31, i8 %x
+ %saturateLow = select i1 %cmpLow, i8 -32, i8 %saturateUp
+ ret i8 %saturateLow
+}
+
+;
+; Tests where the conditionals that check for upper and lower bounds,
+; or the < and > operators, are arranged in different ways. Only some
+; of the possible combinations that lead to SSAT are tested.
+;
+
+; x < -k ? -k : (x < k ? x : k)
+define i32 @sat_lower_upper_1(i32 %x) #0 {
+; CHECK-LABEL: sat_lower_upper_1:
+; CHECK: ssat r0, #24, r0
+entry:
+ %cmpLow = icmp slt i32 %x, -8388608
+ %cmpUp = icmp slt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 %x, i32 8388607
+ %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %saturateUp
+ ret i32 %saturateLow
+}
+
+; x > -k ? (x > k ? k : x) : -k
+define i32 @sat_lower_upper_2(i32 %x) #0 {
+; CHECK-LABEL: sat_lower_upper_2:
+; CHECK: ssat r0, #24, r0
+entry:
+ %cmpLow = icmp sgt i32 %x, -8388608
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x
+ %saturateLow = select i1 %cmpLow, i32 %saturateUp, i32 -8388608
+ ret i32 %saturateLow
+}
+
+; x < k ? (x < -k ? -k : x) : k
+define i32 @sat_upper_lower_1(i32 %x) #0 {
+; CHECK-LABEL: sat_upper_lower_1:
+; CHECK: ssat r0, #24, r0
+entry:
+ %cmpUp = icmp slt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, -8388608
+ %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 %saturateLow, i32 8388607
+ ret i32 %saturateUp
+}
+
+; x > k ? k : (x < -k ? -k : x)
+define i32 @sat_upper_lower_2(i32 %x) #0 {
+; CHECK-LABEL: sat_upper_lower_2:
+; CHECK: ssat r0, #24, r0
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, -8388608
+ %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; k < x ? k : (x > -k ? x : -k)
+define i32 @sat_upper_lower_3(i32 %x) #0 {
+; CHECK-LABEL: sat_upper_lower_3:
+; CHECK: ssat r0, #24, r0
+entry:
+ %cmpUp = icmp slt i32 8388607, %x
+ %cmpLow = icmp sgt i32 %x, -8388608
+ %saturateLow = select i1 %cmpLow, i32 %x, i32 -8388608
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+;
+; Miscellanea
+;
+
+; Check that >= and <= work the same as > and <
+; k <= x ? k : (x >= -k ? x : -k)
+define i32 @sat_le_ge(i32 %x) #0 {
+; CHECK-LABEL: sat_le_ge:
+; CHECK: ssat r0, #24, r0
+entry:
+ %cmpUp = icmp sle i32 8388607, %x
+ %cmpLow = icmp sge i32 %x, -8388608
+ %saturateLow = select i1 %cmpLow, i32 %x, i32 -8388608
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+;
+; The following tests check for patterns that should not transform
+; into SSAT but are similar enough that they could confuse the selector.
+;
+
+; x > k ? k : (x > -k ? -k : x)
+; First condition upper-saturates, second doesn't lower-saturate.
+define i32 @no_sat_missing_lower(i32 %x) #0 {
+; CHECK-LABEL: no_sat_missing_lower:
+; CHECK-NOT: ssat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp sgt i32 %x, -8388608
+ %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; x < k ? k : (x < -k ? -k : x)
+; Second condition lower-saturates, first doesn't upper-saturate.
+define i32 @no_sat_missing_upper(i32 %x) #0 {
+; CHECK-LABEL: no_sat_missing_upper:
+; CHECK-NOT: ssat
+entry:
+ %cmpUp = icmp slt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, -8388608
+ %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; Lower constant is different in the select and in the compare
+define i32 @no_sat_incorrect_constant(i32 %x) #0 {
+; CHECK-LABEL: no_sat_incorrect_constant:
+; CHECK-NOT: ssat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, -8388608
+ %saturateLow = select i1 %cmpLow, i32 -8388607, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; The interval is not [k, ~k]
+define i32 @no_sat_incorrect_interval(i32 %x) #0 {
+; CHECK-LABEL: no_sat_incorrect_interval:
+; CHECK-NOT: ssat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, -19088744
+ %saturateLow = select i1 %cmpLow, i32 -19088744, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; The returned value (y) is not the same as the tested value (x).
+define i32 @no_sat_incorrect_return(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: no_sat_incorrect_return:
+; CHECK-NOT: ssat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %x, -8388608
+ %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %y
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
+
+; One of the values in a compare (y) is not the same as the rest
+; of the compare and select values (x).
+define i32 @no_sat_incorrect_compare(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: no_sat_incorrect_compare:
+; CHECK-NOT: ssat
+entry:
+ %cmpUp = icmp sgt i32 %x, 8388607
+ %cmpLow = icmp slt i32 %y, -8388608
+ %saturateLow = select i1 %cmpLow, i32 -8388608, i32 %x
+ %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+ ret i32 %saturateUp
+}
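Note: in these checks the immediate in "ssat r0, #n, r0" is the saturation bit-width, so #24 clamps to the 24-bit signed range [-8388608, 8388607] and #12 to [-2048, 2047]. A condensed sketch of the canonical clamp that all the positive tests reduce to, using the 32-bit base test's constants; the function name is illustrative:

define i32 @clamp_to_24_bits(i32 %x) {
entry:
  ; Clamp %x to [-2^23, 2^23 - 1]; either ordering of the two selects is
  ; recognized and folded into a single ssat.
  %too_small = icmp slt i32 %x, -8388608
  %too_big   = icmp sgt i32 %x, 8388607
  %hi        = select i1 %too_big, i32 8388607, i32 %x
  %clamped   = select i1 %too_small, i32 -8388608, i32 %hi
  ret i32 %clamped
}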
diff --git a/test/CodeGen/ARM/static-addr-hoisting.ll b/test/CodeGen/ARM/static-addr-hoisting.ll
new file mode 100644
index 000000000000..3d47e02f965e
--- /dev/null
+++ b/test/CodeGen/ARM/static-addr-hoisting.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=thumbv7-apple-ios %s -o - | FileCheck %s
+
+define void @multiple_store() {
+; CHECK-LABEL: multiple_store:
+; CHECK: movw r[[BASE1:[0-9]+]], #16960
+; CHECK: movs [[VAL:r[0-9]+]], #42
+; CHECK: movt r[[BASE1]], #15
+
+; CHECK: str [[VAL]], [r[[BASE1]]]
+; CHECK: str [[VAL]], [r[[BASE1]], #24]
+; CHECK: str.w [[VAL]], [r[[BASE1]], #42]
+
+; CHECK: movw r[[BASE2:[0-9]+]], #20394
+; CHECK: movt r[[BASE2]], #18
+
+; CHECK: str [[VAL]], [r[[BASE2]]]
+ store i32 42, i32* inttoptr(i32 1000000 to i32*)
+ store i32 42, i32* inttoptr(i32 1000024 to i32*)
+ store i32 42, i32* inttoptr(i32 1000042 to i32*)
+ store i32 42, i32* inttoptr(i32 1200042 to i32*)
+ ret void
+}
diff --git a/test/CodeGen/ARM/stc2.ll b/test/CodeGen/ARM/stc2.ll
new file mode 100644
index 000000000000..1127796387bb
--- /dev/null
+++ b/test/CodeGen/ARM/stc2.ll
@@ -0,0 +1,11 @@
+; RUN: not llc < %s -mtriple=armv8-eabi 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=thumbv8-eabi 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.arm.stc2
+define void @stc2(i8* %i) nounwind {
+entry:
+ call void @llvm.arm.stc2(i32 1, i32 2, i8* %i) nounwind
+ ret void
+}
+
+declare void @llvm.arm.stc2(i32, i32, i8*) nounwind
diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll
index d7b9b477ec1e..6c8f6fa0b39c 100644
--- a/test/CodeGen/ARM/struct_byval.ll
+++ b/test/CodeGen/ARM/struct_byval.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=armv7-apple-ios6.0 | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-apple-ios6.0 | FileCheck %s -check-prefix=THUMB
+; RUN: llc < %s -mtriple=thumbv7-apple-ios6.0 | FileCheck %s
; RUN: llc < %s -mtriple=armv7-unknown-nacl-gnueabi | FileCheck %s -check-prefix=NACL
; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi | FileCheck %s -check-prefix=NOMOVT
@@ -15,10 +15,6 @@ entry:
; CHECK: ldr
; CHECK: str
; CHECK-NOT:bne
-; THUMB-LABEL: f:
-; THUMB: ldr
-; THUMB: str
-; THUMB-NOT:bne
%st = alloca %struct.SmallStruct, align 4
%call = call i32 @e1(%struct.SmallStruct* byval %st)
ret i32 0
@@ -32,11 +28,6 @@ entry:
; CHECK: sub
; CHECK: str
; CHECK: bne
-; THUMB-LABEL: g:
-; THUMB: ldr
-; THUMB: sub
-; THUMB: str
-; THUMB: bne
; NACL-LABEL: g:
; Ensure that we use movw instead of a constant pool for the loop trip count, but don't
; match the __stack_chk_guard movw
@@ -58,11 +49,6 @@ entry:
; CHECK: sub
; CHECK: vst1
; CHECK: bne
-; THUMB-LABEL: h:
-; THUMB: vld1
-; THUMB: sub
-; THUMB: vst1
-; THUMB: bne
; NACL: movw r{{[1-9]}}, #
; NACL: vld1
; NACL: sub
@@ -83,8 +69,6 @@ declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind
define void @f3(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
; CHECK-LABEL: f3
; CHECK: bl _consumestruct
-; THUMB-LABEL: f3
-; THUMB: blx _consumestruct
entry:
%0 = bitcast %struct.SmallStruct* %s to i8*
tail call void @consumestruct(i8* %0, i32 80) optsize
@@ -94,8 +78,6 @@ entry:
define void @f4(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
; CHECK-LABEL: f4
; CHECK: bl _consumestruct
-; THUMB-LABEL: f4
-; THUMB: blx _consumestruct
entry:
%addr = getelementptr inbounds %struct.SmallStruct, %struct.SmallStruct* %s, i32 0, i32 0
%0 = bitcast i32* %addr to i8*
@@ -106,9 +88,7 @@ entry:
; We can do tail call here since s is in the incoming argument area.
define void @f5(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
; CHECK-LABEL: f5
-; CHECK: b _consumestruct
-; THUMB-LABEL: f5
-; THUMB: b.w _consumestruct
+; CHECK: b{{(\.w)?}} _consumestruct
entry:
%0 = bitcast %struct.SmallStruct* %s to i8*
tail call void @consumestruct(i8* %0, i32 80) optsize
@@ -117,9 +97,7 @@ entry:
define void @f6(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
; CHECK-LABEL: f6
-; CHECK: b _consumestruct
-; THUMB-LABEL: f6
-; THUMB: b.w _consumestruct
+; CHECK: b{{(\.w)?}} _consumestruct
entry:
%addr = getelementptr inbounds %struct.SmallStruct, %struct.SmallStruct* %s, i32 0, i32 0
%0 = bitcast i32* %addr to i8*
@@ -137,9 +115,6 @@ define void @test_I_16() {
; CHECK-LABEL: test_I_16
; CHECK: ldrb
; CHECK: strb
-; THUMB-LABEL: test_I_16
-; THUMB: ldrb
-; THUMB: strb
entry:
call void @use_I(%struct.I.8* byval align 16 undef)
ret void
diff --git a/test/CodeGen/ARM/swift-ios.ll b/test/CodeGen/ARM/swift-ios.ll
new file mode 100644
index 000000000000..a5d09524e11d
--- /dev/null
+++ b/test/CodeGen/ARM/swift-ios.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=armv7-apple-ios < %s | FileCheck %s
+
+define swiftcc float @t1(float %a, float %b) {
+entry:
+; CHECK: t1
+; CHECK-NOT: vmov
+; CHECK: vadd.f32
+ %add = fadd float %a, %b
+ ret float %add
+}
+
+define swiftcc double @t2(double %a, double %b) {
+entry:
+; CHECK: t2
+; CHECK-NOT: vmov
+; CHECK: vadd.f64
+ %add = fadd double %a, %b
+ ret double %add
+}
+
+define swiftcc double @t9(double %d0, double %d1, double %d2, double %d3,
+ double %d4, double %d5, double %d6, double %d7, float %a, float %b) {
+entry:
+; CHECK-LABEL: t9:
+; CHECK-NOT: vmov
+; CHECK: vldr
+ %add = fadd float %a, %b
+ %conv = fpext float %add to double
+ ret double %conv
+}
+
+define swiftcc double @t10(double %d0, double %d1, double %d2, double %d3,
+ double %d4, double %d5, double %a, float %b, double %c) {
+entry:
+; CHECK-LABEL: t10:
+; CHECK-NOT: vmov
+; CHECK: vldr
+ %add = fadd double %a, %c
+ ret double %add
+}
+
+define swiftcc float @t11(double %d0, double %d1, double %d2, double %d3,
+ double %d4, double %d5, double %d6, float %a, double %b, float %c) {
+entry:
+; CHECK-LABEL: t11:
+; CHECK: vldr
+ %add = fadd float %a, %c
+ ret float %add
+}
+
+define swiftcc double @t12(double %a, double %b) {
+entry:
+; CHECK-LABEL: t12:
+; CHECK: vstr
+ %add = fadd double %a, %b
+ %sub = fsub double %a, %b
+ %call = tail call swiftcc double @x(double 0.000000e+00, double 0.000000e+00,
+ double 0.000000e+00, double 0.000000e+00, double 0.000000e+00,
+ double 0.000000e+00, double %add, float 0.000000e+00,
+ double %sub)
+ ret double %call
+}
+
+declare swiftcc double @x(double, double, double, double, double, double,
+ double, float, double)
+
+attributes #0 = { readnone }
+attributes #1 = { readonly }
diff --git a/test/CodeGen/ARM/swift-return.ll b/test/CodeGen/ARM/swift-return.ll
new file mode 100644
index 000000000000..4a5ef5e382ca
--- /dev/null
+++ b/test/CodeGen/ARM/swift-return.ll
@@ -0,0 +1,133 @@
+; RUN: llc -mtriple=armv7k-apple-ios8.0 -mcpu=cortex-a7 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=armv7k-apple-ios8.0 -mcpu=cortex-a7 -verify-machineinstrs < %s -O0 | FileCheck --check-prefix=CHECK-O0 %s
+
+; RUN: llc -mtriple=armv7-apple-ios -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -verify-machineinstrs < %s -O0 | FileCheck --check-prefix=CHECK-O0 %s
+
+; Test how LLVM handles a return type of {i16, i8}. The return value will be
+; passed in %r0 and %r1.
+; CHECK-LABEL: test:
+; CHECK: bl {{.*}}gen
+; CHECK: sxth {{.*}}, r0
+; CHECK: sxtab r0, {{.*}}, r1
+; CHECK-O0-LABEL: test:
+; CHECK-O0: bl {{.*}}gen
+; CHECK-O0: sxth r0, r0
+; CHECK-O0: sxtb r1, r1
+; CHECK-O0: add r0, r0, r1
+define i16 @test(i32 %key) {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i16, i8 } @gen(i32 %0)
+ %v3 = extractvalue { i16, i8 } %call, 0
+ %v1 = sext i16 %v3 to i32
+ %v5 = extractvalue { i16, i8 } %call, 1
+ %v2 = sext i8 %v5 to i32
+ %add = add nsw i32 %v1, %v2
+ %conv = trunc i32 %add to i16
+ ret i16 %conv
+}
+
+declare swiftcc { i16, i8 } @gen(i32)
+
+; We can't pass every return value in registers; instead, pass everything in
+; memory.
+; The caller provides space for the return value and passes the address in %r0.
+; The first input argument will be in %r1.
+; CHECK-LABEL: test2:
+; CHECK: mov r1, r0
+; CHECK: mov r0, sp
+; CHECK: bl {{.*}}gen2
+; CHECK-DAG: add
+; CHECK-DAG: ldr {{.*}}, [sp, #16]
+; CHECK-DAG: add
+; CHECK-DAG: add
+; CHECK-DAG: add
+; CHECK-O0-LABEL: test2:
+; CHECK-O0: str r0
+; CHECK-O0: mov r0, sp
+; CHECK-O0: bl {{.*}}gen2
+; CHECK-O0-DAG: ldr {{.*}}, [sp]
+; CHECK-O0-DAG: ldr {{.*}}, [sp, #4]
+; CHECK-O0-DAG: ldr {{.*}}, [sp, #8]
+; CHECK-O0-DAG: ldr {{.*}}, [sp, #12]
+; CHECK-O0-DAG: ldr {{.*}}, [sp, #16]
+; CHECK-O0-DAG: add
+; CHECK-O0-DAG: add
+; CHECK-O0-DAG: add
+; CHECK-O0-DAG: add
+define i32 @test2(i32 %key) #0 {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i32, i32, i32, i32, i32 } @gen2(i32 %0)
+
+ %v3 = extractvalue { i32, i32, i32, i32, i32 } %call, 0
+ %v5 = extractvalue { i32, i32, i32, i32, i32 } %call, 1
+ %v6 = extractvalue { i32, i32, i32, i32, i32 } %call, 2
+ %v7 = extractvalue { i32, i32, i32, i32, i32 } %call, 3
+ %v8 = extractvalue { i32, i32, i32, i32, i32 } %call, 4
+
+ %add = add nsw i32 %v3, %v5
+ %add1 = add nsw i32 %add, %v6
+ %add2 = add nsw i32 %add1, %v7
+ %add3 = add nsw i32 %add2, %v8
+ ret i32 %add3
+}
+
+; The address of the return value is passed in %r0.
+; CHECK-LABEL: gen2:
+; CHECK-DAG: str r1, [r0]
+; CHECK-DAG: str r1, [r0, #4]
+; CHECK-DAG: str r1, [r0, #8]
+; CHECK-DAG: str r1, [r0, #12]
+; CHECK-DAG: str r1, [r0, #16]
+; CHECK-O0-LABEL: gen2:
+; CHECK-O0-DAG: str r1, [r0]
+; CHECK-O0-DAG: str r1, [r0, #4]
+; CHECK-O0-DAG: str r1, [r0, #8]
+; CHECK-O0-DAG: str r1, [r0, #12]
+; CHECK-O0-DAG: str r1, [r0, #16]
+define swiftcc { i32, i32, i32, i32, i32 } @gen2(i32 %key) {
+ %Y = insertvalue { i32, i32, i32, i32, i32 } undef, i32 %key, 0
+ %Z = insertvalue { i32, i32, i32, i32, i32 } %Y, i32 %key, 1
+ %Z2 = insertvalue { i32, i32, i32, i32, i32 } %Z, i32 %key, 2
+ %Z3 = insertvalue { i32, i32, i32, i32, i32 } %Z2, i32 %key, 3
+ %Z4 = insertvalue { i32, i32, i32, i32, i32 } %Z3, i32 %key, 4
+ ret { i32, i32, i32, i32, i32 } %Z4
+}
+
+; The return value {i32, i32, i32, i32} will be returned via registers %r0, %r1,
+; %r2, %r3.
+; CHECK-LABEL: test3:
+; CHECK: bl {{.*}}gen3
+; CHECK: add r0, r0, r1
+; CHECK: add r0, r0, r2
+; CHECK: add r0, r0, r3
+; CHECK-O0-LABEL: test3:
+; CHECK-O0: bl {{.*}}gen3
+; CHECK-O0: add r0, r0, r1
+; CHECK-O0: add r0, r0, r2
+; CHECK-O0: add r0, r0, r3
+define i32 @test3(i32 %key) #0 {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i32, i32, i32, i32 } @gen3(i32 %0)
+
+ %v3 = extractvalue { i32, i32, i32, i32 } %call, 0
+ %v5 = extractvalue { i32, i32, i32, i32 } %call, 1
+ %v6 = extractvalue { i32, i32, i32, i32 } %call, 2
+ %v7 = extractvalue { i32, i32, i32, i32 } %call, 3
+
+ %add = add nsw i32 %v3, %v5
+ %add1 = add nsw i32 %add, %v6
+ %add2 = add nsw i32 %add1, %v7
+ ret i32 %add2
+}
+
+declare swiftcc { i32, i32, i32, i32 } @gen3(i32 %key)
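Note: the comments in swift-return.ll spell out the swiftcc return convention these checks rely on: small aggregates come back in r0-r3 ({i16, i8} in r0/r1, four i32s in r0-r3), while anything larger is returned indirectly, with the caller passing the result address in r0 and the first real argument shifting to r1. A condensed caller-side sketch of the register-returned case, mirroring test3; the callee name is illustrative:

declare swiftcc { i32, i32, i32, i32 } @gen4(i32)

define i32 @sum_of_gen4(i32 %key) {
entry:
  ; Each field of the returned struct arrives in its own register (r0-r3),
  ; so the sum below becomes three adds over the call's result registers.
  %call = call swiftcc { i32, i32, i32, i32 } @gen4(i32 %key)
  %f0 = extractvalue { i32, i32, i32, i32 } %call, 0
  %f1 = extractvalue { i32, i32, i32, i32 } %call, 1
  %f2 = extractvalue { i32, i32, i32, i32 } %call, 2
  %f3 = extractvalue { i32, i32, i32, i32 } %call, 3
  %s0 = add i32 %f0, %f1
  %s1 = add i32 %s0, %f2
  %s2 = add i32 %s1, %f3
  ret i32 %s2
}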
diff --git a/test/CodeGen/ARM/swift-vldm.ll b/test/CodeGen/ARM/swift-vldm.ll
index 9e507279fa09..a53b2413bde1 100644
--- a/test/CodeGen/ARM/swift-vldm.ll
+++ b/test/CodeGen/ARM/swift-vldm.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
+; RUN: llc < %s -arm-assume-misaligned-load-store -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
; Check that we avoid producing vldm instructions using d registers that
; begin in the most-significant half of a q register. These require more
diff --git a/test/CodeGen/ARM/swifterror.ll b/test/CodeGen/ARM/swifterror.ll
new file mode 100644
index 000000000000..17bd7059f6d4
--- /dev/null
+++ b/test/CodeGen/ARM/swifterror.ll
@@ -0,0 +1,381 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=armv7-apple-ios | FileCheck --check-prefix=CHECK-APPLE %s
+; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=armv7-apple-ios | FileCheck --check-prefix=CHECK-O0 %s
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+%swift_error = type { i64, i8 }
+%struct.S = type { i32, i32, i32, i32, i32, i32 }
+
+; This tests the basic usage of a swifterror parameter. "foo" is the function
+; that takes a swifterror parameter and "caller" is the caller of "foo".
+define float @foo(%swift_error** swifterror %error_ptr_ref) {
+; CHECK-APPLE-LABEL: foo:
+; CHECK-APPLE: mov r0, #16
+; CHECK-APPLE: malloc
+; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], #1
+; CHECK-APPLE-DAG: mov r6, r{{.*}}
+; CHECK-APPLE-DAG: strb [[ID]], [r{{.*}}, #8]
+
+; CHECK-O0-LABEL: foo:
+; CHECK-O0: mov r{{.*}}, #16
+; CHECK-O0: malloc
+; CHECK-O0: mov [[ID2:r[0-9]+]], r0
+; CHECK-O0: mov [[ID:r[0-9]+]], #1
+; CHECK-O0: strb [[ID]], [r0, #8]
+; CHECK-O0: mov r6, [[ID2]]
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+}
+
+; "caller" calls "foo" that takes a swifterror parameter.
+define float @caller(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller:
+; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], r0
+; CHECK-APPLE-DAG: mov r6, #0
+; CHECK-APPLE: bl {{.*}}foo
+; CHECK-APPLE: cmp r6, #0
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8]
+; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]]
+; CHECK-APPLE: mov r0, r6
+; CHECK-APPLE: bl {{.*}}free
+
+; CHECK-O0-LABEL: caller:
+; spill r0
+; CHECK-O0-DAG: str r0,
+; CHECK-O0-DAG: mov r6, #0
+; CHECK-O0: bl {{.*}}foo
+; CHECK-O0: mov r{{.*}}, r6
+; CHECK-O0: bne
+; CHECK-O0: ldrb [[CODE:r[0-9]+]], [r0, #8]
+; reload r0
+; CHECK-O0: ldr [[ID:r[0-9]+]],
+; CHECK-O0: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-O0: mov r0,
+; CHECK-O0: free
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "caller2" is the caller of "foo", it calls "foo" inside a loop.
+define float @caller2(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller2:
+; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], r0
+; CHECK-APPLE-DAG: mov r6, #0
+; CHECK-APPLE: bl {{.*}}foo
+; CHECK-APPLE: cmp r6, #0
+; CHECK-APPLE: bne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: ldrb [[CODE:r[0-9]+]], [r6, #8]
+; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-APPLE: mov r0, r6
+; CHECK-APPLE: bl {{.*}}free
+
+; CHECK-O0-LABEL: caller2:
+; spill r0
+; CHECK-O0-DAG: str r0,
+; CHECK-O0-DAG: mov r6, #0
+; CHECK-O0: bl {{.*}}foo
+; CHECK-O0: mov r{{.*}}, r6
+; CHECK-O0: bne
+; CHECK-O0: ble
+; CHECK-O0: ldrb [[CODE:r[0-9]+]], [r0, #8]
+; reload r0
+; CHECK-O0: ldr [[ID:r[0-9]+]],
+; CHECK-O0: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-O0: mov r0,
+; CHECK-O0: free
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ br label %bb_loop
+bb_loop:
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %cmp = fcmp ogt float %call, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "foo_if" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition.
+define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) {
+; CHECK-APPLE-LABEL: foo_if:
+; CHECK-APPLE: cmp r0, #0
+; CHECK-APPLE: eq
+; CHECK-APPLE: mov r0, #16
+; CHECK-APPLE: malloc
+; CHECK-APPLE: mov [[ID:r[0-9]+]], #1
+; CHECK-APPLE-DAG: mov r6, r{{.*}}
+; CHECK-APPLE-DAG: strb [[ID]], [r{{.*}}, #8]
+
+; CHECK-O0-LABEL: foo_if:
+; CHECK-O0: cmp r0, #0
+; spill to stack
+; CHECK-O0: str r6
+; CHECK-O0: beq
+; CHECK-O0: mov r0, #16
+; CHECK-O0: malloc
+; CHECK-O0: mov [[ID:r[0-9]+]], r0
+; CHECK-O0: mov [[ID2:[a-z0-9]+]], #1
+; CHECK-O0: strb [[ID2]], [r0, #8]
+; CHECK-O0: mov r6, [[ID]]
+; reload from stack
+; CHECK-O0: ldr r6
+entry:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %normal
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+
+normal:
+ ret float 0.0
+}
+
+; "foo_loop" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition inside a loop.
+define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float %cc2) {
+; CHECK-APPLE-LABEL: foo_loop:
+; CHECK-APPLE: mov [[CODE:r[0-9]+]], r0
+; swifterror is kept in a register
+; CHECK-APPLE: mov [[ID:r[0-9]+]], r6
+; CHECK-APPLE: cmp [[CODE]], #0
+; CHECK-APPLE: beq
+; CHECK-APPLE: mov r0, #16
+; CHECK-APPLE: malloc
+; CHECK-APPLE: strb r{{.*}}, [{{.*}}[[ID]], #8]
+; CHECK-APPLE: ble
+; CHECK-APPLE: mov r6, [[ID]]
+
+; CHECK-O0-LABEL: foo_loop:
+; CHECK-O0: mov r{{.*}}, r6
+; CHECK-O0: cmp r{{.*}}, #0
+; CHECK-O0: beq
+; CHECK-O0-DAG: movw r{{.*}}, #1
+; CHECK-O0-DAG: mov r{{.*}}, #16
+; CHECK-O0: malloc
+; CHECK-O0-DAG: mov [[ID:r[0-9]+]], r0
+; CHECK-O0-DAG: ldr [[ID2:r[0-9]+]], [sp{{.*}}]
+; CHECK-O0: strb [[ID2]], [{{.*}}[[ID]], #8]
+; spill r0
+; CHECK-O0: str r0, [sp{{.*}}]
+; CHECK-O0: vcmpe
+; CHECK-O0: ble
+; reload from stack
+; CHECK-O0: ldr r6
+entry:
+ br label %bb_loop
+
+bb_loop:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %bb_cont
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ br label %bb_cont
+
+bb_cont:
+ %cmp = fcmp ogt float %cc2, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ ret float 0.0
+}
+
+; "foo_sret" is a function that takes a swifterror parameter, it also has a sret
+; parameter.
+define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swifterror %error_ptr_ref) {
+; CHECK-APPLE-LABEL: foo_sret:
+; CHECK-APPLE: mov [[SRET:r[0-9]+]], r0
+; CHECK-APPLE: mov r0, #16
+; CHECK-APPLE: malloc
+; CHECK-APPLE: mov [[REG:r[0-9]+]], #1
+; CHECK-APPLE-DAG: mov r6, r0
+; CHECK-APPLE-DAG: strb [[REG]], [r0, #8]
+; CHECK-APPLE-DAG: str r{{.*}}, [{{.*}}[[SRET]], #4]
+
+; CHECK-O0-LABEL: foo_sret:
+; CHECK-O0: mov r{{.*}}, #16
+; spill to stack: sret and val1
+; CHECK-O0-DAG: str r0
+; CHECK-O0-DAG: str r1
+; CHECK-O0: malloc
+; CHECK-O0: mov [[ID:r[0-9]+]], #1
+; CHECK-O0: strb [[ID]], [r0, #8]
+; reload from stack: sret and val1
+; CHECK-O0: ldr
+; CHECK-O0: ldr
+; CHECK-O0: str r{{.*}}, [{{.*}}, #4]
+; CHECK-O0: mov r6
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ %v2 = getelementptr inbounds %struct.S, %struct.S* %agg.result, i32 0, i32 1
+ store i32 %val1, i32* %v2
+ ret void
+}
+
+; "caller3" calls "foo_sret" that takes a swifterror parameter.
+define float @caller3(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller3:
+; CHECK-APPLE: mov [[ID:r[0-9]+]], r0
+; CHECK-APPLE: mov r6, #0
+; CHECK-APPLE: bl {{.*}}foo_sret
+; CHECK-APPLE: cmp r6, #0
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8]
+; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]]
+; CHECK-APPLE: mov r0, r6
+; CHECK-APPLE: bl {{.*}}free
+
+; CHECK-O0-LABEL: caller3:
+; CHECK-O0-DAG: mov r6, #0
+; CHECK-O0-DAG: mov r0
+; CHECK-O0-DAG: mov r1
+; CHECK-O0: bl {{.*}}foo_sret
+; CHECK-O0: mov [[ID2:r[0-9]+]], r6
+; CHECK-O0: cmp [[ID2]]
+; CHECK-O0: bne
+; Access part of the error object and save it to error_ref
+; CHECK-O0: ldrb [[CODE:r[0-9]+]]
+; CHECK-O0: ldr [[ID:r[0-9]+]]
+; CHECK-O0: strb [[CODE]], [{{.*}}[[ID]]]
+; CHECK-O0: mov r0,
+; CHECK-O0: bl {{.*}}free
+entry:
+ %s = alloca %struct.S, align 8
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ call void @foo_sret(%struct.S* sret %s, i32 1, %swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "foo_vararg" is a function that takes a swifterror parameter, it also has
+; variable number of arguments.
+declare void @llvm.va_start(i8*) nounwind
+define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) {
+; CHECK-APPLE-LABEL: foo_vararg:
+; CHECK-APPLE: mov r0, #16
+; CHECK-APPLE: malloc
+; CHECK-APPLE: mov [[REG:r[0-9]+]], r0
+; CHECK-APPLE: mov [[ID:r[0-9]+]], #1
+; CHECK-APPLE-DAG: strb [[ID]], [{{.*}}[[REG]], #8]
+; CHECK-APPLE-DAG: mov r6, [[REG]]
+
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+
+ %args = alloca i8*, align 8
+ %a10 = alloca i32, align 4
+ %a11 = alloca i32, align 4
+ %a12 = alloca i32, align 4
+ %v10 = bitcast i8** %args to i8*
+ call void @llvm.va_start(i8* %v10)
+ %v11 = va_arg i8** %args, i32
+ store i32 %v11, i32* %a10, align 4
+ %v12 = va_arg i8** %args, i32
+ store i32 %v12, i32* %a11, align 4
+ %v13 = va_arg i8** %args, i32
+ store i32 %v13, i32* %a12, align 4
+
+ ret float 1.0
+}
+
+; "caller4" calls "foo_vararg" that takes a swifterror parameter.
+define float @caller4(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller4:
+; CHECK-APPLE: mov [[ID:r[0-9]+]], r0
+; CHECK-APPLE: mov r6, #0
+; CHECK-APPLE: bl {{.*}}foo_vararg
+; CHECK-APPLE: cmp r6, #0
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8]
+; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]]
+; CHECK-APPLE: mov r0, r6
+; CHECK-APPLE: bl {{.*}}free
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+
+ %a10 = alloca i32, align 4
+ %a11 = alloca i32, align 4
+ %a12 = alloca i32, align 4
+ store i32 10, i32* %a10, align 4
+ store i32 11, i32* %a11, align 4
+ store i32 12, i32* %a12, align 4
+ %v10 = load i32, i32* %a10, align 4
+ %v11 = load i32, i32* %a11, align 4
+ %v12 = load i32, i32* %a12, align 4
+
+ %call = call float (%swift_error**, ...) @foo_vararg(%swift_error** swifterror %error_ptr_ref, i32 %v10, i32 %v11, i32 %v12)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
diff --git a/test/CodeGen/ARM/swiftself.ll b/test/CodeGen/ARM/swiftself.ll
new file mode 100644
index 000000000000..6826b123472f
--- /dev/null
+++ b/test/CodeGen/ARM/swiftself.ll
@@ -0,0 +1,65 @@
+; RUN: llc -verify-machineinstrs -mtriple=armv7k-apple-ios8.0 -mcpu=cortex-a7 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT --check-prefix=TAILCALL %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=armv7k-apple-ios8.0 -mcpu=cortex-a7 -o - %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mtriple=armv7-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=armv7-apple-ios -o - %s | FileCheck %s
+
+; A parameter with swiftself should be allocated to r10.
+; CHECK-LABEL: swiftself_param:
+; CHECK: mov r0, r10
+define i8 *@swiftself_param(i8* swiftself %addr0) {
+ ret i8 *%addr0
+}
+
+; Check that r10 is used to pass a swiftself argument.
+; CHECK-LABEL: call_swiftself:
+; CHECK: mov r10, r0
+; CHECK: bl {{_?}}swiftself_param
+define i8 *@call_swiftself(i8* %arg) {
+ %res = call i8 *@swiftself_param(i8* swiftself %arg)
+ ret i8 *%res
+}
+
+; r10 should be saved by the callee even if used for swiftself
+; CHECK-LABEL: swiftself_clobber:
+; CHECK: push {r10}
+; ...
+; CHECK: pop {r10}
+define i8 *@swiftself_clobber(i8* swiftself %addr0) {
+ call void asm sideeffect "", "~{r10}"()
+ ret i8 *%addr0
+}
+
+; Demonstrate that we do not need any movs when calling multiple functions
+; with a swiftself argument.
+; CHECK-LABEL: swiftself_passthrough:
+; OPT-NOT: mov{{.*}}r10
+; OPT: bl {{_?}}swiftself_param
+; OPT-NOT: mov{{.*}}r10
+; OPT-NEXT: bl {{_?}}swiftself_param
+define void @swiftself_passthrough(i8* swiftself %addr0) {
+ call i8 *@swiftself_param(i8* swiftself %addr0)
+ call i8 *@swiftself_param(i8* swiftself %addr0)
+ ret void
+}
+
+; We can use a tail call if the callee's swiftself is the same as the caller's.
+; CHECK-LABEL: swiftself_tail:
+; TAILCALL: b {{_?}}swiftself_param
+; TAILCALL-NOT: pop
+define i8* @swiftself_tail(i8* swiftself %addr0) {
+ call void asm sideeffect "", "~{r10}"()
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr0)
+ ret i8* %res
+}
+
+; We cannot use a tail call if the callee's swiftself is not the same as the
+; caller's.
+; CHECK-LABEL: swiftself_notail:
+; CHECK: mov r10, r0
+; CHECK: bl {{_?}}swiftself_param
+; CHECK: pop
+define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind {
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr1)
+ ret i8* %res
+}
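
For reference, the swifterror tests earlier in this diff keep the error value in r6 across calls, while this file checks that swiftself travels in r10. Both parameter attributes can sit on one signature; a minimal illustrative declaration follows (the callee name is made up, and the struct layout simply mirrors how the tests above index the error object: an i64 followed by an i8 code):

%swift_error = type { i64, i8 }

declare float @swift_callee(i8* swiftself, %swift_error** swifterror)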
diff --git a/test/CodeGen/ARM/t2-shrink-ldrpost.ll b/test/CodeGen/ARM/t2-shrink-ldrpost.ll
new file mode 100644
index 000000000000..e26f3848ddcd
--- /dev/null
+++ b/test/CodeGen/ARM/t2-shrink-ldrpost.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7m--linux-gnu"
+
+; CHECK-LABEL: f:
+; CHECK: ldm r{{[0-9]}}!, {r[[x:[0-9]]]}
+; CHECK: add.w r[[x]], r[[x]], #3
+; CHECK: stm r{{[0-9]}}!, {r[[x]]}
+define void @f(i32 %n, i32* nocapture %a, i32* nocapture readonly %b) optsize minsize {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %.lr.ph, %0
+ %i.04 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
+ %.03 = phi i32* [ %2, %.lr.ph ], [ %b, %0 ]
+ %.012 = phi i32* [ %5, %.lr.ph ], [ %a, %0 ]
+ %2 = getelementptr inbounds i32, i32* %.03, i32 1
+ %3 = load i32, i32* %.03, align 4
+ %4 = add nsw i32 %3, 3
+ %5 = getelementptr inbounds i32, i32* %.012, i32 1
+ store i32 %4, i32* %.012, align 4
+ %6 = add nsw i32 %i.04, 1
+ %exitcond = icmp eq i32 %6, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+; CHECK-LABEL: f_nominsize:
+; CHECK-NOT: ldm
+define void @f_nominsize(i32 %n, i32* nocapture %a, i32* nocapture readonly %b) optsize {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %.lr.ph, %0
+ %i.04 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
+ %.03 = phi i32* [ %2, %.lr.ph ], [ %b, %0 ]
+ %.012 = phi i32* [ %5, %.lr.ph ], [ %a, %0 ]
+ %2 = getelementptr inbounds i32, i32* %.03, i32 1
+ %3 = load i32, i32* %.03, align 4
+ %4 = add nsw i32 %3, 3
+ %5 = getelementptr inbounds i32, i32* %.012, i32 1
+ store i32 %4, i32* %.012, align 4
+ %6 = add nsw i32 %i.04, 1
+ %exitcond = icmp eq i32 %6, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
diff --git a/test/CodeGen/ARM/tail-call-builtin.ll b/test/CodeGen/ARM/tail-call-builtin.ll
new file mode 100644
index 000000000000..c829cc52462e
--- /dev/null
+++ b/test/CodeGen/ARM/tail-call-builtin.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -o - | FileCheck %s
+
+define i64 @test_mismatched_call(double %in) {
+; CHECK-LABEL: test_mismatched_call:
+; CHECK: bl floor
+; CHECK: vmov r0, r1, d0
+
+ %val = tail call double @floor(double %in)
+ %res = bitcast double %val to i64
+ ret i64 %res
+}
+
+define double @test_matched_call(double %in) {
+; CHECK-LABEL: test_matched_call:
+; CHECK: b floor
+
+ %val = tail call double @floor(double %in)
+ ret double %val
+}
+
+define void @test_irrelevant_call(double %in) {
+; CHECK-LABEL: test_irrelevant_call:
+; CHECK-NOT: bl floor
+
+ %val = tail call double @floor(double %in)
+ ret void
+}
+
+define arm_aapcscc double @test_callingconv(double %in) {
+; CHECK: test_callingconv:
+; CHECK: bl floor
+
+ %val = tail call double @floor(double %in)
+ ret double %val
+}
+
+declare double @floor(double) nounwind readonly
diff --git a/test/CodeGen/ARM/tail-call-weak.ll b/test/CodeGen/ARM/tail-call-weak.ll
index 466c33d38786..e0117dffecbf 100644
--- a/test/CodeGen/ARM/tail-call-weak.ll
+++ b/test/CodeGen/ARM/tail-call-weak.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple thumbv7-windows-coff -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-COFF
-; RUN: llc -mtriple thumbv7-elf -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-ELF
-; RUN: llc -mtriple thumbv7-macho -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-MACHO
+; RUN: llc -mtriple thumbv7-elf -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-OTHER
+; RUN: llc -mtriple thumbv7-macho -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-OTHER
declare i8* @f()
declare extern_weak i8* @g(i8*)
@@ -14,6 +14,5 @@ define void @test() {
}
; CHECK-COFF: b g
-; CHECK-ELF: bl g
-; CHECK-MACHO: blx _g
+; CHECK-OTHER: bl {{_?}}g
diff --git a/test/CodeGen/ARM/this-return.ll b/test/CodeGen/ARM/this-return.ll
index 802f880c1380..931210c2a8ed 100644
--- a/test/CodeGen/ARM/this-return.ll
+++ b/test/CodeGen/ARM/this-return.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi | FileCheck %s -check-prefix=CHECKELF
-; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 | FileCheck %s -check-prefix=CHECKT2D
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -arm-this-return-forwarding | FileCheck %s -check-prefix=CHECKELF
+; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 -arm-this-return-forwarding | FileCheck %s -check-prefix=CHECKT2D
%struct.A = type { i8 }
%struct.B = type { i32 }
@@ -24,7 +24,7 @@ entry:
; CHECKELF: b B_ctor_base
; CHECKT2D-LABEL: C_ctor_base:
; CHECKT2D-NOT: mov {{r[0-9]+}}, r0
-; CHECKT2D: blx _A_ctor_base
+; CHECKT2D: bl _A_ctor_base
; CHECKT2D-NOT: mov r0, {{r[0-9]+}}
; CHECKT2D: b.w _B_ctor_base
%0 = bitcast %struct.C* %this to %struct.A*
@@ -43,7 +43,7 @@ entry:
; CHECKELF-NOT: b B_ctor_base_nothisret
; CHECKT2D-LABEL: C_ctor_base_nothisret:
; CHECKT2D: mov [[SAVETHIS:r[0-9]+]], r0
-; CHECKT2D: blx _A_ctor_base_nothisret
+; CHECKT2D: bl _A_ctor_base_nothisret
; CHECKT2D: mov r0, [[SAVETHIS]]
; CHECKT2D-NOT: b.w _B_ctor_base_nothisret
%0 = bitcast %struct.C* %this to %struct.A*
@@ -82,7 +82,7 @@ entry:
; CHECKELF: b B_ctor_complete
; CHECKT2D-LABEL: D_ctor_base:
; CHECKT2D-NOT: mov {{r[0-9]+}}, r0
-; CHECKT2D: blx _B_ctor_complete
+; CHECKT2D: bl _B_ctor_complete
; CHECKT2D-NOT: mov r0, {{r[0-9]+}}
; CHECKT2D: b.w _B_ctor_complete
%b = getelementptr inbounds %struct.D, %struct.D* %this, i32 0, i32 0
diff --git a/test/CodeGen/ARM/thread_pointer.ll b/test/CodeGen/ARM/thread_pointer.ll
index c403fa5c4a2a..fe1d3a4dfd0f 100644
--- a/test/CodeGen/ARM/thread_pointer.ll
+++ b/test/CodeGen/ARM/thread_pointer.ll
@@ -3,8 +3,8 @@
define i8* @test() {
entry:
- %tmp1 = call i8* @llvm.arm.thread.pointer( ) ; <i8*> [#uses=0]
+ %tmp1 = call i8* @llvm.thread.pointer( ) ; <i8*> [#uses=0]
ret i8* %tmp1
}
-declare i8* @llvm.arm.thread.pointer()
+declare i8* @llvm.thread.pointer()
diff --git a/test/CodeGen/ARM/thumb-alignment.ll b/test/CodeGen/ARM/thumb-alignment.ll
index b9ddfbb714d1..8e894d28b6c6 100644
--- a/test/CodeGen/ARM/thumb-alignment.ll
+++ b/test/CodeGen/ARM/thumb-alignment.ll
@@ -3,13 +3,13 @@
@x = external global i32
; CHECK: .globl foo
-; CHECK-NEXT: .align 2
+; CHECK-NEXT: .p2align 2
define i32* @foo() {
ret i32* @x
}
; CHECK: .globl bar
-; CHECK-NEXT: .align 1
+; CHECK-NEXT: .p2align 1
define i32* @bar() {
ret i32* zeroinitializer
}
@@ -22,7 +22,7 @@ define i32* @bar() {
; Create a Thumb-2 jump table, which should force alignment to 4 bytes.
; CHECK: .globl baz
-; CHECK-NEXT: .align 2
+; CHECK-NEXT: .p2align 2
; CHECK: tbb
define i32 @baz() {
%1 = load i32, i32* @c, align 4
diff --git a/test/CodeGen/ARM/thumb-stub.ll b/test/CodeGen/ARM/thumb-stub.ll
new file mode 100644
index 000000000000..68001b6c0eec
--- /dev/null
+++ b/test/CodeGen/ARM/thumb-stub.ll
@@ -0,0 +1,10 @@
+; RUN: llc -relocation-model=pic -mtriple=thumb-apple-darwin < %s | FileCheck %s
+
+declare hidden void @f()
+
+; CHECK: bl _f
+
+define void @g() {
+ call void @f()
+ ret void
+}
diff --git a/test/CodeGen/ARM/thumb1-ldst-opt.ll b/test/CodeGen/ARM/thumb1-ldst-opt.ll
index eb82385de0c5..f3c83f04588f 100644
--- a/test/CodeGen/ARM/thumb1-ldst-opt.ll
+++ b/test/CodeGen/ARM/thumb1-ldst-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -stop-after block-placement -o /dev/null %s | FileCheck %s
+; RUN: llc -stop-after block-placement -o - %s | FileCheck %s
target triple = "thumbv6m-none-none"
diff --git a/test/CodeGen/ARM/thumb1-varalloc.ll b/test/CodeGen/ARM/thumb1-varalloc.ll
index 0637be03d565..3787c4282b28 100644
--- a/test/CodeGen/ARM/thumb1-varalloc.ll
+++ b/test/CodeGen/ARM/thumb1-varalloc.ll
@@ -52,7 +52,7 @@ define void @test_simple_var() {
; CHECK: mov r0, sp
; CHECK-NOT: adds r0
-; CHECK: blx
+; CHECK: bl
call void @take_ptr(i8* %addr8)
ret void
}
@@ -67,12 +67,12 @@ define void @test_local_var_addr_aligned() {
%addr2 = bitcast i32* %addr2.32 to i8*
; CHECK: add r0, sp, #{{[0-9]+}}
-; CHECK: blx
+; CHECK: bl
call void @take_ptr(i8* %addr1)
; CHECK: mov r0, sp
; CHECK-NOT: add r0
-; CHECK: blx
+; CHECK: bl
call void @take_ptr(i8* %addr2)
ret void
@@ -87,7 +87,7 @@ define void @test_local_var_big_offset() {
; CHECK: add [[RTMP:r[0-9]+]], sp, #1020
; CHECK: adds [[RTMP]], #8
-; CHECK: blx
+; CHECK: bl
call void @take_ptr(i8* %addr1)
ret void
@@ -100,7 +100,7 @@ define void @test_local_var_offset_1020() {
%addr2 = alloca i8, i32 1020
; CHECK: add r0, sp, #1020
-; CHECK-NEXT: blx
+; CHECK-NEXT: bl
call void @take_ptr(i8* %addr1)
ret void
@@ -116,7 +116,7 @@ define void @test_local_var_offset_1268() {
; CHECK: add r0, sp, #1020
; CHECK: adds r0, #248
-; CHECK-NEXT: blx
+; CHECK-NEXT: bl
call void @take_ptr(i8* %addr1)
ret void
diff --git a/test/CodeGen/ARM/thumb2-size-opt.ll b/test/CodeGen/ARM/thumb2-size-opt.ll
index 0084a456a72e..aba027607ea7 100644
--- a/test/CodeGen/ARM/thumb2-size-opt.ll
+++ b/test/CodeGen/ARM/thumb2-size-opt.ll
@@ -82,3 +82,19 @@ entry:
%shr = lshr i32 %a, %b
ret i32 %shr
}
+
+define i32 @bundled_instruction(i32* %addr, i32** %addr2, i1 %tst) minsize {
+; CHECK-LABEL: bundled_instruction:
+; CHECK: iteee ne
+; CHECK: ldmeq r0!, {{{r[0-9]+}}}
+ br i1 %tst, label %true, label %false
+
+true:
+ ret i32 0
+
+false:
+ %res = load i32, i32* %addr, align 4
+ %next = getelementptr i32, i32* %addr, i32 1
+ store i32* %next, i32** %addr2
+ ret i32 %res
+}
diff --git a/test/CodeGen/ARM/tls-models.ll b/test/CodeGen/ARM/tls-models.ll
index f3c58f74ebf7..d8c74d213b13 100644
--- a/test/CodeGen/ARM/tls-models.ll
+++ b/test/CodeGen/ARM/tls-models.ll
@@ -130,7 +130,7 @@ entry:
; EMU-NOT: __emutls_t.external_gd
; EMU-NOT: __emutls_v.external_gd
-; EMU: .align 2
+; EMU: .p2align 2
; EMU-LABEL: __emutls_v.internal_gd:
; EMU-NEXT: .long 4
; EMU-NEXT: .long 4
@@ -144,7 +144,7 @@ entry:
; EMU-NOT: __emutls_t.external_gd
; EMU-NOT: __emutls_v.external_gd
-; EMU: .align 2
+; EMU: .p2align 2
; EMU-LABEL: __emutls_v.internal_le:
; EMU-NEXT: .long 4
; EMU-NEXT: .long 4
diff --git a/test/CodeGen/ARM/tls3.ll b/test/CodeGen/ARM/tls3.ll
index 94cadeedd938..ca3cde264da7 100644
--- a/test/CodeGen/ARM/tls3.ll
+++ b/test/CodeGen/ARM/tls3.ll
@@ -23,7 +23,7 @@ entry:
; CHECK-NOT: __emutls_t.teste
-; EMU: .align 2
+; EMU: .p2align 2
; EMU-LABEL: __emutls_v.teste:
; EMU-NEXT: .long 8
; EMU-NEXT: .long 4
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index 0baf50b45b20..585218cf337c 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -1,38 +1,56 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=INSTR
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN
; RUN: llc < %s -mtriple=arm-apple-darwin -trap-func=_trap | FileCheck %s -check-prefix=FUNC
; RUN: llc < %s -mtriple=arm-apple-darwin -trap-func=_trap -O0 | FileCheck %s -check-prefix=FUNC
-; RUN: llc -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \
-; RUN: | llvm-objdump -disassemble -triple armv7-unknown-nacl - \
-; RUN: | FileCheck %s -check-prefix=ENCODING-NACL
-; RUN: llc -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \
-; RUN: | llvm-objdump -disassemble -triple armv7 -mattr=+nacl-trap - \
-; RUN: | FileCheck %s -check-prefix=ENCODING-NACL
+; RUN: llc < %s -mtriple=armv7 -mattr=+nacl-trap | FileCheck %s -check-prefix=NACL
+; RUN: llc < %s -mtriple=armv7 | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=thumbv7 | FileCheck %s -check-prefix=THUMB
+
; RUN: llc -mtriple=armv7 -mattr=+nacl-trap -filetype=obj %s -o - \
; RUN: | llvm-objdump -disassemble -triple armv7 -mattr=+nacl-trap - \
; RUN: | FileCheck %s -check-prefix=ENCODING-NACL
-; RUN: llc -verify-machineinstrs -fast-isel -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \
-; RUN: | llvm-objdump -disassemble -triple armv7-unknown-nacl - \
+; RUN: llc -verify-machineinstrs -fast-isel -mtriple=armv7 -mattr=+nacl-trap -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple armv7 -mattr=+nacl-trap - \
; RUN: | FileCheck %s -check-prefix=ENCODING-NACL
+
; RUN: llc -mtriple=armv7 -filetype=obj %s -o - \
; RUN: | llvm-objdump -disassemble -triple armv7 - \
-; RUN: | FileCheck %s -check-prefix=ENCODING-ALL
+; RUN: | FileCheck %s -check-prefix=ENCODING-ARM
; RUN: llc -verify-machineinstrs -fast-isel -mtriple=armv7 -filetype=obj %s -o - \
; RUN: | llvm-objdump -disassemble -triple armv7 - \
-; RUN: | FileCheck %s -check-prefix=ENCODING-ALL
+; RUN: | FileCheck %s -check-prefix=ENCODING-ARM
+
+; RUN: llc -mtriple=thumbv7 -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple thumbv7 - \
+; RUN: | FileCheck %s -check-prefix=ENCODING-THUMB
+; RUN: llc -verify-machineinstrs -fast-isel -mtriple=thumbv7 -filetype=obj %s -o - \
+; RUN: | llvm-objdump -disassemble -triple thumbv7 - \
+; RUN: | FileCheck %s -check-prefix=ENCODING-THUMB
+
; rdar://7961298
; rdar://9249183
define void @t() nounwind {
entry:
-; INSTR-LABEL: t:
-; INSTR: trap
+; DARWIN-LABEL: t:
+; DARWIN: trap
; FUNC-LABEL: t:
; FUNC: bl __trap
-; ENCODING-NACL: f0 de fe e7
+; NACL-LABEL: t:
+; NACL: .inst 0xe7fedef0
-; ENCODING-ALL: fe de ff e7
+; ARM-LABEL: t:
+; ARM: .inst 0xe7ffdefe
+
+; THUMB-LABEL: t:
+; THUMB: .inst.n 0xdefe
+
+; ENCODING-NACL: f0 de fe e7 trap
+
+; ENCODING-ARM: fe de ff e7 trap
+
+; ENCODING-THUMB: fe de trap
call void @llvm.trap()
unreachable
@@ -40,15 +58,26 @@ entry:
define void @t2() nounwind {
entry:
-; INSTR-LABEL: t2:
-; INSTR: trap
+; DARWIN-LABEL: t2:
+; DARWIN: trap
; FUNC-LABEL: t2:
; FUNC: bl __trap
-; ENCODING-NACL: f0 de fe e7
+; NACL-LABEL: t2:
+; NACL: .inst 0xe7fedef0
+
+; ARM-LABEL: t2:
+; ARM: .inst 0xe7ffdefe
+
+; THUMB-LABEL: t2:
+; THUMB: .inst.n 0xdefe
+
+; ENCODING-NACL: f0 de fe e7 trap
+
+; ENCODING-ARM: fe de ff e7 trap
-; ENCODING-ALL: fe de ff e7
+; ENCODING-THUMB: fe de trap
call void @llvm.debugtrap()
unreachable
diff --git a/test/CodeGen/ARM/truncstore-dag-combine.ll b/test/CodeGen/ARM/truncstore-dag-combine.ll
index 11fa022451f6..5142a305b3a7 100644
--- a/test/CodeGen/ARM/truncstore-dag-combine.ll
+++ b/test/CodeGen/ARM/truncstore-dag-combine.ll
@@ -1,5 +1,8 @@
; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
+; CHECK-LABEL: bar
+; CHECK-NOT: orr
+; CHECK-NOT: mov
define void @bar(i8* %P, i16* %Q) {
entry:
%P1 = bitcast i8* %P to i16* ; <i16*> [#uses=1]
@@ -8,6 +11,9 @@ entry:
ret void
}
+; CHECK-LABEL: foo
+; CHECK-NOT: orr
+; CHECK-NOT: mov
define void @foo(i8* %P, i32* %Q) {
entry:
%P1 = bitcast i8* %P to i32* ; <i32*> [#uses=1]
@@ -15,7 +21,3 @@ entry:
store i32 %tmp, i32* %P1, align 1
ret void
}
-
-; CHECK-NOT: orr
-; CHECK-NOT: mov
-
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
index 97a49334b742..f0a95c833c6b 100644
--- a/test/CodeGen/ARM/twoaddrinstr.ll
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -5,6 +5,7 @@ define void @PR13378() nounwind {
; This was originally a crasher trying to schedule the instructions.
; CHECK-LABEL: PR13378:
; CHECK: vld1.32
+; CHECK-NEXT: vmov.i32
; CHECK-NEXT: vst1.32
; CHECK-NEXT: vst1.32
; CHECK-NEXT: vmov.f32
diff --git a/test/CodeGen/ARM/urem-opt-size.ll b/test/CodeGen/ARM/urem-opt-size.ll
new file mode 100644
index 000000000000..7f1cd43bc4e3
--- /dev/null
+++ b/test/CodeGen/ARM/urem-opt-size.ll
@@ -0,0 +1,45 @@
+; When optimising for minimum size, we don't want to expand a div into a
+; multiply-and-shift sequence. As a result, the urem instruction, for example,
+; will not be expanded into a sequence of umull, lsrs, muls and sub
+; instructions, but will instead become just a call to __aeabi_uidivmod. (A
+; sketch of the avoided expansion appears after this test.)
+;
+; RUN: llc -mtriple=armv7a-eabi -mattr=-neon -verify-machineinstrs %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+define i32 @foo1() local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: foo1:
+; CHECK:__aeabi_idiv
+; CHECK-NOT: smmul
+ %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
+ %div = sdiv i32 %call, 1000000
+ ret i32 %div
+}
+
+define i32 @foo2() local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: foo2:
+; CHECK: __aeabi_uidiv
+; CHECK-NOT: umull
+ %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
+ %div = udiv i32 %call, 1000000
+ ret i32 %div
+}
+
+define i32 @foo3() local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: foo3:
+; CHECK: __aeabi_uidivmod
+; CHECK-NOT: umull
+ %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
+ %rem = urem i32 %call, 1000000
+ %cmp = icmp eq i32 %rem, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+declare i32 @GetValue(...) local_unnamed_addr
+
+attributes #0 = { minsize nounwind optsize }
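
For the tests above, here is a rough sketch of the multiply-and-shift expansion that minsize avoids; it is purely illustrative (the function name and the exact magic constant and shift are chosen for this example, not taken from the backend's output). Without minsize, an unsigned 32-bit division by 1000000 can be rewritten as a widening multiply by a magic constant followed by a shift:

define i32 @udiv_by_1000000_expanded(i32 %x) {
entry:
  ; 1125899907 = ceil(2^50 / 1000000); for every 32-bit %x,
  ; (%x * 1125899907) >> 50 equals %x udiv 1000000.
  %wide = zext i32 %x to i64
  %mul = mul i64 %wide, 1125899907
  %shr = lshr i64 %mul, 50
  %res = trunc i64 %shr to i32
  ret i32 %res
}

At minsize the tests instead expect the compact __aeabi_uidiv/__aeabi_uidivmod libcalls, trading speed for a much shorter call sequence.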
diff --git a/test/CodeGen/ARM/v7k-libcalls.ll b/test/CodeGen/ARM/v7k-libcalls.ll
index a1dfb07ca614..8ca31ef4e652 100644
--- a/test/CodeGen/ARM/v7k-libcalls.ll
+++ b/test/CodeGen/ARM/v7k-libcalls.ll
@@ -145,6 +145,15 @@ define arm_aapcs_vfpcc double @t14(double %x) {
ret double %__exp10
}
+define i16 @t15(double %x) {
+; CHECK-LABEL: t15:
+; CHECK-NOT: vmov
+; CHECK: bl ___truncdfhf2
+ %tmp0 = fptrunc double %x to half
+ %tmp1 = bitcast half %tmp0 to i16
+ ret i16 %tmp1
+}
+
declare arm_aapcs_vfpcc double @x(double, double, double, double, double, double, double, float, double)
declare arm_aapcs_vfpcc double @cos(double) #0
declare arm_aapcs_vfpcc double @sin(double) #0
diff --git a/test/CodeGen/ARM/v7k-sincos.ll b/test/CodeGen/ARM/v7k-sincos.ll
index b89d4dc8120b..2db2dc088d95 100644
--- a/test/CodeGen/ARM/v7k-sincos.ll
+++ b/test/CodeGen/ARM/v7k-sincos.ll
@@ -5,7 +5,7 @@ declare double @cos(double) nounwind readnone
define double @test_stret(double %in) {
; CHECK-LABEL: test_stret:
-; CHECK: blx ___sincos_stret
+; CHECK: bl ___sincos_stret
; CHECK-NOT: ldr
; CHECK: vadd.f64 d0, d0, d1
diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll
index de251c58e6b9..6d9667bda6f3 100644
--- a/test/CodeGen/ARM/vcnt.ll
+++ b/test/CodeGen/ARM/vcnt.ll
@@ -44,6 +44,13 @@ define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
ret <2 x i32> %tmp2
}
+define <1 x i64> @vclz64(<1 x i64>* %A) nounwind {
+;CHECK-LABEL: vclz64:
+ %tmp1 = load <1 x i64>, <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %tmp1, i1 0)
+ ret <1 x i64> %tmp2
+}
+
define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vclzQ8:
;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
@@ -68,13 +75,84 @@ define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
ret <4 x i32> %tmp2
}
+define <2 x i64> @vclzQ64(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: vclzQ64:
+ %tmp1 = load <2 x i64>, <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %tmp1, i1 0)
+ ret <2 x i64> %tmp2
+}
+
+define <8 x i8> @vclz8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: vclz8b:
+;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
+ %tmp1 = load <8 x i8>, <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vclz16b(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: vclz16b:
+;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
+ %tmp1 = load <4 x i16>, <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vclz32b(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: vclz32b:
+;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
+ %tmp1 = load <2 x i32>, <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 1)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vclz64b(<1 x i64>* %A) nounwind {
+;CHECK-LABEL: vclz64b:
+ %tmp1 = load <1 x i64>, <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %tmp1, i1 1)
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vclzQ8b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: vclzQ8b:
+;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
+ %tmp1 = load <16 x i8>, <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vclzQ16b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: vclzQ16b:
+;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
+ %tmp1 = load <8 x i16>, <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vclzQ32b(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: vclzQ32b:
+;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
+ %tmp1 = load <4 x i32>, <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vclzQ64b(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: vclzQ64b:
+ %tmp1 = load <2 x i64>, <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %tmp1, i1 1)
+ ret <2 x i64> %tmp2
+}
+
declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
+declare <1 x i64> @llvm.ctlz.v1i64(<1 x i64>, i1) nounwind readnone
declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclss8:
diff --git a/test/CodeGen/ARM/vcvt_combine.ll b/test/CodeGen/ARM/vcvt_combine.ll
index 9a8f084d2303..11bed5a1a474 100644
--- a/test/CodeGen/ARM/vcvt_combine.ll
+++ b/test/CodeGen/ARM/vcvt_combine.ll
@@ -62,3 +62,11 @@ define <4 x i32> @t5(<4 x float> %in) {
%vcvt.i = fptosi <4 x float> %mul.i to <4 x i32>
ret <4 x i32> %vcvt.i
}
+
+; CHECK-LABEL: test_illegal_fp_to_int:
+; CHECK: vcvt.s32.f32 {{q[0-9]+}}, {{q[0-9]+}}, #2
+define <3 x i32> @test_illegal_fp_to_int(<3 x float> %in) {
+ %scale = fmul <3 x float> %in, <float 4.0, float 4.0, float 4.0>
+ %val = fptosi <3 x float> %scale to <3 x i32>
+ ret <3 x i32> %val
+} \ No newline at end of file
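
The #2 operand in the expected vcvt above comes from folding the multiply into a fixed-point conversion. Since 4.0 = 2^2,

  fptosi(x * 2^2)  ==  float-to-signed conversion of x with 2 fractional bits

so the fmul disappears into "vcvt.s32.f32 ..., #2", and the added test checks that this combine still fires for the illegal <3 x float> type (which ends up in a q register, as the CHECK pattern shows).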
diff --git a/test/CodeGen/ARM/vdiv_combine.ll b/test/CodeGen/ARM/vdiv_combine.ll
index 8511dbcb6876..4a6c36b42772 100644
--- a/test/CodeGen/ARM/vdiv_combine.ll
+++ b/test/CodeGen/ARM/vdiv_combine.ll
@@ -153,3 +153,11 @@ define <4 x float> @test8(<4 x i32> %in) {
%div.i = fdiv <4 x float> %vcvt.i, <float 2.0, float 2.0, float 2.0, float undef>
ret <4 x float> %div.i
}
+
+; CHECK-LABEL: test_illegal_int_to_fp:
+; CHECK: vcvt.f32.s32
+define <3 x float> @test_illegal_int_to_fp(<3 x i32> %in) {
+ %conv = sitofp <3 x i32> %in to <3 x float>
+ %res = fdiv <3 x float> %conv, <float 4.0, float 4.0, float 4.0>
+ ret <3 x float> %res
+} \ No newline at end of file
diff --git a/test/CodeGen/ARM/vfp-libcalls.ll b/test/CodeGen/ARM/vfp-libcalls.ll
index b08073ab62b3..59d5ccc95840 100644
--- a/test/CodeGen/ARM/vfp-libcalls.ll
+++ b/test/CodeGen/ARM/vfp-libcalls.ll
@@ -4,7 +4,7 @@
define float @test_call(float %a, float %b) {
; CHECK-HARD: vadd.f32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-SOFTISH: blx ___addsf3vfp
+; CHECK-SOFTISH: bl ___addsf3vfp
; CHECK-SOFT: bl ___addsf3{{$}}
%sum = fadd float %a, %b
ret float %sum
diff --git a/test/CodeGen/ARM/vfp-regs-dwarf.ll b/test/CodeGen/ARM/vfp-regs-dwarf.ll
index 1b2055e5aff1..2ab85053a91f 100644
--- a/test/CodeGen/ARM/vfp-regs-dwarf.ll
+++ b/test/CodeGen/ARM/vfp-regs-dwarf.ll
@@ -31,11 +31,10 @@ define void @stack_offsets() !dbg !4 {
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "tmp.c", directory: "/Users/tim/llvm/build")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "bar", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "bar", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2)
!5 = !DIFile(filename: "tmp.c", directory: "/Users/tim/llvm/build")
!6 = !DISubroutineType(types: !7)
!7 = !{null}
diff --git a/test/CodeGen/ARM/vminmax.ll b/test/CodeGen/ARM/vminmax.ll
index 011bfd7ff88e..e4f30faa9177 100644
--- a/test/CodeGen/ARM/vminmax.ll
+++ b/test/CodeGen/ARM/vminmax.ll
@@ -291,3 +291,20 @@ declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind read
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+declare float @llvm.maxnum.f32(float %a, float %b)
+declare float @llvm.minnum.f32(float %a, float %b)
+
+define float @maxnum(float %a, float %b) {
+;CHECK-LABEL: maxnum:
+;CHECK: vmax.f32
+ %r = call nnan float @llvm.maxnum.f32(float %a, float %b)
+ ret float %r
+}
+
+define float @minnum(float %a, float %b) {
+;CHECK-LABEL: minnum:
+;CHECK: vmin.f32
+ %r = call nnan float @llvm.minnum.f32(float %a, float %b)
+ ret float %r
+}
diff --git a/test/CodeGen/ARM/warn-stack.ll b/test/CodeGen/ARM/warn-stack.ll
index 474dc1dfb447..6e819e404323 100644
--- a/test/CodeGen/ARM/warn-stack.ll
+++ b/test/CodeGen/ARM/warn-stack.ll
@@ -12,7 +12,7 @@ entry:
ret void
}
-; CHECK: warning: stack size limit exceeded (96) in warn
+; CHECK: warning: stack size limit exceeded (92) in warn
define void @warn() nounwind ssp {
entry:
%buffer = alloca [80 x i8], align 1
diff --git a/test/CodeGen/ARM/wide-compares.ll b/test/CodeGen/ARM/wide-compares.ll
new file mode 100644
index 000000000000..08e07ef76ad7
--- /dev/null
+++ b/test/CodeGen/ARM/wide-compares.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=armv7-unknown-linux < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
+; RUN: llc -mtriple=thumbv6-unknown-linux < %s | FileCheck --check-prefix=CHECK-THUMB1 %s
+; RUN: llc -mtriple=thumbv7-unknown-linux < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-THUMB2 %s
+
+; CHECK-THUMB1-NOT: sbc
+
+; CHECK-LABEL: test_slt1:
+define i32 @test_slt1(i64 %a, i64 %b) {
+entry:
+ ; CHECK-ARM: subs {{[^,]+}}, r0, r2
+ ; CHECK-ARM: mov [[TMP:[0-9a-z]+]], #2
+ ; CHECK-ARM: sbcs {{[^,]+}}, r1, r3
+ ; CHECK-ARM: movwlt [[TMP]], #1
+ ; CHECK-ARM: mov r0, [[TMP]]
+ ; CHECK-ARM: bx lr
+ ; CHECK-THUMB2: subs {{[^,]+}}, r0, r2
+ ; CHECK-THUMB2: mov.w [[TMP:[0-9a-z]+]], #2
+ ; CHECK-THUMB2: sbcs.w {{[^,]+}}, r1, r3
+ ; CHECK-THUMB2: it lt
+ ; CHECK-THUMB2: movlt.w [[TMP]], #1
+ ; CHECK-THUMB2: mov r0, [[TMP]]
+ ; CHECK-THUMB2: bx lr
+ %cmp = icmp slt i64 %a, %b
+ br i1 %cmp, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 2
+}
+
+; CHECK-LABEL: test_slt2:
+define void @test_slt2(i64 %a, i64 %b) {
+entry:
+ %cmp = icmp slt i64 %a, %b
+ ; CHECK-ARM: subs {{[^,]+}}, r0, r2
+ ; CHECK-ARM: sbcs {{[^,]+}}, r1, r3
+ ; CHECK-THUMB2: subs {{[^,]+}}, r0, r2
+ ; CHECK-THUMB2: sbcs.w {{[^,]+}}, r1, r3
+ ; CHECK: bge [[BB2:\.[0-9A-Za-z_]+]]
+ br i1 %cmp, label %bb1, label %bb2
+bb1:
+ call void @f()
+ ret void
+bb2:
+ ; CHECK: [[BB2]]:
+ ; CHECK-NEXT: bl g
+ call void @g()
+ ret void
+}
+
+declare void @f()
+declare void @g()
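
The subs/sbcs pair these CHECK lines expect is the standard multi-word signed comparison. Writing the operands as a = a_hi * 2^32 + a_lo and b = b_hi * 2^32 + b_lo,

  a < b (signed)  <=>  a - b < 0

and subtracting the low words with subs, then the high words with sbcs (which consumes the borrow), leaves the flags of that full 64-bit difference, so a single lt/ge condition decides the i64 comparison without ever materialising the wide result in a register.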
diff --git a/test/CodeGen/ARM/widen-vmovs.ll b/test/CodeGen/ARM/widen-vmovs.ll
index 316cfabab48c..2abf8d9701fc 100644
--- a/test/CodeGen/ARM/widen-vmovs.ll
+++ b/test/CodeGen/ARM/widen-vmovs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -widen-vmovs -mcpu=cortex-a8 -verify-machineinstrs -disable-block-placement | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a8 -verify-machineinstrs -disable-block-placement | FileCheck %s
target triple = "thumbv7-apple-ios"
; The 1.0e+10 constant is loaded from the constant pool and kept in a register.
diff --git a/test/CodeGen/ARM/zero-cycle-zero.ll b/test/CodeGen/ARM/zero-cycle-zero.ll
index 4e8696f4418a..2022d88dd0ee 100644
--- a/test/CodeGen/ARM/zero-cycle-zero.ll
+++ b/test/CodeGen/ARM/zero-cycle-zero.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=armv8 -mcpu=cyclone < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTSWIFT
-; RUN: llc -mtriple=armv8 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=armv8 -mcpu=swift < %s | FileCheck %s
; RUN: llc -mtriple=armv8 -mcpu=cortex-a57 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTSWIFT
declare arm_aapcs_vfpcc void @take_vec64(<2 x i32>)
diff --git a/test/CodeGen/BPF/sdiv_error.ll b/test/CodeGen/BPF/sdiv_error.ll
new file mode 100644
index 000000000000..053b82dd98fa
--- /dev/null
+++ b/test/CodeGen/BPF/sdiv_error.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -march=bpf < %s 2> %t1
+; RUN: FileCheck %s < %t1
+; CHECK: Unsupport signed division
+
+; Function Attrs: norecurse nounwind readnone
+define i32 @test(i32 %len) #0 {
+ %1 = srem i32 %len, 15
+ ret i32 %1
+}
diff --git a/test/CodeGen/CPP/2007-06-16-Funcname.ll b/test/CodeGen/CPP/2007-06-16-Funcname.ll
deleted file mode 100644
index 71fea12d9c2c..000000000000
--- a/test/CodeGen/CPP/2007-06-16-Funcname.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=cpp -cppfname=WAKKA | not grep makeLLVMModule
-; PR1515
-
-define void @foo() {
- ret void
-}
-
diff --git a/test/CodeGen/CPP/2009-05-01-Long-Double.ll b/test/CodeGen/CPP/2009-05-01-Long-Double.ll
deleted file mode 100644
index 470303d6bb05..000000000000
--- a/test/CodeGen/CPP/2009-05-01-Long-Double.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=cpp -cppgen=program -o %t
-
-define x86_fp80 @some_func() nounwind {
-entry:
- %retval = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
- %call = call i32 (...) @other_func() ; <i32> [#uses=1]
- %conv = sitofp i32 %call to x86_fp80 ; <x86_fp80> [#uses=1]
- store x86_fp80 %conv, x86_fp80* %retval
- %0 = load x86_fp80, x86_fp80* %retval ; <x86_fp80> [#uses=1]
- ret x86_fp80 %0
-}
-
-declare i32 @other_func(...)
diff --git a/test/CodeGen/CPP/2009-05-04-CondBr.ll b/test/CodeGen/CPP/2009-05-04-CondBr.ll
deleted file mode 100644
index 9ce1e5f02042..000000000000
--- a/test/CodeGen/CPP/2009-05-04-CondBr.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc < %s -march=cpp -cppgen=program -o %t
-; RUN: grep "BranchInst::Create(label_if_then, label_if_end, int1_cmp, label_entry);" %t
-
-define i32 @some_func(i32 %a) nounwind {
-entry:
- %retval = alloca i32 ; <i32*> [#uses=2]
- %a.addr = alloca i32 ; <i32*> [#uses=8]
- store i32 %a, i32* %a.addr
- %tmp = load i32, i32* %a.addr ; <i32> [#uses=1]
- %inc = add i32 %tmp, 1 ; <i32> [#uses=1]
- store i32 %inc, i32* %a.addr
- %tmp1 = load i32, i32* %a.addr ; <i32> [#uses=1]
- %cmp = icmp slt i32 %tmp1, 3 ; <i1> [#uses=1]
- br i1 %cmp, label %if.then, label %if.end
-
-if.then: ; preds = %entry
- store i32 7, i32* %a.addr
- br label %if.end
-
-if.end: ; preds = %if.then, %entry
- %tmp2 = load i32, i32* %a.addr ; <i32> [#uses=1]
- %inc3 = add i32 %tmp2, 1 ; <i32> [#uses=1]
- store i32 %inc3, i32* %a.addr
- %tmp4 = load i32, i32* %a.addr ; <i32> [#uses=1]
- store i32 %tmp4, i32* %retval
- %0 = load i32, i32* %retval ; <i32> [#uses=1]
- ret i32 %0
-}
diff --git a/test/CodeGen/CPP/2012-02-05-UnitVarCrash.ll b/test/CodeGen/CPP/2012-02-05-UnitVarCrash.ll
deleted file mode 100644
index 419f59476784..000000000000
--- a/test/CodeGen/CPP/2012-02-05-UnitVarCrash.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=cpp
-declare void @foo(<4 x i32>)
-define void @bar() {
- call void @foo(<4 x i32> <i32 0, i32 1, i32 2, i32 3>)
- ret void
-}
diff --git a/test/CodeGen/CPP/atomic.ll b/test/CodeGen/CPP/atomic.ll
deleted file mode 100644
index e79c45d166a5..000000000000
--- a/test/CodeGen/CPP/atomic.ll
+++ /dev/null
@@ -1,89 +0,0 @@
-; RUN: llc -march=cpp -o - %s | FileCheck %s
-
-define void @test_atomicrmw(i32* %addr, i32 %inc) {
- %inst0 = atomicrmw xchg i32* %addr, i32 %inc seq_cst
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Xchg, {{.*}}, SequentiallyConsistent, CrossThread
- ; CHECK: [[INST]]->setName("inst0");
- ; CHECK: [[INST]]->setVolatile(false);
-
- %inst1 = atomicrmw add i32* %addr, i32 %inc seq_cst
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Add, {{.*}}, SequentiallyConsistent, CrossThread
- ; CHECK: [[INST]]->setName("inst1");
- ; CHECK: [[INST]]->setVolatile(false);
-
- %inst2 = atomicrmw volatile sub i32* %addr, i32 %inc singlethread monotonic
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Sub, {{.*}}, Monotonic, SingleThread
- ; CHECK: [[INST]]->setName("inst2");
- ; CHECK: [[INST]]->setVolatile(true);
-
- %inst3 = atomicrmw and i32* %addr, i32 %inc acq_rel
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::And, {{.*}}, AcquireRelease, CrossThread
- ; CHECK: [[INST]]->setName("inst3");
- ; CHECK: [[INST]]->setVolatile(false);
-
- %inst4 = atomicrmw nand i32* %addr, i32 %inc release
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Nand, {{.*}}, Release, CrossThread
- ; CHECK: [[INST]]->setName("inst4");
- ; CHECK: [[INST]]->setVolatile(false);
-
- %inst5 = atomicrmw volatile or i32* %addr, i32 %inc singlethread seq_cst
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Or, {{.*}}, SequentiallyConsistent, SingleThread
- ; CHECK: [[INST]]->setName("inst5");
- ; CHECK: [[INST]]->setVolatile(true);
-
- %inst6 = atomicrmw xor i32* %addr, i32 %inc release
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Xor, {{.*}}, Release, CrossThread
- ; CHECK: [[INST]]->setName("inst6");
- ; CHECK: [[INST]]->setVolatile(false);
-
- %inst7 = atomicrmw volatile max i32* %addr, i32 %inc singlethread monotonic
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Max, {{.*}}, Monotonic, SingleThread
- ; CHECK: [[INST]]->setName("inst7");
- ; CHECK: [[INST]]->setVolatile(true);
-
- %inst8 = atomicrmw min i32* %addr, i32 %inc acquire
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Min, {{.*}}, Acquire, CrossThread
- ; CHECK: [[INST]]->setName("inst8");
- ; CHECK: [[INST]]->setVolatile(false);
-
- %inst9 = atomicrmw volatile umax i32* %addr, i32 %inc monotonic
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::UMax, {{.*}}, Monotonic, CrossThread
- ; CHECK: [[INST]]->setName("inst9");
- ; CHECK: [[INST]]->setVolatile(true);
-
- %inst10 = atomicrmw umin i32* %addr, i32 %inc singlethread release
- ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::UMin, {{.*}}, Release, SingleThread
- ; CHECK: [[INST]]->setName("inst10");
- ; CHECK: [[INST]]->setVolatile(false);
-
-
- ret void
-}
-
-define void @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
- %inst0 = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
- ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, SequentiallyConsistent, Monotonic, CrossThread
- ; CHECK: [[INST]]->setName("inst0");
- ; CHECK: [[INST]]->setVolatile(false);
- ; CHECK: [[INST]]->setWeak(false);
-
- %inst1 = cmpxchg volatile i32* %addr, i32 %desired, i32 %new singlethread acq_rel acquire
- ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, AcquireRelease, Acquire, SingleThread
- ; CHECK: [[INST]]->setName("inst1");
- ; CHECK: [[INST]]->setVolatile(true);
- ; CHECK: [[INST]]->setWeak(false);
-
- %inst2 = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
- ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, SequentiallyConsistent, Monotonic, CrossThread
- ; CHECK: [[INST]]->setName("inst2");
- ; CHECK: [[INST]]->setVolatile(false);
- ; CHECK: [[INST]]->setWeak(true);
-
- %inst3 = cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread acq_rel acquire
- ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, AcquireRelease, Acquire, SingleThread
- ; CHECK: [[INST]]->setName("inst3");
- ; CHECK: [[INST]]->setVolatile(true);
- ; CHECK: [[INST]]->setWeak(true);
-
- ret void
-}
diff --git a/test/CodeGen/CPP/attributes.ll b/test/CodeGen/CPP/attributes.ll
deleted file mode 100644
index 3dab617d80b9..000000000000
--- a/test/CodeGen/CPP/attributes.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=cpp | FileCheck %s
-
-define void @f1(i8* byval, i8* inalloca) {
-; CHECK: ByVal
-; CHECK: InAlloca
- ret void
-}
diff --git a/test/CodeGen/CPP/gep.ll b/test/CodeGen/CPP/gep.ll
deleted file mode 100644
index 88a0bf1f216d..000000000000
--- a/test/CodeGen/CPP/gep.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc -march=cpp -o - %s | FileCheck %s
-
-define void @f1(i32* %addr) {
- %x = getelementptr i32, i32* %addr, i32 1
-; CHECK: ConstantInt* [[INT_1:.*]] = ConstantInt::get(mod->getContext(), APInt(32, StringRef("1"), 10));
-; CHECK: GetElementPtrInst::Create(IntegerType::get(mod->getContext(), 32), ptr_addr,
-; CHECK-NEXT: [[INT_1]]
-; CHECK-NEXT: }, "x", label_3);
- ret void
-}
diff --git a/test/CodeGen/CPP/lit.local.cfg b/test/CodeGen/CPP/lit.local.cfg
deleted file mode 100644
index 3ff5c6b69737..000000000000
--- a/test/CodeGen/CPP/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'CppBackend' in config.root.targets:
- config.unsupported = True
-
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index ae3c8da21471..e70215049995 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -2,7 +2,7 @@
; ARM & AArch64 run an extra SimplifyCFG which disrupts this test.
; Hexagon crashes (PR23377)
-; XFAIL: arm,aarch64,hexagon
+; XFAIL: arm,aarch64
; Make sure we have the correct weight attached to each successor.
define i32 @test2(i32 %x) nounwind uwtable readnone ssp {
@@ -41,11 +41,11 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
entry:
switch i32 %x, label %return [
i32 0, label %bb0
- i32 10, label %bb1
- i32 20, label %bb2
- i32 30, label %bb3
- i32 40, label %bb4
- i32 50, label %bb5
+ i32 100, label %bb1
+ i32 200, label %bb2
+ i32 300, label %bb3
+ i32 400, label %bb4
+ i32 500, label %bb5
], !prof !1
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
@@ -68,7 +68,7 @@ return: ret void
!1 = !{!"branch_weights",
; Default:
i32 1,
- ; Case 0, 10, 20:
+ ; Case 0, 100, 200:
i32 10, i32 1, i32 1,
- ; Case 30, 40, 50:
+ ; Case 300, 400, 500:
i32 1, i32 10, i32 10}
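
As a worked example of what this profile encodes: the weights sum to 1 + 10 + 1 + 1 + 1 + 10 + 10 = 34, so at the IR level bb0, bb4 and bb5 each get branch probability 10/34 (roughly 29%), while bb1, bb2, bb3 and the default destination each get 1/34 (roughly 3%). The test then checks that these weights stay attached to the right successors once the switch is lowered.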
diff --git a/test/CodeGen/Generic/Makefile b/test/CodeGen/Generic/Makefile
deleted file mode 100644
index 26ebc316a215..000000000000
--- a/test/CodeGen/Generic/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-# Makefile for running ad-hoc custom LLVM tests
-#
-%.bc: %.ll
- llvm-as $<
-
-%.llc.s: %.bc
- llc $< -o $@
-
-%.gcc.s: %.c
- gcc -O0 -S $< -o $@
-
-%.nat: %.s
- gcc -O0 -lm $< -o $@
-
-%.cbe.out: %.cbe.nat
- ./$< > $@
-
-%.out: %.nat
- ./$< > $@
-
-%.clean:
- rm -f $(patsubst %.clean,%.bc,$@) $(patsubst %.clean,%.*.s,$@) \
- $(patsubst %.clean,%.*.nat,$@) $(patsubst %.clean,%.*.out,$@)
diff --git a/test/CodeGen/Generic/dont-remove-empty-preheader.ll b/test/CodeGen/Generic/dont-remove-empty-preheader.ll
new file mode 100644
index 000000000000..36af1ffa8bad
--- /dev/null
+++ b/test/CodeGen/Generic/dont-remove-empty-preheader.ll
@@ -0,0 +1,39 @@
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
+; CHECK: for.body.preheader
+
+@N = common global i32 0, align 4
+@E = common global i8** null, align 8
+@B = common global i8** null, align 8
+
+; Function Attrs: nounwind
+define i32 @foo() {
+entry:
+ %0 = load i32, i32* @N, align 4
+ %1 = load i8**, i8*** @E, align 8
+ %2 = load i8**, i8*** @B, align 8
+ %cmp7 = icmp eq i8** %2, %1
+ br i1 %cmp7, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %n.0.lcssa = phi i32 [ %0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %n.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %I.09 = phi i8** [ %incdec.ptr, %for.body ], [ %2, %for.body.preheader ]
+ %n.08 = phi i32 [ %add, %for.body ], [ %0, %for.body.preheader ]
+ %3 = load i8*, i8** %I.09, align 8
+ %call = tail call i32 @map(i8* %3)
+ %add = add nsw i32 %call, %n.08
+ %incdec.ptr = getelementptr inbounds i8*, i8** %I.09, i64 1
+ %cmp = icmp eq i8** %incdec.ptr, %1
+ br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32 @map(i8*)
diff --git a/test/CodeGen/Generic/run-pass.ll b/test/CodeGen/Generic/run-pass.ll
deleted file mode 100644
index 55d62ec18648..000000000000
--- a/test/CodeGen/Generic/run-pass.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -debug-pass=Structure -run-pass=gc-lowering -o /dev/null 2>&1 | FileCheck %s
-
-; CHECK: -gc-lowering
-; CHECK: FunctionPass Manager
-; CHECK-NEXT: Lower Garbage Collection Instructions
-; CHECK-NEXT: Machine Function Analysis
-; CHECK-NEXT: MIR Printing Pass
diff --git a/test/CodeGen/Generic/stop-after.ll b/test/CodeGen/Generic/stop-after.ll
index 791378c3737d..07a60f7381ab 100644
--- a/test/CodeGen/Generic/stop-after.ll
+++ b/test/CodeGen/Generic/stop-after.ll
@@ -6,6 +6,6 @@
; STOP-NEXT: Machine Function Analysis
; STOP-NEXT: MIR Printing Pass
-; START: -machine-branch-prob -gc-lowering
+; START: -machine-branch-prob -pre-isel-intrinsic-lowering
; START: FunctionPass Manager
; START-NEXT: Lower Garbage Collection Instructions
diff --git a/test/CodeGen/Generic/vector-redux.ll b/test/CodeGen/Generic/vector-redux.ll
new file mode 100644
index 000000000000..8efdbf85b8c0
--- /dev/null
+++ b/test/CodeGen/Generic/vector-redux.ll
@@ -0,0 +1,237 @@
+; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+@a = global [1024 x i32] zeroinitializer, align 16
+
+define i32 @reduce_add() {
+; CHECK-LABEL: reduce_add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+; CHECK: Detected a reduction operation: {{.*}} add
+
+min.iters.checked:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ]
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ]
+ %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index
+ %1 = bitcast i32* %0 to <4 x i32>*
+ %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
+ %2 = getelementptr i32, i32* %0, i64 4
+ %3 = bitcast i32* %2 to <4 x i32>*
+ %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16
+ %4 = add nsw <4 x i32> %wide.load, %vec.phi
+ %5 = add nsw <4 x i32> %wide.load5, %vec.phi4
+ %index.next = add nuw nsw i64 %index, 8
+ %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next
+ %7 = bitcast i32* %6 to <4 x i32>*
+ %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16
+ %8 = getelementptr i32, i32* %6, i64 4
+ %9 = bitcast i32* %8 to <4 x i32>*
+ %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16
+ %10 = add nsw <4 x i32> %wide.load.1, %4
+ %11 = add nsw <4 x i32> %wide.load5.1, %5
+ %index.next.1 = add nsw i64 %index, 16
+ %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1
+ %13 = bitcast i32* %12 to <4 x i32>*
+ %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16
+ %14 = getelementptr i32, i32* %12, i64 4
+ %15 = bitcast i32* %14 to <4 x i32>*
+ %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16
+ %16 = add nsw <4 x i32> %wide.load.2, %10
+ %17 = add nsw <4 x i32> %wide.load5.2, %11
+ %index.next.2 = add nsw i64 %index, 24
+ %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2
+ %19 = bitcast i32* %18 to <4 x i32>*
+ %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16
+ %20 = getelementptr i32, i32* %18, i64 4
+ %21 = bitcast i32* %20 to <4 x i32>*
+ %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16
+ %22 = add nsw <4 x i32> %wide.load.3, %16
+ %23 = add nsw <4 x i32> %wide.load5.3, %17
+ %index.next.3 = add nsw i64 %index, 32
+ %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3
+ %25 = bitcast i32* %24 to <4 x i32>*
+ %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16
+ %26 = getelementptr i32, i32* %24, i64 4
+ %27 = bitcast i32* %26 to <4 x i32>*
+ %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16
+ %28 = add nsw <4 x i32> %wide.load.4, %22
+ %29 = add nsw <4 x i32> %wide.load5.4, %23
+ %index.next.4 = add nsw i64 %index, 40
+ %30 = icmp eq i64 %index.next.4, 1000
+ br i1 %30, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa10 = phi <4 x i32> [ %29, %vector.body ]
+ %.lcssa = phi <4 x i32> [ %28, %vector.body ]
+ %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa
+ %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf
+ %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7
+ %31 = extractelement <4 x i32> %bin.rdx8, i32 0
+ ret i32 %31
+}
+
+define i32 @reduce_and() {
+; CHECK-LABEL: reduce_and
+; CHECK: Detected a reduction operation: {{.*}} and
+; CHECK: Detected a reduction operation: {{.*}} and
+; CHECK: Detected a reduction operation: {{.*}} and
+; CHECK: Detected a reduction operation: {{.*}} and
+; CHECK: Detected a reduction operation: {{.*}} and
+; CHECK: Detected a reduction operation: {{.*}} and
+; CHECK: Detected a reduction operation: {{.*}} and
+; CHECK: Detected a reduction operation: {{.*}} and
+; CHECK: Detected a reduction operation: {{.*}} and
+
+entry:
+ br label %vector.body
+
+vector.body:
+ %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ]
+ %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %6, %vector.body ]
+ %vec.phi9 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %7, %vector.body ]
+ %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+ %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>*
+ %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256
+ %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16
+ %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 257
+ %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16
+ %0 = and <4 x i32> %wide.load, %vec.phi
+ %1 = and <4 x i32> %wide.load10, %vec.phi9
+ %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+ %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>*
+ %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258
+ %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16
+ %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+ %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>*
+ %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259
+ %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16
+ %2 = and <4 x i32> %wide.load.1, %0
+ %3 = and <4 x i32> %wide.load10.1, %1
+ %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+ %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>*
+ %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260
+ %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16
+ %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+ %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>*
+ %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261
+ %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16
+ %4 = and <4 x i32> %wide.load.2, %2
+ %5 = and <4 x i32> %wide.load10.2, %3
+ %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+ %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>*
+ %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262
+ %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16
+ %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+ %uglygep17 = bitcast i8* %uglygep to <4 x i32>*
+ %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263
+ %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16
+ %6 = and <4 x i32> %wide.load.3, %4
+ %7 = and <4 x i32> %wide.load10.3, %5
+ %lsr.iv.next = add nsw i64 %lsr.iv, 128
+ %8 = icmp eq i64 %lsr.iv.next, 0
+ br i1 %8, label %middle.block, label %vector.body
+
+middle.block:
+ %bin.rdx = and <4 x i32> %7, %6
+ %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf
+ %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12
+ %9 = extractelement <4 x i32> %bin.rdx13, i32 0
+ ret i32 %9
+}
+
+define float @reduce_add_float(float* nocapture readonly %a) {
+; CHECK-LABEL: reduce_add_float
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+; CHECK: Detected a reduction operation: {{.*}} fadd fast
+;
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
+ %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
+ %vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
+ %0 = getelementptr inbounds float, float* %a, i64 %index
+ %1 = bitcast float* %0 to <4 x float>*
+ %wide.load = load <4 x float>, <4 x float>* %1, align 4
+ %2 = getelementptr float, float* %0, i64 4
+ %3 = bitcast float* %2 to <4 x float>*
+ %wide.load10 = load <4 x float>, <4 x float>* %3, align 4
+ %4 = fadd fast <4 x float> %wide.load, %vec.phi
+ %5 = fadd fast <4 x float> %wide.load10, %vec.phi9
+ %index.next = add nuw nsw i64 %index, 8
+ %6 = getelementptr inbounds float, float* %a, i64 %index.next
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
+ %8 = getelementptr float, float* %6, i64 4
+ %9 = bitcast float* %8 to <4 x float>*
+ %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
+ %10 = fadd fast <4 x float> %wide.load.1, %4
+ %11 = fadd fast <4 x float> %wide.load10.1, %5
+ %index.next.1 = add nsw i64 %index, 16
+ %12 = getelementptr inbounds float, float* %a, i64 %index.next.1
+ %13 = bitcast float* %12 to <4 x float>*
+ %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
+ %14 = getelementptr float, float* %12, i64 4
+ %15 = bitcast float* %14 to <4 x float>*
+ %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
+ %16 = fadd fast <4 x float> %wide.load.2, %10
+ %17 = fadd fast <4 x float> %wide.load10.2, %11
+ %index.next.2 = add nsw i64 %index, 24
+ %18 = getelementptr inbounds float, float* %a, i64 %index.next.2
+ %19 = bitcast float* %18 to <4 x float>*
+ %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
+ %20 = getelementptr float, float* %18, i64 4
+ %21 = bitcast float* %20 to <4 x float>*
+ %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
+ %22 = fadd fast <4 x float> %wide.load.3, %16
+ %23 = fadd fast <4 x float> %wide.load10.3, %17
+ %index.next.3 = add nsw i64 %index, 32
+ %24 = getelementptr inbounds float, float* %a, i64 %index.next.3
+ %25 = bitcast float* %24 to <4 x float>*
+ %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
+ %26 = getelementptr float, float* %24, i64 4
+ %27 = bitcast float* %26 to <4 x float>*
+ %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
+ %28 = fadd fast <4 x float> %wide.load.4, %22
+ %29 = fadd fast <4 x float> %wide.load10.4, %23
+ %index.next.4 = add nsw i64 %index, 40
+ %30 = icmp eq i64 %index.next.4, 1000
+ br i1 %30, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa15 = phi <4 x float> [ %29, %vector.body ]
+ %.lcssa = phi <4 x float> [ %28, %vector.body ]
+ %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
+ %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
+ %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
+ %31 = extractelement <4 x float> %bin.rdx13, i32 0
+ ret float %31
+}
diff --git a/test/CodeGen/Hexagon/Atomics.ll b/test/CodeGen/Hexagon/Atomics.ll
index bbac5d73c868..cedf9a48754b 100644
--- a/test/CodeGen/Hexagon/Atomics.ll
+++ b/test/CodeGen/Hexagon/Atomics.ll
@@ -69,3 +69,16 @@ entry:
return: ; preds = %entry
ret void
}
+
+
+define i64 @fred() nounwind {
+entry:
+ %s0 = cmpxchg i32* undef, i32 undef, i32 undef seq_cst seq_cst
+ %s1 = extractvalue { i32, i1 } %s0, 0
+ %t0 = cmpxchg i64* undef, i64 undef, i64 undef seq_cst seq_cst
+ %t1 = extractvalue { i64, i1 } %t0, 0
+ %u0 = zext i32 %s1 to i64
+ %u1 = add i64 %u0, %t1
+ ret i64 %u1
+}
+
diff --git a/test/CodeGen/Hexagon/absaddr-store.ll b/test/CodeGen/Hexagon/absaddr-store.ll
index f4e97d22e7d2..dac8607d88db 100644
--- a/test/CodeGen/Hexagon/absaddr-store.ll
+++ b/test/CodeGen/Hexagon/absaddr-store.ll
@@ -1,6 +1,5 @@
; RUN: llc -march=hexagon -hexagon-small-data-threshold=0 < %s | FileCheck %s
; Check that we generate load instructions with absolute addressing mode.
-; XFAIL: *
@a0 = external global i32
@a1 = external global i32
diff --git a/test/CodeGen/Hexagon/adde.ll b/test/CodeGen/Hexagon/adde.ll
index 4a88914dc6cb..43ddb4307ef2 100644
--- a/test/CodeGen/Hexagon/adde.ll
+++ b/test/CodeGen/Hexagon/adde.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -disable-hsdr -hexagon-expand-condsets=0 -hexagon-bit=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -disable-hsdr -hexagon-expand-condsets=0 -hexagon-bit=0 -disable-post-ra < %s | FileCheck %s
; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #1)
; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #0)
diff --git a/test/CodeGen/Hexagon/avoid-predspill-calleesaved.ll b/test/CodeGen/Hexagon/avoid-predspill-calleesaved.ll
new file mode 100644
index 000000000000..561013b174dd
--- /dev/null
+++ b/test/CodeGen/Hexagon/avoid-predspill-calleesaved.ll
@@ -0,0 +1,49 @@
+; Check that a callee-saved register will be saved correctly if
+; the predicate-to-GPR spilling code uses it.
+;
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; We expect to spill p0 into a general-purpose register and keep it there,
+; without adding an extra spill of that register.
+;
+; CHECK: PredSpill:
+; CHECK: memd(r29{{.*}}) = r17:16
+; CHECK-DAG: r{{[0-9]+}} = p0
+; CHECK-DAG: p0 = r{{[0-9]+}}
+; CHECK-NOT: = memw(r29
+;
+
+define void @PredSpill() {
+entry:
+ br i1 undef, label %if.then, label %if.else.14
+
+if.then: ; preds = %entry
+ br i1 undef, label %if.end.57, label %if.else
+
+if.else: ; preds = %if.then
+ unreachable
+
+if.else.14: ; preds = %entry
+ br i1 undef, label %if.then.17, label %if.end.57
+
+if.then.17: ; preds = %if.else.14
+ br i1 undef, label %if.end.57, label %if.then.20
+
+if.then.20: ; preds = %if.then.17
+ %call21 = tail call i32 @myfun()
+ %tobool22 = icmp eq i32 %call21, 0
+ %0 = tail call i32 @myfun()
+ br i1 %tobool22, label %if.else.42, label %if.then.23
+
+if.then.23: ; preds = %if.then.20
+ unreachable
+
+if.else.42: ; preds = %if.then.20
+ ret void
+
+if.end.57: ; preds = %if.then.17, %if.else.14, %if.then
+ ret void
+}
+
+declare i32 @myfun()
+
diff --git a/test/CodeGen/Hexagon/avoid-predspill.ll b/test/CodeGen/Hexagon/avoid-predspill.ll
new file mode 100644
index 000000000000..159c149c4422
--- /dev/null
+++ b/test/CodeGen/Hexagon/avoid-predspill.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=hexagon -O2 < %s | FileCheck %s
+;
+; This checks that predicate registers are moved to GPRs instead of being
+; spilled where possible.
+
+; CHECK: p0 =
+; CHECK-NOT: memw(r29
+
+define i32 @f(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ %cmp1 = icmp eq i32 %b, 2
+ %or.cond = and i1 %cmp, %cmp1
+ %cmp3 = icmp eq i32 %c, 3
+ %or.cond30 = and i1 %or.cond, %cmp3
+ %cmp5 = icmp eq i32 %d, 4
+ %or.cond31 = and i1 %or.cond30, %cmp5
+ %cmp7 = icmp eq i32 %e, 5
+ %or.cond32 = and i1 %or.cond31, %cmp7
+ %ret.0 = zext i1 %or.cond32 to i32
+ %cmp8 = icmp eq i32 %a, 3
+ %cmp10 = icmp eq i32 %b, 4
+ %or.cond33 = and i1 %cmp8, %cmp10
+ %cmp12 = icmp eq i32 %c, 5
+ %or.cond34 = and i1 %or.cond33, %cmp12
+ %cmp14 = icmp eq i32 %d, 6
+ %or.cond35 = and i1 %or.cond34, %cmp14
+ %cmp16 = icmp eq i32 %e, 7
+ %or.cond36 = and i1 %or.cond35, %cmp16
+ %ret.1 = select i1 %or.cond36, i32 2, i32 %ret.0
+ %cmp21 = icmp eq i32 %b, 8
+ %or.cond37 = and i1 %cmp, %cmp21
+ %cmp23 = icmp eq i32 %c, 2
+ %or.cond38 = and i1 %or.cond37, %cmp23
+ %cmp25 = icmp eq i32 %d, 1
+ %or.cond39 = and i1 %or.cond38, %cmp25
+ %cmp27 = icmp eq i32 %e, 3
+ %or.cond40 = and i1 %or.cond39, %cmp27
+ %ret.2 = select i1 %or.cond40, i32 3, i32 %ret.1
+ ret i32 %ret.2
+}
+
diff --git a/test/CodeGen/Hexagon/bit-extractu-half.ll b/test/CodeGen/Hexagon/bit-extractu-half.ll
new file mode 100644
index 000000000000..fec4a02d9269
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-extractu-half.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Pick lsr (in bit-simplification) for extracting high halfword.
+; CHECK: lsr{{.*}}#16
+
+define i32 @foo(i32 %x) #0 {
+ %a = call i32 @llvm.hexagon.S2.extractu(i32 %x, i32 16, i32 16)
+ ret i32 %a
+}
+
+declare i32 @llvm.hexagon.S2.extractu(i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/Hexagon/bitconvert-vector.ll b/test/CodeGen/Hexagon/bitconvert-vector.ll
new file mode 100644
index 000000000000..c090721b8fff
--- /dev/null
+++ b/test/CodeGen/Hexagon/bitconvert-vector.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; This testcase would fail on a bitcast from v64i16 to v32i32. Check that
+; it compiles without errors.
+; CHECK: valign
+; CHECK: vshuff
+
+target triple = "hexagon"
+
+declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>) #0
+declare <64 x i32> @llvm.hexagon.V6.vshuffvdd.128B(<32 x i32>, <32 x i32>, i32) #0
+declare <32 x i32> @llvm.hexagon.V6.valignbi.128B(<32 x i32>, <32 x i32>, i32) #0
+
+define void @fred() #1 {
+entry:
+ %t0 = bitcast <64 x i16> zeroinitializer to <32 x i32>
+ %t1 = tail call <32 x i32> @llvm.hexagon.V6.valignbi.128B(<32 x i32> %t0, <32 x i32> undef, i32 2)
+ %t2 = tail call <64 x i32> @llvm.hexagon.V6.vshuffvdd.128B(<32 x i32> undef, <32 x i32> %t1, i32 -2)
+ %t3 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %t2)
+ store <64 x i16> zeroinitializer, <64 x i16>* undef, align 128
+ store <32 x i32> %t3, <32 x i32>* undef, align 128
+ unreachable
+}
+
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" }
diff --git a/test/CodeGen/Hexagon/block-addr.ll b/test/CodeGen/Hexagon/block-addr.ll
index eda167a67f28..420af2fee1c9 100644
--- a/test/CodeGen/Hexagon/block-addr.ll
+++ b/test/CodeGen/Hexagon/block-addr.ll
@@ -3,7 +3,7 @@
; Allow combine(..##JTI..):
; CHECK: r{{[0-9]+}}{{.*}} = {{.*}}#.LJTI
; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+<<#[0-9]+}})
-; CHECK: jumpr r{{[0-9]+}}
+; CHECK: jumpr:nt r{{[0-9]+}}
define void @main() #0 {
entry:
diff --git a/test/CodeGen/Hexagon/block-ranges-nodef.ll b/test/CodeGen/Hexagon/block-ranges-nodef.ll
new file mode 100644
index 000000000000..aaa365243327
--- /dev/null
+++ b/test/CodeGen/Hexagon/block-ranges-nodef.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+target triple = "hexagon"
+
+declare void @foo() #0
+
+define hidden fastcc void @fred(i32 %a, i64 %b, i64 %c) unnamed_addr #1 {
+entry:
+ %cmp17 = icmp ne i64 %c, 0
+ %conv19 = zext i1 %cmp17 to i64
+ %or = or i64 %conv19, %b
+ store i64 %or, i64* undef, align 8
+ br i1 undef, label %if.then44, label %if.end96
+
+if.then44: ; preds = %entry
+ br i1 undef, label %overflow, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %if.then44
+ br i1 undef, label %overflow, label %if.end52
+
+if.end52: ; preds = %lor.lhs.false
+ br i1 undef, label %if.then55, label %if.end96
+
+if.then55: ; preds = %if.end52
+ %cmp60 = icmp slt i32 %a, 0
+ %or.cond = or i1 %cmp60, false
+ %cmp63 = icmp ule i64 %or, undef
+ %.cmp63 = or i1 %cmp63, %or.cond
+ call void @foo()
+ %or.cond299 = and i1 %.cmp63, undef
+ br i1 %or.cond299, label %if.then72, label %if.end73
+
+if.then72: ; preds = %if.then55
+ unreachable
+
+if.end73: ; preds = %if.then55
+ unreachable
+
+if.end96: ; preds = %if.end52, %entry
+ br i1 undef, label %if.end102, label %if.then98
+
+if.then98: ; preds = %if.end96
+ br label %if.end102
+
+if.end102: ; preds = %if.then98, %if.end96
+ unreachable
+
+overflow: ; preds = %lor.lhs.false, %if.then44
+ ret void
+}
+
+attributes #0 = { noinline norecurse nounwind }
+attributes #1 = { noinline nounwind }
+
diff --git a/test/CodeGen/Hexagon/branch-non-mbb.ll b/test/CodeGen/Hexagon/branch-non-mbb.ll
new file mode 100644
index 000000000000..e86ca2ed4023
--- /dev/null
+++ b/test/CodeGen/Hexagon/branch-non-mbb.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+;
+; The tail calls to foo and bar are branches with functions as operands,
+; instead of basic blocks. Make sure we don't crash on such instructions.
+
+target datalayout = "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32"
+target triple = "hexagon"
+
+%struct.t0 = type { i8, [2 x i8] }
+%struct.t1 = type { i8, i8, [1900 x i8], %struct.t0 }
+
+@var = internal global [3 x %struct.t1] zeroinitializer, align 8
+declare void @foo() #2
+declare void @bar(i32, i32) #2
+
+; Function Attrs: nounwind
+define void @fred(i8 signext %a, i8 signext %b) #1 {
+entry:
+ %i = sext i8 %a to i32
+ %t = getelementptr inbounds [3 x %struct.t1], [3 x %struct.t1]* @var, i32 0, i32 %i, i32 3, i32 0
+ %0 = load i8, i8* %t, align 8
+ switch i8 %0, label %if.end14 [
+ i8 1, label %if.then
+ i8 0, label %do.body
+ ]
+
+if.then: ; preds = %entry
+ %j = sext i8 %b to i32
+ %u = getelementptr inbounds [3 x %struct.t1], [3 x %struct.t1]* @var, i32 0, i32 %i, i32 3, i32 1, i32 %j
+ store i8 1, i8* %u, align 1
+ tail call void @foo() #0
+ br label %if.end14
+
+do.body: ; preds = %entry
+ %conv11 = sext i8 %b to i32
+ tail call void @bar(i32 %i, i32 %conv11) #0
+ br label %if.end14
+
+if.end14: ; preds = %entry, %do.body, %if.then
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "disable-tail-calls"="false" }
+attributes #2 = { "disable-tail-calls"="false" }
diff --git a/test/CodeGen/Hexagon/brev_ld.ll b/test/CodeGen/Hexagon/brev_ld.ll
index 12edb4c2b8f7..a2914296ec41 100644
--- a/test/CodeGen/Hexagon/brev_ld.ll
+++ b/test/CodeGen/Hexagon/brev_ld.ll
@@ -29,9 +29,9 @@ entry:
%1 = bitcast i64* %inputLR to i8*
%sub = sub i32 13, %shr1
%shl = shl i32 1, %sub
-; CHECK: memd(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: = memd(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
%2 = call i8* @llvm.hexagon.brev.ldd(i8* %0, i8* %1, i32 %shl)
- %3 = bitcast i8* %2 to i64*
+ %3 = bitcast i8* %1 to i64*
%4 = load i64, i64* %3, align 8, !tbaa !0
ret i64 %4
}
@@ -49,9 +49,9 @@ entry:
%1 = bitcast i32* %inputLR to i8*
%sub = sub i32 14, %shr1
%shl = shl i32 1, %sub
-; CHECK: memw(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: = memw(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
%2 = call i8* @llvm.hexagon.brev.ldw(i8* %0, i8* %1, i32 %shl)
- %3 = bitcast i8* %2 to i32*
+ %3 = bitcast i8* %1 to i32*
%4 = load i32, i32* %3, align 4, !tbaa !2
ret i32 %4
}
@@ -69,9 +69,9 @@ entry:
%1 = bitcast i16* %inputLR to i8*
%sub = sub i32 15, %shr1
%shl = shl i32 1, %sub
-; CHECK: memh(r{{[0-9]*}} ++ m0:brev)
+; CHECK: = memh(r{{[0-9]*}} ++ m0:brev)
%2 = call i8* @llvm.hexagon.brev.ldh(i8* %0, i8* %1, i32 %shl)
- %3 = bitcast i8* %2 to i16*
+ %3 = bitcast i8* %1 to i16*
%4 = load i16, i16* %3, align 2, !tbaa !3
ret i16 %4
}
@@ -89,9 +89,9 @@ entry:
%1 = bitcast i16* %inputLR to i8*
%sub = sub i32 15, %shr1
%shl = shl i32 1, %sub
-; CHECK: memuh(r{{[0-9]*}} ++ m0:brev)
+; CHECK: = memuh(r{{[0-9]*}} ++ m0:brev)
%2 = call i8* @llvm.hexagon.brev.lduh(i8* %0, i8* %1, i32 %shl)
- %3 = bitcast i8* %2 to i16*
+ %3 = bitcast i8* %1 to i16*
%4 = load i16, i16* %3, align 2, !tbaa !3
ret i16 %4
}
@@ -108,15 +108,15 @@ entry:
%0 = bitcast i16* %arrayidx to i8*
%sub = sub nsw i32 16, %shr1
%shl = shl i32 1, %sub
-; CHECK: memub(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: = memub(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
%1 = call i8* @llvm.hexagon.brev.ldub(i8* %0, i8* %inputLR, i32 %shl)
- %2 = load i8, i8* %1, align 1, !tbaa !0
+ %2 = load i8, i8* %inputLR, align 1, !tbaa !0
ret i8 %2
}
declare i8* @llvm.hexagon.brev.ldub(i8*, i8*, i32) nounwind
-define zeroext i8 @foo5(i16 zeroext %filtMemLen, i16* %filtMemLR, i16 signext %filtMemIndex) nounwind {
+define signext i8 @foo5(i16 zeroext %filtMemLen, i16* %filtMemLR, i16 signext %filtMemIndex) nounwind {
entry:
%inputLR = alloca i8, align 1
%conv = zext i16 %filtMemLen to i32
@@ -126,9 +126,9 @@ entry:
%0 = bitcast i16* %arrayidx to i8*
%sub = sub nsw i32 16, %shr1
%shl = shl i32 1, %sub
-; CHECK: memb(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: = memb(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
%1 = call i8* @llvm.hexagon.brev.ldb(i8* %0, i8* %inputLR, i32 %shl)
- %2 = load i8, i8* %1, align 1, !tbaa !0
+ %2 = load i8, i8* %inputLR, align 1, !tbaa !0
ret i8 %2
}
diff --git a/test/CodeGen/Hexagon/brev_st.ll b/test/CodeGen/Hexagon/brev_st.ll
index b80579185317..6c55681a683b 100644
--- a/test/CodeGen/Hexagon/brev_st.ll
+++ b/test/CodeGen/Hexagon/brev_st.ll
@@ -28,9 +28,7 @@ entry:
%shl = shl i32 1, %sub
; CHECK: memd(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
%1 = tail call i8* @llvm.hexagon.brev.std(i8* %0, i64 undef, i32 %shl)
- %2 = bitcast i8* %1 to i64*
- %3 = load i64, i64* %2, align 8, !tbaa !0
- ret i64 %3
+ ret i64 0
}
declare i8* @llvm.hexagon.brev.std(i8*, i64, i32) nounwind
@@ -46,9 +44,7 @@ entry:
%shl = shl i32 1, %sub
; CHECK: memw(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
%1 = tail call i8* @llvm.hexagon.brev.stw(i8* %0, i32 undef, i32 %shl)
- %2 = bitcast i8* %1 to i32*
- %3 = load i32, i32* %2, align 4, !tbaa !2
- ret i32 %3
+ ret i32 0
}
declare i8* @llvm.hexagon.brev.stw(i8*, i32, i32) nounwind
@@ -64,9 +60,7 @@ entry:
%shl = shl i32 1, %sub
; CHECK: memh(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
%1 = tail call i8* @llvm.hexagon.brev.sth(i8* %0, i32 0, i32 %shl)
- %2 = bitcast i8* %1 to i16*
- %3 = load i16, i16* %2, align 2, !tbaa !3
- ret i16 %3
+ ret i16 0
}
declare i8* @llvm.hexagon.brev.sth(i8*, i32, i32) nounwind
@@ -82,9 +76,7 @@ entry:
%shl = shl i32 1, %sub
; CHECK: memh(r{{[0-9]*}} ++ m{{[0-1]}}:brev){{ *}}={{ *}}r{{[0-9]*}}.h
%1 = tail call i8* @llvm.hexagon.brev.sthhi(i8* %0, i32 0, i32 %shl)
- %2 = bitcast i8* %1 to i16*
- %3 = load i16, i16* %2, align 2, !tbaa !3
- ret i16 %3
+ ret i16 0
}
declare i8* @llvm.hexagon.brev.sthhi(i8*, i32, i32) nounwind
@@ -100,8 +92,7 @@ entry:
; CHECK: memb(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
%shl = shl i32 1, %sub
%1 = tail call i8* @llvm.hexagon.brev.stb(i8* %0, i32 0, i32 %shl)
- %2 = load i8, i8* %1, align 1, !tbaa !0
- ret i8 %2
+ ret i8 0
}
declare i8* @llvm.hexagon.brev.stb(i8*, i32, i32) nounwind
diff --git a/test/CodeGen/Hexagon/builtin-prefetch-offset.ll b/test/CodeGen/Hexagon/builtin-prefetch-offset.ll
new file mode 100644
index 000000000000..b542308abd3e
--- /dev/null
+++ b/test/CodeGen/Hexagon/builtin-prefetch-offset.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Check for the immediate offset. It must be a multiple of 8.
+; CHECK: dcfetch({{.*}}+{{ *}}#8)
+; In 6.2 (which supports v4+ only), we generate indexed dcfetch in all cases
+; (unlike in 6.1, which supported v2, where dcfetch did not allow an immediate
+; offset).
+; For expression %2, where the offset is +9, the offset on dcfetch should be
+; a multiple of 8, and an offset of 0 is most likely (although not the only
+; possible one). Check for #0 anyway; if the test fails with a false
+; positive, the second check can be eliminated or rewritten, and in the
+; meantime it can help catch real problems.
+; CHECK: dcfetch({{.*}}+{{ *}}#0)
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+define void @foo(i8* %addr) nounwind {
+entry:
+ %addr.addr = alloca i8*, align 4
+ store i8* %addr, i8** %addr.addr, align 4
+ %0 = load i8*, i8** %addr.addr, align 4
+ %1 = getelementptr i8, i8* %0, i32 8
+ call void @llvm.prefetch(i8* %1, i32 0, i32 3, i32 1)
+ %2 = getelementptr i8, i8* %0, i32 9
+ call void @llvm.prefetch(i8* %2, i32 0, i32 3, i32 1)
+ ret void
+}
+
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind
diff --git a/test/CodeGen/Hexagon/builtin-prefetch.ll b/test/CodeGen/Hexagon/builtin-prefetch.ll
new file mode 100644
index 000000000000..ae236645b282
--- /dev/null
+++ b/test/CodeGen/Hexagon/builtin-prefetch.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: dcfetch
+; CHECK: dcfetch{{.*}}#8
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define zeroext i8 @foo(i8* %addr) #0 {
+entry:
+ %addr.addr = alloca i8*, align 4
+ store i8* %addr, i8** %addr.addr, align 4
+ %0 = load i8*, i8** %addr.addr, align 4
+ call void @llvm.prefetch(i8* %0, i32 0, i32 3, i32 1)
+ %1 = load i8*, i8** %addr.addr, align 4
+ %2 = bitcast i8* %1 to i32*
+ %3 = load i32, i32* %2, align 4
+ %4 = add i32 %3, 8
+ %5 = inttoptr i32 %4 to i8*
+ call void @llvm.hexagon.prefetch(i8* %5)
+ %6 = load i8, i8* %5
+ ret i8 %6
+}
+
+; Function Attrs: nounwind
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) #1
+declare void @llvm.hexagon.prefetch(i8* nocapture) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/Hexagon/callr-dep-edge.ll b/test/CodeGen/Hexagon/callr-dep-edge.ll
new file mode 100644
index 000000000000..d2c6ae4df62a
--- /dev/null
+++ b/test/CodeGen/Hexagon/callr-dep-edge.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Check that the callr and the load into r0 are not packetized together.
+
+target triple = "hexagon"
+
+@fp = common global i32 (...)* null, align 4
+
+; CHECK: r0 = memw
+; CHECK: {
+; CHECK: callr r0
+
+; Function Attrs: nounwind
+define i32 @foo() #0 {
+entry:
+ %0 = load i32 ()*, i32 ()** bitcast (i32 (...)** @fp to i32 ()**), align 4
+ %call = tail call i32 %0() #0
+ ret i32 %call
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/Hexagon/cext-check.ll b/test/CodeGen/Hexagon/cext-check.ll
index 19b91c5245b2..46e816d15e5f 100644
--- a/test/CodeGen/Hexagon/cext-check.ll
+++ b/test/CodeGen/Hexagon/cext-check.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-eif=0 -ifcvt-limit=0 < %s | FileCheck %s
; Check that we constant-extend instructions only when necessary.
define i32 @cext_test1(i32* %a) nounwind {
diff --git a/test/CodeGen/Hexagon/cfi-late.ll b/test/CodeGen/Hexagon/cfi-late.ll
index ce38711ae8d7..d24732929adf 100644
--- a/test/CodeGen/Hexagon/cfi-late.ll
+++ b/test/CodeGen/Hexagon/cfi-late.ll
@@ -41,11 +41,10 @@ attributes #3 = { nounwind }
!llvm.module.flags = !{!11, !12}
!llvm.ident = !{!13}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (http://llvm.org/git/clang.git 15506a21305e212c406f980ed9b6b1bac785df56)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (http://llvm.org/git/clang.git 15506a21305e212c406f980ed9b6b1bac785df56)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "cfi-late.c", directory: "/test")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, variables: !8)
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8)
!5 = !DISubroutineType(types: !6)
!6 = !{!7, !7, !7}
!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/Hexagon/cfi-offset.ll b/test/CodeGen/Hexagon/cfi-offset.ll
new file mode 100644
index 000000000000..100034a0c6c4
--- /dev/null
+++ b/test/CodeGen/Hexagon/cfi-offset.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Check that all the offsets in the .cfi_offset instructions are negative.
+; They are all based on R30+8 which points to the pair FP/LR stored by an
+; allocframe. Since the stack grows towards negative addresses, anything
+; in the current stack frame will have a negative offset with respect to
+; R30+8.
+
+; CHECK: cfi_def_cfa r30
+; CHECK-NOT: .cfi_offset r{{[0-9]+}}, {{[^-]}}
+
+target triple = "hexagon"
+
+define i64 @_Z3fooxxx(i64 %x, i64 %y, i64 %z) #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ %call = invoke i64 @_Z3barxxx(i64 %x, i64 %y, i64 %z)
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1) #1
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont: ; preds = %entry, %lpad
+ %a.0 = phi i64 [ 0, %lpad ], [ %call, %entry ]
+ %mul = mul nsw i64 %y, %x
+ %sub = sub i64 %mul, %z
+ %add = add nsw i64 %sub, %a.0
+ ret i64 %add
+}
+
+declare i64 @_Z3barxxx(i64, i64, i64) #0
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/Hexagon/circ-load-isel.ll b/test/CodeGen/Hexagon/circ-load-isel.ll
new file mode 100644
index 000000000000..576fbdf53cfc
--- /dev/null
+++ b/test/CodeGen/Hexagon/circ-load-isel.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: = memw{{.*}}circ
+
+target triple = "hexagon"
+
+@l = external global i32, align 4
+
+; Function Attrs: nounwind optsize
+define void @circ2() #0 {
+entry:
+ store i32 0, i32* @l, align 4
+ %0 = tail call i8* @llvm.hexagon.circ.ldw(i8* undef, i8* undef, i32 150995968, i32 4)
+ unreachable
+}
+
+declare i8* @llvm.hexagon.circ.ldw(i8*, i8*, i32, i32) #1
+attributes #0 = { nounwind optsize }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/Hexagon/circ_ld.ll b/test/CodeGen/Hexagon/circ_ld.ll
index 6d372403ca7a..ffa5f2cd2220 100644
--- a/test/CodeGen/Hexagon/circ_ld.ll
+++ b/test/CodeGen/Hexagon/circ_ld.ll
@@ -17,7 +17,7 @@
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
target triple = "hexagon"
-define zeroext i8 @foo1(i16 zeroext %filtMemLen, i16* %filtMemLR, i16 signext %filtMemIndex) nounwind {
+define signext i8 @foo1(i16 zeroext %filtMemLen, i16* %filtMemLR, i16 signext %filtMemIndex) nounwind {
entry:
%inputLR = alloca i8, align 1
%conv = zext i16 %filtMemLen to i32
@@ -26,9 +26,9 @@ entry:
%arrayidx = getelementptr inbounds i16, i16* %filtMemLR, i32 %idxprom
%0 = bitcast i16* %arrayidx to i8*
%or = or i32 %shr1, 33554432
-; CHECK: memb(r{{[0-9]*.}}++{{.}}#-1:circ(m{{[0-1]}}))
+; CHECK: = memb(r{{[0-9]*.}}++{{.}}#-1:circ(m{{[0-1]}}))
%1 = call i8* @llvm.hexagon.circ.ldb(i8* %0, i8* %inputLR, i32 %or, i32 -1)
- %2 = load i8, i8* %1, align 1, !tbaa !0
+ %2 = load i8, i8* %inputLR, align 1, !tbaa !0
ret i8 %2
}
@@ -45,9 +45,9 @@ entry:
%1 = bitcast i64* %inputLR to i8*
%shl = shl nuw nsw i32 %shr1, 3
%or = or i32 %shl, 83886080
-; CHECK: memd(r{{[0-9]*.}}++{{.}}#-8:circ(m{{[0-1]}}))
+; CHECK: = memd(r{{[0-9]*.}}++{{.}}#-8:circ(m{{[0-1]}}))
%2 = call i8* @llvm.hexagon.circ.ldd(i8* %0, i8* %1, i32 %or, i32 -8)
- %3 = bitcast i8* %2 to i64*
+ %3 = bitcast i8* %1 to i64*
%4 = load i64, i64* %3, align 8, !tbaa !0
ret i64 %4
}
@@ -64,9 +64,9 @@ entry:
%0 = bitcast i16* %arrayidx to i8*
%1 = bitcast i16* %inputLR to i8*
%or = or i32 %shr1, 50331648
-; CHECK: memh(r{{[0-9]*.}}++{{.}}#-2:circ(m{{[0-1]}}))
+; CHECK: = memh(r{{[0-9]*.}}++{{.}}#-2:circ(m{{[0-1]}}))
%2 = call i8* @llvm.hexagon.circ.ldh(i8* %0, i8* %1, i32 %or, i32 -2)
- %3 = bitcast i8* %2 to i16*
+ %3 = bitcast i8* %1 to i16*
%4 = load i16, i16* %3, align 2, !tbaa !2
ret i16 %4
}
@@ -82,9 +82,9 @@ entry:
%arrayidx = getelementptr inbounds i16, i16* %filtMemLR, i32 %idxprom
%0 = bitcast i16* %arrayidx to i8*
%or = or i32 %shr1, 33554432
-; CHECK: memub(r{{[0-9]*.}}++{{.}}#-1:circ(m{{[0-1]}}))
+; CHECK: = memub(r{{[0-9]*.}}++{{.}}#-1:circ(m{{[0-1]}}))
%1 = call i8* @llvm.hexagon.circ.ldub(i8* %0, i8* %inputLR, i32 %or, i32 -1)
- %2 = load i8, i8* %1, align 1, !tbaa !0
+ %2 = load i8, i8* %inputLR, align 1, !tbaa !0
ret i8 %2
}
@@ -100,9 +100,9 @@ entry:
%0 = bitcast i16* %arrayidx to i8*
%1 = bitcast i16* %inputLR to i8*
%or = or i32 %shr1, 50331648
-; CHECK: memuh(r{{[0-9]*.}}++{{.}}#-2:circ(m{{[0-1]}}))
+; CHECK: = memuh(r{{[0-9]*.}}++{{.}}#-2:circ(m{{[0-1]}}))
%2 = call i8* @llvm.hexagon.circ.lduh(i8* %0, i8* %1, i32 %or, i32 -2)
- %3 = bitcast i8* %2 to i16*
+ %3 = bitcast i8* %1 to i16*
%4 = load i16, i16* %3, align 2, !tbaa !2
ret i16 %4
}
@@ -120,9 +120,9 @@ entry:
%1 = bitcast i32* %inputLR to i8*
%shl = shl nuw nsw i32 %shr1, 2
%or = or i32 %shl, 67108864
-; CHECK: memw(r{{[0-9]*.}}++{{.}}#-4:circ(m{{[0-1]}}))
+; CHECK: = memw(r{{[0-9]*.}}++{{.}}#-4:circ(m{{[0-1]}}))
%2 = call i8* @llvm.hexagon.circ.ldw(i8* %0, i8* %1, i32 %or, i32 -4)
- %3 = bitcast i8* %2 to i32*
+ %3 = bitcast i8* %1 to i32*
%4 = load i32, i32* %3, align 4, !tbaa !3
ret i32 %4
}
diff --git a/test/CodeGen/Hexagon/circ_st.ll b/test/CodeGen/Hexagon/circ_st.ll
index 244ca3bae714..4b54afbc611d 100644
--- a/test/CodeGen/Hexagon/circ_st.ll
+++ b/test/CodeGen/Hexagon/circ_st.ll
@@ -12,7 +12,6 @@
; memh(r1++#-2:circ(m0)) = r3.h
; memw(r1++#-4:circ(m0)) = r0
-; ModuleID = 'circ_st.i'
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
target triple = "hexagon"
@@ -26,8 +25,7 @@ entry:
%or = or i32 %shr2, 33554432
; CHECK: memb(r{{[0-9]*}}{{.}}++{{.}}#-1:circ(m{{[0-1]}}))
%1 = tail call i8* @llvm.hexagon.circ.stb(i8* %0, i32 0, i32 %or, i32 -1)
- %2 = load i8, i8* %1, align 1, !tbaa !0
- ret i8 %2
+ ret i8 0
}
declare i8* @llvm.hexagon.circ.stb(i8*, i32, i32, i32) nounwind
@@ -43,9 +41,7 @@ entry:
%or = or i32 %shl, 83886080
; CHECK: memd(r{{[0-9]*}}{{.}}++{{.}}#-8:circ(m{{[0-1]}}))
%1 = tail call i8* @llvm.hexagon.circ.std(i8* %0, i64 undef, i32 %or, i32 -8)
- %2 = bitcast i8* %1 to i64*
- %3 = load i64, i64* %2, align 8, !tbaa !0
- ret i64 %3
+ ret i64 0
}
declare i8* @llvm.hexagon.circ.std(i8*, i64, i32, i32) nounwind
@@ -60,9 +56,7 @@ entry:
%or = or i32 %shr2, 50331648
; CHECK: memh(r{{[0-9]*}}{{.}}++{{.}}#-2:circ(m{{[0-1]}}))
%1 = tail call i8* @llvm.hexagon.circ.sth(i8* %0, i32 0, i32 %or, i32 -2)
- %2 = bitcast i8* %1 to i16*
- %3 = load i16, i16* %2, align 2, !tbaa !2
- ret i16 %3
+ ret i16 0
}
declare i8* @llvm.hexagon.circ.sth(i8*, i32, i32, i32) nounwind
@@ -77,9 +71,7 @@ entry:
%or = or i32 %shr2, 50331648
; CHECK: memh(r{{[0-9]*}}{{.}}++{{.}}#-2:circ(m{{[0-1]}})){{ *}}={{ *}}r{{[0-9]*}}.h
%1 = tail call i8* @llvm.hexagon.circ.sthhi(i8* %0, i32 0, i32 %or, i32 -2)
- %2 = bitcast i8* %1 to i16*
- %3 = load i16, i16* %2, align 2, !tbaa !2
- ret i16 %3
+ ret i16 0
}
declare i8* @llvm.hexagon.circ.sthhi(i8*, i32, i32, i32) nounwind
@@ -95,9 +87,7 @@ entry:
%or = or i32 %shl, 67108864
; CHECK: memw(r{{[0-9]*}}{{.}}++{{.}}#-4:circ(m{{[0-1]}}))
%1 = tail call i8* @llvm.hexagon.circ.stw(i8* %0, i32 undef, i32 %or, i32 -4)
- %2 = bitcast i8* %1 to i32*
- %3 = load i32, i32* %2, align 4, !tbaa !3
- ret i32 %3
+ ret i32 0
}
declare i8* @llvm.hexagon.circ.stw(i8*, i32, i32, i32) nounwind
diff --git a/test/CodeGen/Hexagon/clr_set_toggle.ll b/test/CodeGen/Hexagon/clr_set_toggle.ll
index 4e90f3d99a1e..19e3ed0cf897 100644
--- a/test/CodeGen/Hexagon/clr_set_toggle.ll
+++ b/test/CodeGen/Hexagon/clr_set_toggle.ll
@@ -3,6 +3,7 @@
define i32 @my_clrbit(i32 %x) nounwind {
entry:
+; CHECK-LABEL: my_clrbit
; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #31)
%x.addr = alloca i32, align 4
store i32 %x, i32* %x.addr, align 4
@@ -13,6 +14,7 @@ entry:
define i64 @my_clrbit2(i64 %x) nounwind {
entry:
+; CHECK-LABEL: my_clrbit2
; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #31)
%x.addr = alloca i64, align 8
store i64 %x, i64* %x.addr, align 8
@@ -23,6 +25,7 @@ entry:
define i64 @my_clrbit3(i64 %x) nounwind {
entry:
+; CHECK-LABEL: my_clrbit3
; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #31)
%x.addr = alloca i64, align 8
store i64 %x, i64* %x.addr, align 8
@@ -33,6 +36,7 @@ entry:
define i32 @my_clrbit4(i32 %x) nounwind {
entry:
+; CHECK-LABEL: my_clrbit4
; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #13)
%x.addr = alloca i32, align 4
store i32 %x, i32* %x.addr, align 4
@@ -43,6 +47,7 @@ entry:
define i64 @my_clrbit5(i64 %x) nounwind {
entry:
+; CHECK-LABEL: my_clrbit5
; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #13)
%x.addr = alloca i64, align 8
store i64 %x, i64* %x.addr, align 8
@@ -53,6 +58,7 @@ entry:
define i64 @my_clrbit6(i64 %x) nounwind {
entry:
+; CHECK-LABEL: my_clrbit6
; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #27)
%x.addr = alloca i64, align 8
store i64 %x, i64* %x.addr, align 8
@@ -63,7 +69,8 @@ entry:
define zeroext i16 @my_setbit(i16 zeroext %crc) nounwind {
entry:
-; CHECK: memh(r{{[0-9]+}}+#0){{ *}}={{ *}}setbit(#15)
+; CHECK-LABEL: my_setbit
+; CHECK: memh(r{{[0-9]+}}+#{{[0-9]+}}){{ *}}={{ *}}setbit(#15)
%crc.addr = alloca i16, align 2
store i16 %crc, i16* %crc.addr, align 2
%0 = load i16, i16* %crc.addr, align 2
@@ -77,6 +84,7 @@ entry:
define i32 @my_setbit2(i32 %x) nounwind {
entry:
+; CHECK-LABEL: my_setbit2
; CHECK: r{{[0-9]+}}{{ *}}={{ *}}setbit(r{{[0-9]+}}, #15)
%x.addr = alloca i32, align 4
store i32 %x, i32* %x.addr, align 4
@@ -87,6 +95,7 @@ entry:
define i64 @my_setbit3(i64 %x) nounwind {
entry:
+; CHECK-LABEL: my_setbit3
; CHECK: r{{[0-9]+}}{{ *}}={{ *}}setbit(r{{[0-9]+}}, #15)
%x.addr = alloca i64, align 8
store i64 %x, i64* %x.addr, align 8
@@ -97,6 +106,7 @@ entry:
define i32 @my_setbit4(i32 %x) nounwind {
entry:
+; CHECK-LABEL: my_setbit4
; CHECK: r{{[0-9]+}}{{ *}}={{ *}}setbit(r{{[0-9]+}}, #31)
%x.addr = alloca i32, align 4
store i32 %x, i32* %x.addr, align 4
@@ -107,6 +117,7 @@ entry:
define i64 @my_setbit5(i64 %x) nounwind {
entry:
+; CHECK-LABEL: my_setbit5
; CHECK: r{{[0-9]+}}{{ *}}={{ *}}setbit(r{{[0-9]+}}, #13)
%x.addr = alloca i64, align 8
store i64 %x, i64* %x.addr, align 8
@@ -117,6 +128,7 @@ entry:
define zeroext i16 @my_togglebit(i16 zeroext %crc) nounwind {
entry:
+; CHECK-LABEL: my_togglebit
; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #15)
%crc.addr = alloca i16, align 2
store i16 %crc, i16* %crc.addr, align 2
@@ -131,6 +143,7 @@ entry:
define i32 @my_togglebit2(i32 %x) nounwind {
entry:
+; CHECK-LABEL: my_togglebit2
; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #15)
%x.addr = alloca i32, align 4
store i32 %x, i32* %x.addr, align 4
@@ -141,6 +154,7 @@ entry:
define i64 @my_togglebit3(i64 %x) nounwind {
entry:
+; CHECK-LABEL: my_togglebit3
; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #15)
%x.addr = alloca i64, align 8
store i64 %x, i64* %x.addr, align 8
@@ -151,6 +165,7 @@ entry:
define i64 @my_togglebit4(i64 %x) nounwind {
entry:
+; CHECK-LABEL: my_togglebit4
; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #20)
%x.addr = alloca i64, align 8
store i64 %x, i64* %x.addr, align 8
diff --git a/test/CodeGen/Hexagon/const64.ll b/test/CodeGen/Hexagon/const64.ll
new file mode 100644
index 000000000000..018157d97024
--- /dev/null
+++ b/test/CodeGen/Hexagon/const64.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon -disable-const64=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -disable-const64=1 < %s | FileCheck %s --check-prefix=CHECKOLD
+
+; CHECK: CONST64
+; CHECKOLD-NOT: CONST64
+
+target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32"
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define void @foo() optsize {
+entry:
+ call void @bar(i32 32768, i32 32768, i8 zeroext 1)
+ ret void
+}
+
+declare void @bar(i32, i32, i8 zeroext)
+
diff --git a/test/CodeGen/Hexagon/csr-func-usedef.ll b/test/CodeGen/Hexagon/csr-func-usedef.ll
new file mode 100644
index 000000000000..a9f81b9f521a
--- /dev/null
+++ b/test/CodeGen/Hexagon/csr-func-usedef.ll
@@ -0,0 +1,72 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+target triple = "hexagon"
+
+declare i8* @llvm.hexagon.circ.ldb(i8*, i8*, i32, i32) #1
+declare i8* @llvm.hexagon.circ.stb(i8*, i32, i32, i32) #1
+
+define zeroext i8 @circular_loop_test10(i8* %A, i8* %B, i32 %x, i32 %y, i32 %z, i32 %w) #0 {
+entry:
+ %element_load0 = alloca i8, align 1
+ %element_load2 = alloca i8, align 1
+ %element_load3 = alloca i8, align 1
+ %element_load5 = alloca i8, align 1
+ %or = or i32 %x, 100663296
+ %or5 = or i32 %z, 100663296
+ %or7 = or i32 %w, 100663296
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %p0.082 = phi i8* [ %A, %entry ], [ undef, %for.body ]
+ %element_load.080 = phi i32 [ 0, %entry ], [ %add18, %for.body ]
+ %p1.079 = phi i8* [ %B, %entry ], [ %1, %for.body ]
+ %p2.078 = phi i8* [ undef, %entry ], [ %3, %for.body ]
+ %p3.077 = phi i8* [ undef, %entry ], [ %4, %for.body ]
+ %0 = call i8* @llvm.hexagon.circ.ldb(i8* %p0.082, i8* nonnull %element_load0, i32 %or, i32 2)
+ %1 = call i8* @llvm.hexagon.circ.ldb(i8* %p1.079, i8* nonnull null, i32 0, i32 1)
+ %2 = call i8* @llvm.hexagon.circ.ldb(i8* %p2.078, i8* nonnull %element_load2, i32 %or5, i32 3)
+ %3 = call i8* @llvm.hexagon.circ.ldb(i8* %2, i8* nonnull %element_load5, i32 %or5, i32 1)
+ %4 = call i8* @llvm.hexagon.circ.ldb(i8* %p3.077, i8* nonnull %element_load3, i32 %or7, i32 1)
+ %5 = load i8, i8* null, align 1
+ %conv = zext i8 %5 to i32
+ %6 = load i8, i8* %element_load2, align 1
+ %conv8 = zext i8 %6 to i32
+ %7 = load i8, i8* %element_load3, align 1
+ %conv9 = zext i8 %7 to i32
+ %8 = load i8, i8* undef, align 1
+ %conv11 = zext i8 %8 to i32
+ %9 = load i8, i8* %element_load5, align 1
+ %conv13 = zext i8 %9 to i32
+ %10 = load i8, i8* %element_load0, align 1
+ %conv15 = zext i8 %10 to i32
+ %conv17 = and i32 %element_load.080, 255
+ %add = add nuw nsw i32 %conv, %conv17
+ %add10 = add nuw nsw i32 %add, %conv8
+ %add12 = add nuw nsw i32 %add10, %conv9
+ %add14 = add nuw nsw i32 %add12, %conv11
+ %add16 = add nuw nsw i32 %add14, %conv13
+ %add18 = add nuw nsw i32 %add16, %conv15
+ %exitcond84 = icmp eq i32 undef, 200
+ br i1 %exitcond84, label %for.body23, label %for.body
+
+for.body23: ; preds = %for.body23, %for.body
+ %11 = call i8* @llvm.hexagon.circ.stb(i8* undef, i32 undef, i32 %or, i32 3)
+ br i1 undef, label %for.body34, label %for.body23
+
+for.body34: ; preds = %for.body34, %for.body23
+ %element_load.173 = phi i32 [ %add38, %for.body34 ], [ %add18, %for.body23 ]
+ %arrayidx35 = getelementptr inbounds i8, i8* %B, i32 0
+ %12 = load i8, i8* %arrayidx35, align 1
+ %conv36 = zext i8 %12 to i32
+ %conv37 = and i32 %element_load.173, 255
+ %add38 = add nuw nsw i32 %conv36, %conv37
+ br i1 undef, label %for.end42, label %for.body34
+
+for.end42: ; preds = %for.body34
+ %conv39 = trunc i32 %add38 to i8
+ ret i8 %conv39
+}
+
+attributes #0 = { nounwind optsize }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/Hexagon/eliminate-pred-spill.ll b/test/CodeGen/Hexagon/eliminate-pred-spill.ll
new file mode 100644
index 000000000000..6fb0a3e2658d
--- /dev/null
+++ b/test/CodeGen/Hexagon/eliminate-pred-spill.ll
@@ -0,0 +1,144 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx-double \
+; RUN: -hexagon-bit=0 < %s | FileCheck %s
+
+; This spill should be eliminated.
+; CHECK-NOT: vmem(r29+#6)
+
+define void @test(i8* noalias nocapture %key, i8* noalias nocapture %data1) #0 {
+entry:
+ %0 = bitcast i8* %key to <32 x i32>*
+ %1 = bitcast i8* %data1 to <32 x i32>*
+ br label %for.body
+
+for.body:
+ %pkey.0542 = phi <32 x i32>* [ %0, %entry ], [ null, %for.body ]
+ %pdata0.0541 = phi <32 x i32>* [ null, %entry ], [ %add.ptr48, %for.body ]
+ %pdata1.0540 = phi <32 x i32>* [ %1, %entry ], [ %add.ptr49, %for.body ]
+ %dAccum0.0539 = phi <64 x i32> [ undef, %entry ], [ %86, %for.body ]
+ %2 = load <32 x i32>, <32 x i32>* %pkey.0542, align 128
+ %3 = load <32 x i32>, <32 x i32>* %pdata0.0541, align 128
+ %4 = load <32 x i32>, <32 x i32>* undef, align 128
+ %arrayidx4 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata0.0541, i32 2
+ %5 = load <32 x i32>, <32 x i32>* %arrayidx4, align 128
+ %arrayidx5 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata1.0540, i32 2
+ %6 = load <32 x i32>, <32 x i32>* %arrayidx5, align 128
+ %7 = load <32 x i32>, <32 x i32>* null, align 128
+ %8 = load <32 x i32>, <32 x i32>* undef, align 128
+ %9 = load <32 x i32>, <32 x i32>* null, align 128
+ %arrayidx9 = getelementptr inbounds <32 x i32>, <32 x i32>* %pkey.0542, i32 3
+ %arrayidx10 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata0.0541, i32 6
+ %10 = load <32 x i32>, <32 x i32>* %arrayidx10, align 128
+ %arrayidx12 = getelementptr inbounds <32 x i32>, <32 x i32>* %pkey.0542, i32 4
+ %11 = load <32 x i32>, <32 x i32>* %arrayidx12, align 128
+ %arrayidx13 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata0.0541, i32 8
+ %arrayidx14 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata1.0540, i32 8
+ %12 = load <32 x i32>, <32 x i32>* %arrayidx14, align 128
+ %arrayidx15 = getelementptr inbounds <32 x i32>, <32 x i32>* %pkey.0542, i32 5
+ %13 = load <32 x i32>, <32 x i32>* %arrayidx15, align 128
+ %arrayidx16 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata0.0541, i32 10
+ %arrayidx17 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata1.0540, i32 10
+ %14 = load <32 x i32>, <32 x i32>* %arrayidx17, align 128
+ %arrayidx18 = getelementptr inbounds <32 x i32>, <32 x i32>* %pkey.0542, i32 6
+ %15 = load <32 x i32>, <32 x i32>* %arrayidx18, align 128
+ %arrayidx19 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata0.0541, i32 12
+ %16 = load <32 x i32>, <32 x i32>* %arrayidx19, align 128
+ %arrayidx20 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata1.0540, i32 12
+ %17 = load <32 x i32>, <32 x i32>* %arrayidx20, align 128
+ %arrayidx22 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata0.0541, i32 14
+ %18 = load <32 x i32>, <32 x i32>* %arrayidx22, align 128
+ %arrayidx23 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata1.0540, i32 14
+ %19 = load <32 x i32>, <32 x i32>* %arrayidx23, align 128
+ %20 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %2, <32 x i32> %11)
+ %21 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %20, <32 x i32> %11, <32 x i32> %2)
+ %22 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %20, <32 x i32> %2, <32 x i32> %11)
+ %23 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %20, <32 x i32> undef, <32 x i32> %3)
+ %24 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %20, <32 x i32> %12, <32 x i32> undef)
+ %25 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %7, <32 x i32> %15)
+ %26 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %25, <32 x i32> %15, <32 x i32> %7)
+ %27 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %25, <32 x i32> %7, <32 x i32> %15)
+ %28 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %25, <32 x i32> %16, <32 x i32> %8)
+ %29 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %25, <32 x i32> %8, <32 x i32> %16)
+ %30 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %25, <32 x i32> %17, <32 x i32> %9)
+ %31 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %25, <32 x i32> %9, <32 x i32> %17)
+ %32 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %4, <32 x i32> %13)
+ %33 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %32, <32 x i32> %13, <32 x i32> %4)
+ %34 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %32, <32 x i32> %4, <32 x i32> %13)
+ %35 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %32, <32 x i32> undef, <32 x i32> %5)
+ %36 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %32, <32 x i32> %5, <32 x i32> undef)
+ %37 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %32, <32 x i32> %14, <32 x i32> %6)
+ %38 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %32, <32 x i32> %6, <32 x i32> %14)
+ %39 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> undef)
+ %40 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> zeroinitializer, <32 x i32> undef, <32 x i32> zeroinitializer)
+ %41 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> zeroinitializer, <32 x i32> %18, <32 x i32> %10)
+ %42 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> zeroinitializer, <32 x i32> %10, <32 x i32> %18)
+ %43 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> zeroinitializer, <32 x i32> %19, <32 x i32> undef)
+ %44 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> zeroinitializer, <32 x i32> undef, <32 x i32> %19)
+ %45 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %21, <32 x i32> %26)
+ %46 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %45, <32 x i32> %26, <32 x i32> %21)
+ %47 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %45, <32 x i32> %21, <32 x i32> %26)
+ %48 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %45, <32 x i32> %28, <32 x i32> %23)
+ %49 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %45, <32 x i32> %23, <32 x i32> %28)
+ %50 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %45, <32 x i32> %30, <32 x i32> %24)
+ %51 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %45, <32 x i32> %24, <32 x i32> %30)
+ %52 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %22, <32 x i32> %27)
+ %53 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %52, <32 x i32> %27, <32 x i32> %22)
+ %54 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %52, <32 x i32> %22, <32 x i32> %27)
+ %55 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %52, <32 x i32> %29, <32 x i32> undef)
+ %56 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %52, <32 x i32> undef, <32 x i32> %31)
+ %57 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %33, <32 x i32> %39)
+ %58 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %57, <32 x i32> %39, <32 x i32> %33)
+ %59 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %57, <32 x i32> %33, <32 x i32> %39)
+ %60 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %57, <32 x i32> %41, <32 x i32> %35)
+ %61 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %57, <32 x i32> %43, <32 x i32> %37)
+ %62 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %34, <32 x i32> %40)
+ %63 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %62, <32 x i32> %42, <32 x i32> %36)
+ %64 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %62, <32 x i32> %38, <32 x i32> %44)
+ %65 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %46, <32 x i32> %58)
+ %66 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %65, <32 x i32> %58, <32 x i32> %46)
+ %67 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %65, <32 x i32> %60, <32 x i32> %48)
+ %68 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %65, <32 x i32> %61, <32 x i32> %50)
+ %69 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %47, <32 x i32> %59)
+ %70 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %69, <32 x i32> %51, <32 x i32> zeroinitializer)
+ %71 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %53, <32 x i32> zeroinitializer)
+ %72 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %71, <32 x i32> %63, <32 x i32> %55)
+ %73 = tail call <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32> %54, <32 x i32> undef)
+ %74 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1> %73, <32 x i32> %56, <32 x i32> %64)
+ %75 = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %68, <32 x i32> %67)
+ %76 = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %70, <32 x i32> undef)
+ %77 = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> zeroinitializer, <32 x i32> %72)
+ %78 = tail call <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32> %74, <32 x i32> zeroinitializer)
+ %79 = tail call <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32> %dAccum0.0539, <32 x i32> %75, i32 65537)
+ %80 = tail call <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32> %79, <32 x i32> zeroinitializer, i32 65537)
+ %81 = tail call <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32> %80, <32 x i32> zeroinitializer, i32 65537)
+ %82 = tail call <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32> %81, <32 x i32> %76, i32 65537)
+ %83 = tail call <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32> %82, <32 x i32> %77, i32 65537)
+ %84 = tail call <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32> %83, <32 x i32> zeroinitializer, i32 65537)
+ %85 = tail call <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32> %84, <32 x i32> undef, i32 65537)
+ %86 = tail call <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32> %85, <32 x i32> %78, i32 65537)
+ store <32 x i32> %66, <32 x i32>* %pkey.0542, align 128
+ store <32 x i32> %75, <32 x i32>* %pdata0.0541, align 128
+ store <32 x i32> zeroinitializer, <32 x i32>* %arrayidx4, align 128
+ store <32 x i32> zeroinitializer, <32 x i32>* undef, align 128
+ store <32 x i32> zeroinitializer, <32 x i32>* %arrayidx20, align 128
+ store <32 x i32> zeroinitializer, <32 x i32>* null, align 128
+ %add.ptr48 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata0.0541, i32 16
+ %add.ptr49 = getelementptr inbounds <32 x i32>, <32 x i32>* %pdata1.0540, i32 16
+ br i1 false, label %for.end, label %for.body
+
+for.end:
+ %87 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %86)
+ ret void
+}
+
+declare <1024 x i1> @llvm.hexagon.V6.vgtb.128B(<32 x i32>, <32 x i32>) #1
+
+declare <32 x i32> @llvm.hexagon.V6.vmux.128B(<1024 x i1>, <32 x i32>, <32 x i32>) #1
+
+declare <32 x i32> @llvm.hexagon.V6.vshuffeb.128B(<32 x i32>, <32 x i32>) #1
+
+declare <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32>, <32 x i32>, i32) #1
+
+declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/expand-condsets-pred-undef.ll b/test/CodeGen/Hexagon/expand-condsets-pred-undef.ll
new file mode 100644
index 000000000000..284170b5edb4
--- /dev/null
+++ b/test/CodeGen/Hexagon/expand-condsets-pred-undef.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+target triple = "hexagon"
+
+%struct.0 = type { i64, i16 }
+
+declare void @foo(%struct.0* noalias nocapture sret, i8 zeroext, i32, i64) #0
+
+define hidden fastcc void @fred(%struct.0* noalias nocapture %p, i8 zeroext %t, i32 %r) unnamed_addr #0 {
+entry:
+ %. = select i1 undef, i64 549755813888, i64 1024
+ %cmp104 = icmp ult i64 undef, %.
+ %inc = zext i1 %cmp104 to i32
+ %inc.r = add nsw i32 %inc, %r
+ %.inc.r = select i1 undef, i32 0, i32 %inc.r
+ tail call void @foo(%struct.0* sret %p, i8 zeroext %t, i32 %.inc.r, i64 undef)
+ ret void
+}
+
+attributes #0 = { noinline nounwind }
+
diff --git a/test/CodeGen/Hexagon/extload-combine.ll b/test/CodeGen/Hexagon/extload-combine.ll
index 773b10b2b288..c492343d7915 100644
--- a/test/CodeGen/Hexagon/extload-combine.ll
+++ b/test/CodeGen/Hexagon/extload-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 -disable-hsdr < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=0 -disable-hsdr < %s | FileCheck %s
; Check that the combine/stxw instructions are being generated.
; In the case of combine, one of the operands should be 0 and the other should
; be the output of an absolute-addressing load instruction.
diff --git a/test/CodeGen/Hexagon/gp-plus-offset-load.ll b/test/CodeGen/Hexagon/gp-plus-offset-load.ll
index cd1aacc2318a..55edb22f46dc 100644
--- a/test/CodeGen/Hexagon/gp-plus-offset-load.ll
+++ b/test/CodeGen/Hexagon/gp-plus-offset-load.ll
@@ -21,7 +21,7 @@ if.end: ; preds = %if.then, %entry
}
define void @loadByte(i32 %val1, i32 %val2, i8* nocapture %ival) nounwind {
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memb(##foo{{ *}}+{{ *}}1)
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memub(##foo{{ *}}+{{ *}}1)
entry:
%cmp = icmp sgt i32 %val1, %val2
br i1 %cmp, label %if.then, label %if.end
@@ -36,7 +36,7 @@ if.end: ; preds = %if.then, %entry
}
define void @loadHWord(i32 %val1, i32 %val2, i16* %ival) nounwind {
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memh(##foo{{ *}}+{{ *}}2)
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memuh(##foo{{ *}}+{{ *}}2)
entry:
%cmp = icmp sgt i32 %val1, %val2
br i1 %cmp, label %if.then, label %if.end
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index 341f8db9e336..1e305e30f628 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -37,10 +37,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!29}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", isOptimized: true, emissionKind: 1, file: !28, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", isOptimized: true, emissionKind: FullDebug, file: !28, enums: !2, retainedTypes: !2, globals: !2)
!2 = !{}
-!3 = !{!5}
-!5 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !28, scope: null, type: !7, variables: !11)
+!5 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !28, scope: null, type: !7, variables: !11)
!6 = !DIFile(filename: "hwloop-dbg.c", directory: "/usr2/kparzysz/s.hex/t")
!7 = !DISubroutineType(types: !8)
!8 = !{null, !9, !9}
diff --git a/test/CodeGen/Hexagon/ifcvt-diamond-bad.ll b/test/CodeGen/Hexagon/ifcvt-diamond-bad.ll
new file mode 100644
index 000000000000..e4bee8354a7c
--- /dev/null
+++ b/test/CodeGen/Hexagon/ifcvt-diamond-bad.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=hexagon -minimum-jump-tables=1 < %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32"
+target triple = "hexagon"
+
+%struct.t0 = type { i8, [2 x i8] }
+%struct.t1 = type { i8, i8, [1900 x i8], %struct.t0 }
+
+@var = internal global [3 x %struct.t1] zeroinitializer, align 8
+declare void @foo() #2
+declare void @bar(i32, i32) #2
+
+; Function Attrs: nounwind
+define void @fred(i8 signext %a, i8 signext %b) #1 {
+entry:
+ %i = sext i8 %a to i32
+ %t = getelementptr inbounds [3 x %struct.t1], [3 x %struct.t1]* @var, i32 0, i32 %i, i32 3, i32 0
+ %0 = load i8, i8* %t, align 8
+ switch i8 %0, label %if.end14 [
+ i8 1, label %if.then
+ i8 0, label %do.body
+ ]
+
+if.then: ; preds = %entry
+ %j = sext i8 %b to i32
+ %u = getelementptr inbounds [3 x %struct.t1], [3 x %struct.t1]* @var, i32 0, i32 %i, i32 3, i32 1, i32 %j
+ store i8 1, i8* %u, align 1
+ tail call void @foo() #0
+ br label %if.end14
+
+do.body: ; preds = %entry
+ %conv11 = sext i8 %b to i32
+ tail call void @bar(i32 %i, i32 %conv11) #0
+ br label %if.end14
+
+if.end14: ; preds = %entry, %do.body, %if.then
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "disable-tail-calls"="false" }
+attributes #2 = { "disable-tail-calls"="false" }
diff --git a/test/CodeGen/Hexagon/inline-asm-qv.ll b/test/CodeGen/Hexagon/inline-asm-qv.ll
new file mode 100644
index 000000000000..256342170313
--- /dev/null
+++ b/test/CodeGen/Hexagon/inline-asm-qv.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that constraints q and v are handled correctly.
+; CHECK: q{{.}} = vgtw(v{{.}}.w,v{{.}}.w)
+; CHECK: vand
+; CHECK: vmem
+
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define void @foo(<16 x i32> %v0, <16 x i32> %v1, <16 x i32>* nocapture %p) #0 {
+entry:
+ %0 = tail call <16 x i32> asm "$0 = vgtw($1.w,$2.w)", "=q,v,v"(<16 x i32> %v0, <16 x i32> %v1) #1
+ store <16 x i32> %0, <16 x i32>* %p, align 64
+ ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/insert4.ll b/test/CodeGen/Hexagon/insert4.ll
new file mode 100644
index 000000000000..96c8bba24d7c
--- /dev/null
+++ b/test/CodeGen/Hexagon/insert4.ll
@@ -0,0 +1,112 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Check that we are generating insert instructions.
+; CHECK: insert
+; CHECK: insert
+; CHECK: insert
+; CHECK: insert
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+%struct.a = type { i16 }
+
+define i32 @fun(%struct.a* nocapture %pData, i64 %c, i64* nocapture %d, i64* nocapture %e, i64* nocapture %f) #0 {
+entry:
+ %g = getelementptr inbounds %struct.a, %struct.a* %pData, i32 0, i32 0
+ %0 = load i16, i16* %g, align 2, !tbaa !0
+ %conv185 = sext i16 %0 to i32
+ %shr86 = ashr i32 %conv185, 2
+ %cmp87 = icmp sgt i32 %shr86, 0
+ br i1 %cmp87, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %h.sroa.0.0.extract.trunc = trunc i64 %c to i32
+ %sext = shl i32 %h.sroa.0.0.extract.trunc, 16
+ %conv8 = ashr exact i32 %sext, 16
+ %l.sroa.2.4.extract.shift = lshr i64 %c, 32
+ %sext76 = ashr i32 %h.sroa.0.0.extract.trunc, 16
+ %m.sroa.2.6.extract.shift = lshr i64 %c, 48
+ %sext7980 = shl nuw nsw i64 %l.sroa.2.4.extract.shift, 16
+ %sext79 = trunc i64 %sext7980 to i32
+ %conv38 = ashr exact i32 %sext79, 16
+ %sext8283 = shl nuw nsw i64 %m.sroa.2.6.extract.shift, 16
+ %sext82 = trunc i64 %sext8283 to i32
+ %conv53 = ashr exact i32 %sext82, 16
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %arrayidx.phi = phi i64* [ %d, %for.body.lr.ph ], [ %arrayidx.inc, %for.body ]
+ %arrayidx30.phi = phi i64* [ %f, %for.body.lr.ph ], [ %arrayidx30.inc, %for.body ]
+ %arrayidx60.phi = phi i64* [ %e, %for.body.lr.ph ], [ %arrayidx60.inc, %for.body ]
+ %j.088.pmt = phi i32 [ 0, %for.body.lr.ph ], [ %inc.pmt, %for.body ]
+ %1 = load i64, i64* %arrayidx.phi, align 8, !tbaa !1
+ %n_union3.sroa.0.0.extract.trunc = trunc i64 %1 to i32
+ %n_union3.sroa.1.4.extract.shift = lshr i64 %1, 32
+ %2 = tail call i64 @llvm.hexagon.M2.dpmpyss.s0(i32 %n_union3.sroa.0.0.extract.trunc, i32 %conv8)
+ %3 = tail call i64 @llvm.hexagon.S2.asl.r.p(i64 %2, i32 -25)
+ %conv9 = trunc i64 %3 to i32
+ %4 = tail call i32 @llvm.hexagon.A2.sath(i32 %conv9)
+ %n_union13.sroa.1.4.extract.trunc = trunc i64 %n_union3.sroa.1.4.extract.shift to i32
+ %5 = tail call i64 @llvm.hexagon.M2.dpmpyss.s0(i32 %n_union13.sroa.1.4.extract.trunc, i32 %sext76)
+ %6 = tail call i64 @llvm.hexagon.S2.asl.r.p(i64 %5, i32 -25)
+ %conv24 = trunc i64 %6 to i32
+ %7 = tail call i32 @llvm.hexagon.A2.sath(i32 %conv24)
+ %8 = load i64, i64* %arrayidx30.phi, align 8, !tbaa !1
+ %n_union28.sroa.0.0.extract.trunc = trunc i64 %8 to i32
+ %n_union28.sroa.1.4.extract.shift = lshr i64 %8, 32
+ %9 = tail call i64 @llvm.hexagon.M2.dpmpyss.s0(i32 %n_union28.sroa.0.0.extract.trunc, i32 %conv38)
+ %10 = tail call i64 @llvm.hexagon.S2.asl.r.p(i64 %9, i32 -25)
+ %conv39 = trunc i64 %10 to i32
+ %11 = tail call i32 @llvm.hexagon.A2.sath(i32 %conv39)
+ %n_union43.sroa.1.4.extract.trunc = trunc i64 %n_union28.sroa.1.4.extract.shift to i32
+ %12 = tail call i64 @llvm.hexagon.M2.dpmpyss.s0(i32 %n_union43.sroa.1.4.extract.trunc, i32 %conv53)
+ %13 = tail call i64 @llvm.hexagon.S2.asl.r.p(i64 %12, i32 -25)
+ %conv54 = trunc i64 %13 to i32
+ %14 = tail call i32 @llvm.hexagon.A2.sath(i32 %conv54)
+ %n_union.sroa.3.6.insert.ext = zext i32 %14 to i64
+ %n_union.sroa.3.6.insert.shift = shl i64 %n_union.sroa.3.6.insert.ext, 48
+ %conv40.mask = and i32 %11, 65535
+ %n_union.sroa.2.4.insert.ext = zext i32 %conv40.mask to i64
+ %n_union.sroa.2.4.insert.shift = shl nuw nsw i64 %n_union.sroa.2.4.insert.ext, 32
+ %conv25.mask = and i32 %7, 65535
+ %n_union.sroa.1.2.insert.ext = zext i32 %conv25.mask to i64
+ %n_union.sroa.1.2.insert.shift = shl nuw nsw i64 %n_union.sroa.1.2.insert.ext, 16
+ %conv10.mask = and i32 %4, 65535
+ %n_union.sroa.0.0.insert.ext = zext i32 %conv10.mask to i64
+ %n_union.sroa.2.4.insert.insert = or i64 %n_union.sroa.1.2.insert.shift, %n_union.sroa.0.0.insert.ext
+ %n_union.sroa.1.2.insert.insert = or i64 %n_union.sroa.2.4.insert.insert, %n_union.sroa.2.4.insert.shift
+ %n_union.sroa.0.0.insert.insert = or i64 %n_union.sroa.1.2.insert.insert, %n_union.sroa.3.6.insert.shift
+ %15 = load i64, i64* %arrayidx60.phi, align 8, !tbaa !1
+ %16 = tail call i64 @llvm.hexagon.A2.vaddhs(i64 %15, i64 %n_union.sroa.0.0.insert.insert)
+ store i64 %16, i64* %arrayidx60.phi, align 8, !tbaa !1
+ %inc.pmt = add i32 %j.088.pmt, 1
+ %17 = load i16, i16* %g, align 2, !tbaa !0
+ %conv1 = sext i16 %17 to i32
+ %shr = ashr i32 %conv1, 2
+ %cmp = icmp slt i32 %inc.pmt, %shr
+ %arrayidx.inc = getelementptr i64, i64* %arrayidx.phi, i32 1
+ %arrayidx30.inc = getelementptr i64, i64* %arrayidx30.phi, i32 1
+ %arrayidx60.inc = getelementptr i64, i64* %arrayidx60.phi, i32 1
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret i32 0
+}
+
+declare i32 @llvm.hexagon.A2.sath(i32) #1
+
+declare i64 @llvm.hexagon.S2.asl.r.p(i64, i32) #1
+
+declare i64 @llvm.hexagon.M2.dpmpyss.s0(i32, i32) #1
+
+declare i64 @llvm.hexagon.A2.vaddhs(i64, i64) #1
+
+attributes #0 = { nounwind "fp-contract-model"="standard" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="static" "ssp-buffers-size"="8" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"short", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/intrinsics/system_user.ll b/test/CodeGen/Hexagon/intrinsics/system_user.ll
new file mode 100644
index 000000000000..dad4effb0a14
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/system_user.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -O0 < %s | FileCheck -check-prefix=CHECK-CALL %s
+; Hexagon Programmer's Reference Manual 11.9.1 SYSTEM/USER
+
+; CHECK-CALL-NOT: call
+
+; Data cache prefetch
+declare void @llvm.hexagon.prefetch(i8*)
+define void @prefetch(i8* %a) {
+ call void @llvm.hexagon.prefetch(i8* %a)
+ ret void
+}
+; CHECK: dcfetch({{.*}} + #0)
diff --git a/test/CodeGen/Hexagon/memops-stack.ll b/test/CodeGen/Hexagon/memops-stack.ll
new file mode 100644
index 000000000000..a8dc664591e9
--- /dev/null
+++ b/test/CodeGen/Hexagon/memops-stack.ll
@@ -0,0 +1,147 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; CHECK-LABEL: test0
+; CHECK: memw(r29+#{{[0-9]+}}) += #1
+define void @test0() #0 {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #3
+ call void @foo(i32* nonnull %x) #3
+ %1 = load i32, i32* %x, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* %x, align 4, !tbaa !1
+ call void @foo(i32* nonnull %x) #3
+ call void @llvm.lifetime.end(i64 4, i8* %0) #3
+ ret void
+}
+
+; CHECK-LABEL: test1
+; CHECK: memw(r29+#{{[0-9]+}}) -= #1
+define void @test1() #0 {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #3
+ call void @foo(i32* nonnull %x) #3
+ %1 = load i32, i32* %x, align 4, !tbaa !1
+ %inc = sub nsw i32 %1, 1
+ store i32 %inc, i32* %x, align 4, !tbaa !1
+ call void @foo(i32* nonnull %x) #3
+ call void @llvm.lifetime.end(i64 4, i8* %0) #3
+ ret void
+}
+
+; CHECK-LABEL: test2
+; CHECK: memw(r29+#{{[0-9]+}}) = setbit(#0)
+define void @test2() #0 {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #3
+ call void @foo(i32* nonnull %x) #3
+ %1 = load i32, i32* %x, align 4, !tbaa !1
+ %inc = or i32 %1, 1
+ store i32 %inc, i32* %x, align 4, !tbaa !1
+ call void @foo(i32* nonnull %x) #3
+ call void @llvm.lifetime.end(i64 4, i8* %0) #3
+ ret void
+}
+
+; CHECK-LABEL: test3
+; CHECK: memw(r29+#{{[0-9]+}}) = clrbit(#0)
+define void @test3() #0 {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #3
+ call void @foo(i32* nonnull %x) #3
+ %1 = load i32, i32* %x, align 4, !tbaa !1
+ %inc = and i32 %1, -2
+ store i32 %inc, i32* %x, align 4, !tbaa !1
+ call void @foo(i32* nonnull %x) #3
+ call void @llvm.lifetime.end(i64 4, i8* %0) #3
+ ret void
+}
+
+; CHECK-LABEL: test4
+; CHECK: memw(r29+#{{[0-9]+}}) += r
+define void @test4(i32 %a) #0 {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #3
+ call void @foo(i32* nonnull %x) #3
+ %1 = load i32, i32* %x, align 4, !tbaa !1
+ %inc = add nsw i32 %1, %a
+ store i32 %inc, i32* %x, align 4, !tbaa !1
+ call void @foo(i32* nonnull %x) #3
+ call void @llvm.lifetime.end(i64 4, i8* %0) #3
+ ret void
+}
+
+; CHECK-LABEL: test5
+; CHECK: memw(r29+#{{[0-9]+}}) -= r
+define void @test5(i32 %a) #0 {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #3
+ call void @foo(i32* nonnull %x) #3
+ %1 = load i32, i32* %x, align 4, !tbaa !1
+ %inc = sub nsw i32 %1, %a
+ store i32 %inc, i32* %x, align 4, !tbaa !1
+ call void @foo(i32* nonnull %x) #3
+ call void @llvm.lifetime.end(i64 4, i8* %0) #3
+ ret void
+}
+
+; CHECK-LABEL: test6
+; CHECK: memw(r29+#{{[0-9]+}}) |= r
+define void @test6(i32 %a) #0 {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #3
+ call void @foo(i32* nonnull %x) #3
+ %1 = load i32, i32* %x, align 4, !tbaa !1
+ %inc = or i32 %1, %a
+ store i32 %inc, i32* %x, align 4, !tbaa !1
+ call void @foo(i32* nonnull %x) #3
+ call void @llvm.lifetime.end(i64 4, i8* %0) #3
+ ret void
+}
+
+; CHECK-LABEL: test7
+; CHECK: memw(r29+#{{[0-9]+}}) &= r
+define void @test7(i32 %a) #0 {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #3
+ call void @foo(i32* nonnull %x) #3
+ %1 = load i32, i32* %x, align 4, !tbaa !1
+ %inc = and i32 %1, %a
+ store i32 %inc, i32* %x, align 4, !tbaa !1
+ call void @foo(i32* nonnull %x) #3
+ call void @llvm.lifetime.end(i64 4, i8* %0) #3
+ ret void
+}
+
+
+declare void @foo(i32*) #2
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/memops.ll b/test/CodeGen/Hexagon/memops.ll
index e4a8bf7c95e9..011a4e50e5d8 100644
--- a/test/CodeGen/Hexagon/memops.ll
+++ b/test/CodeGen/Hexagon/memops.ll
@@ -3,6 +3,7 @@
define void @memop_unsigned_char_add5(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_add5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%0 = load i8, i8* %p, align 1
%conv = zext i8 %0 to i32
@@ -14,6 +15,7 @@ entry:
define void @memop_unsigned_char_add(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_add:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%0 = load i8, i8* %p, align 1
@@ -26,6 +28,7 @@ entry:
define void @memop_unsigned_char_sub(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_sub:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%0 = load i8, i8* %p, align 1
@@ -38,6 +41,7 @@ entry:
define void @memop_unsigned_char_or(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_or:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%0 = load i8, i8* %p, align 1
%or3 = or i8 %0, %x
@@ -47,6 +51,7 @@ entry:
define void @memop_unsigned_char_and(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_and:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%0 = load i8, i8* %p, align 1
%and3 = and i8 %0, %x
@@ -56,6 +61,7 @@ entry:
define void @memop_unsigned_char_clrbit(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_clrbit:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%0 = load i8, i8* %p, align 1
%conv = zext i8 %0 to i32
@@ -67,6 +73,7 @@ entry:
define void @memop_unsigned_char_setbit(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_setbit:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%0 = load i8, i8* %p, align 1
%conv = zext i8 %0 to i32
@@ -78,6 +85,7 @@ entry:
define void @memop_unsigned_char_add5_index(i8* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_add5_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -90,6 +98,7 @@ entry:
define void @memop_unsigned_char_add_index(i8* nocapture %p, i32 %i, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_add_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
@@ -103,6 +112,7 @@ entry:
define void @memop_unsigned_char_sub_index(i8* nocapture %p, i32 %i, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_sub_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
@@ -116,6 +126,7 @@ entry:
define void @memop_unsigned_char_or_index(i8* nocapture %p, i32 %i, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_or_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -126,6 +137,7 @@ entry:
define void @memop_unsigned_char_and_index(i8* nocapture %p, i32 %i, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_and_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -136,6 +148,7 @@ entry:
define void @memop_unsigned_char_clrbit_index(i8* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_clrbit_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -148,6 +161,7 @@ entry:
define void @memop_unsigned_char_setbit_index(i8* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_setbit_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -160,6 +174,7 @@ entry:
define void @memop_unsigned_char_add5_index5(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_add5_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -172,6 +187,7 @@ entry:
define void @memop_unsigned_char_add_index5(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_add_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
@@ -185,6 +201,7 @@ entry:
define void @memop_unsigned_char_sub_index5(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_sub_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
@@ -198,6 +215,7 @@ entry:
define void @memop_unsigned_char_or_index5(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_or_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -208,6 +226,7 @@ entry:
define void @memop_unsigned_char_and_index5(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_and_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -218,6 +237,7 @@ entry:
define void @memop_unsigned_char_clrbit_index5(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_clrbit_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -230,6 +250,7 @@ entry:
define void @memop_unsigned_char_setbit_index5(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_char_setbit_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -242,6 +263,7 @@ entry:
define void @memop_signed_char_add5(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_add5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%0 = load i8, i8* %p, align 1
%conv2 = zext i8 %0 to i32
@@ -253,6 +275,7 @@ entry:
define void @memop_signed_char_add(i8* nocapture %p, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_add:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%0 = load i8, i8* %p, align 1
@@ -265,6 +288,7 @@ entry:
define void @memop_signed_char_sub(i8* nocapture %p, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_sub:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%0 = load i8, i8* %p, align 1
@@ -277,6 +301,7 @@ entry:
define void @memop_signed_char_or(i8* nocapture %p, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_or:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%0 = load i8, i8* %p, align 1
%or3 = or i8 %0, %x
@@ -286,6 +311,7 @@ entry:
define void @memop_signed_char_and(i8* nocapture %p, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_and:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%0 = load i8, i8* %p, align 1
%and3 = and i8 %0, %x
@@ -295,6 +321,7 @@ entry:
define void @memop_signed_char_clrbit(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_clrbit:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%0 = load i8, i8* %p, align 1
%conv2 = zext i8 %0 to i32
@@ -306,6 +333,7 @@ entry:
define void @memop_signed_char_setbit(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_setbit:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%0 = load i8, i8* %p, align 1
%conv2 = zext i8 %0 to i32
@@ -317,6 +345,7 @@ entry:
define void @memop_signed_char_add5_index(i8* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_add5_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -329,6 +358,7 @@ entry:
define void @memop_signed_char_add_index(i8* nocapture %p, i32 %i, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_add_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
@@ -342,6 +372,7 @@ entry:
define void @memop_signed_char_sub_index(i8* nocapture %p, i32 %i, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_sub_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
@@ -355,6 +386,7 @@ entry:
define void @memop_signed_char_or_index(i8* nocapture %p, i32 %i, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_or_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -365,6 +397,7 @@ entry:
define void @memop_signed_char_and_index(i8* nocapture %p, i32 %i, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_and_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -375,6 +408,7 @@ entry:
define void @memop_signed_char_clrbit_index(i8* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_clrbit_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -387,6 +421,7 @@ entry:
define void @memop_signed_char_setbit_index(i8* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_setbit_index:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i8, i8* %p, i32 %i
%0 = load i8, i8* %add.ptr, align 1
@@ -399,6 +434,7 @@ entry:
define void @memop_signed_char_add5_index5(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_add5_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -411,6 +447,7 @@ entry:
define void @memop_signed_char_add_index5(i8* nocapture %p, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_add_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
@@ -424,6 +461,7 @@ entry:
define void @memop_signed_char_sub_index5(i8* nocapture %p, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_sub_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
@@ -437,6 +475,7 @@ entry:
define void @memop_signed_char_or_index5(i8* nocapture %p, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_or_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -447,6 +486,7 @@ entry:
define void @memop_signed_char_and_index5(i8* nocapture %p, i8 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_and_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -457,6 +497,7 @@ entry:
define void @memop_signed_char_clrbit_index5(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_clrbit_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -469,6 +510,7 @@ entry:
define void @memop_signed_char_setbit_index5(i8* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_char_setbit_index5:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i8, i8* %p, i32 5
%0 = load i8, i8* %add.ptr, align 1
@@ -481,6 +523,7 @@ entry:
define void @memop_unsigned_short_add5(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_add5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%0 = load i16, i16* %p, align 2
%conv = zext i16 %0 to i32
@@ -492,6 +535,7 @@ entry:
define void @memop_unsigned_short_add(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_add:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%0 = load i16, i16* %p, align 2
@@ -504,6 +548,7 @@ entry:
define void @memop_unsigned_short_sub(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_sub:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%0 = load i16, i16* %p, align 2
@@ -516,6 +561,7 @@ entry:
define void @memop_unsigned_short_or(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_or:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%0 = load i16, i16* %p, align 2
%or3 = or i16 %0, %x
@@ -525,6 +571,7 @@ entry:
define void @memop_unsigned_short_and(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_and:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%0 = load i16, i16* %p, align 2
%and3 = and i16 %0, %x
@@ -534,6 +581,7 @@ entry:
define void @memop_unsigned_short_clrbit(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_clrbit:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%0 = load i16, i16* %p, align 2
%conv = zext i16 %0 to i32
@@ -545,6 +593,7 @@ entry:
define void @memop_unsigned_short_setbit(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_setbit:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%0 = load i16, i16* %p, align 2
%conv = zext i16 %0 to i32
@@ -556,6 +605,7 @@ entry:
define void @memop_unsigned_short_add5_index(i16* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_add5_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -568,6 +618,7 @@ entry:
define void @memop_unsigned_short_add_index(i16* nocapture %p, i32 %i, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_add_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
@@ -581,6 +632,7 @@ entry:
define void @memop_unsigned_short_sub_index(i16* nocapture %p, i32 %i, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_sub_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
@@ -594,6 +646,7 @@ entry:
define void @memop_unsigned_short_or_index(i16* nocapture %p, i32 %i, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_or_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -604,6 +657,7 @@ entry:
define void @memop_unsigned_short_and_index(i16* nocapture %p, i32 %i, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_and_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -614,6 +668,7 @@ entry:
define void @memop_unsigned_short_clrbit_index(i16* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_clrbit_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -626,6 +681,7 @@ entry:
define void @memop_unsigned_short_setbit_index(i16* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_setbit_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -638,6 +694,7 @@ entry:
define void @memop_unsigned_short_add5_index5(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_add5_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -650,6 +707,7 @@ entry:
define void @memop_unsigned_short_add_index5(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_add_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
@@ -663,6 +721,7 @@ entry:
define void @memop_unsigned_short_sub_index5(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_sub_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
@@ -676,6 +735,7 @@ entry:
define void @memop_unsigned_short_or_index5(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_or_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -686,6 +746,7 @@ entry:
define void @memop_unsigned_short_and_index5(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_and_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -696,6 +757,7 @@ entry:
define void @memop_unsigned_short_clrbit_index5(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_clrbit_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -708,6 +770,7 @@ entry:
define void @memop_unsigned_short_setbit_index5(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_short_setbit_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -720,6 +783,7 @@ entry:
define void @memop_signed_short_add5(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_add5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%0 = load i16, i16* %p, align 2
%conv2 = zext i16 %0 to i32
@@ -731,6 +795,7 @@ entry:
define void @memop_signed_short_add(i16* nocapture %p, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_add:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%0 = load i16, i16* %p, align 2
@@ -743,6 +808,7 @@ entry:
define void @memop_signed_short_sub(i16* nocapture %p, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_sub:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%0 = load i16, i16* %p, align 2
@@ -755,6 +821,7 @@ entry:
define void @memop_signed_short_or(i16* nocapture %p, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_or:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%0 = load i16, i16* %p, align 2
%or3 = or i16 %0, %x
@@ -764,6 +831,7 @@ entry:
define void @memop_signed_short_and(i16* nocapture %p, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_and:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%0 = load i16, i16* %p, align 2
%and3 = and i16 %0, %x
@@ -773,6 +841,7 @@ entry:
define void @memop_signed_short_clrbit(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_clrbit:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%0 = load i16, i16* %p, align 2
%conv2 = zext i16 %0 to i32
@@ -784,6 +853,7 @@ entry:
define void @memop_signed_short_setbit(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_setbit:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%0 = load i16, i16* %p, align 2
%conv2 = zext i16 %0 to i32
@@ -795,6 +865,7 @@ entry:
define void @memop_signed_short_add5_index(i16* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_add5_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -807,6 +878,7 @@ entry:
define void @memop_signed_short_add_index(i16* nocapture %p, i32 %i, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_add_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
@@ -820,6 +892,7 @@ entry:
define void @memop_signed_short_sub_index(i16* nocapture %p, i32 %i, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_sub_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
@@ -833,6 +906,7 @@ entry:
define void @memop_signed_short_or_index(i16* nocapture %p, i32 %i, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_or_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -843,6 +917,7 @@ entry:
define void @memop_signed_short_and_index(i16* nocapture %p, i32 %i, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_and_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -853,6 +928,7 @@ entry:
define void @memop_signed_short_clrbit_index(i16* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_clrbit_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -865,6 +941,7 @@ entry:
define void @memop_signed_short_setbit_index(i16* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_setbit_index:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i16, i16* %p, i32 %i
%0 = load i16, i16* %add.ptr, align 2
@@ -877,6 +954,7 @@ entry:
define void @memop_signed_short_add5_index5(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_add5_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -889,6 +967,7 @@ entry:
define void @memop_signed_short_add_index5(i16* nocapture %p, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_add_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
@@ -902,6 +981,7 @@ entry:
define void @memop_signed_short_sub_index5(i16* nocapture %p, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_sub_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
@@ -915,6 +995,7 @@ entry:
define void @memop_signed_short_or_index5(i16* nocapture %p, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_or_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -925,6 +1006,7 @@ entry:
define void @memop_signed_short_and_index5(i16* nocapture %p, i16 signext %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_and_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -935,6 +1017,7 @@ entry:
define void @memop_signed_short_clrbit_index5(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_clrbit_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -947,6 +1030,7 @@ entry:
define void @memop_signed_short_setbit_index5(i16* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_short_setbit_index5:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i16, i16* %p, i32 5
%0 = load i16, i16* %add.ptr, align 2
@@ -959,6 +1043,7 @@ entry:
define void @memop_signed_int_add5(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_add5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%0 = load i32, i32* %p, align 4
%add = add i32 %0, 5
@@ -968,6 +1053,7 @@ entry:
define void @memop_signed_int_add(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_add:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%0 = load i32, i32* %p, align 4
%add = add i32 %0, %x
@@ -977,6 +1063,7 @@ entry:
define void @memop_signed_int_sub(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_sub:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%0 = load i32, i32* %p, align 4
%sub = sub i32 %0, %x
@@ -986,6 +1073,7 @@ entry:
define void @memop_signed_int_or(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_or:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%0 = load i32, i32* %p, align 4
%or = or i32 %0, %x
@@ -995,6 +1083,7 @@ entry:
define void @memop_signed_int_and(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_and:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%0 = load i32, i32* %p, align 4
%and = and i32 %0, %x
@@ -1004,6 +1093,7 @@ entry:
define void @memop_signed_int_clrbit(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_clrbit:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%0 = load i32, i32* %p, align 4
%and = and i32 %0, -33
@@ -1013,6 +1103,7 @@ entry:
define void @memop_signed_int_setbit(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_setbit:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%0 = load i32, i32* %p, align 4
%or = or i32 %0, 128
@@ -1022,6 +1113,7 @@ entry:
define void @memop_signed_int_add5_index(i32* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_add5_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1032,6 +1124,7 @@ entry:
define void @memop_signed_int_add_index(i32* nocapture %p, i32 %i, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_add_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1042,6 +1135,7 @@ entry:
define void @memop_signed_int_sub_index(i32* nocapture %p, i32 %i, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_sub_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1052,6 +1146,7 @@ entry:
define void @memop_signed_int_or_index(i32* nocapture %p, i32 %i, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_or_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1062,6 +1157,7 @@ entry:
define void @memop_signed_int_and_index(i32* nocapture %p, i32 %i, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_and_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1072,6 +1168,7 @@ entry:
define void @memop_signed_int_clrbit_index(i32* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_clrbit_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1082,6 +1179,7 @@ entry:
define void @memop_signed_int_setbit_index(i32* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_setbit_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1092,6 +1190,7 @@ entry:
define void @memop_signed_int_add5_index5(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_add5_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1102,6 +1201,7 @@ entry:
define void @memop_signed_int_add_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_add_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1112,6 +1212,7 @@ entry:
define void @memop_signed_int_sub_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_sub_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}-={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1122,6 +1223,7 @@ entry:
define void @memop_signed_int_or_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_or_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1132,6 +1234,7 @@ entry:
define void @memop_signed_int_and_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_and_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1142,6 +1245,7 @@ entry:
define void @memop_signed_int_clrbit_index5(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_clrbit_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1152,6 +1256,7 @@ entry:
define void @memop_signed_int_setbit_index5(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_signed_int_setbit_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1162,6 +1267,7 @@ entry:
define void @memop_unsigned_int_add5(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_add5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%0 = load i32, i32* %p, align 4
%add = add nsw i32 %0, 5
@@ -1171,6 +1277,7 @@ entry:
define void @memop_unsigned_int_add(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_add:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%0 = load i32, i32* %p, align 4
%add = add nsw i32 %0, %x
@@ -1180,6 +1287,7 @@ entry:
define void @memop_unsigned_int_sub(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_sub:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%0 = load i32, i32* %p, align 4
%sub = sub nsw i32 %0, %x
@@ -1189,6 +1297,7 @@ entry:
define void @memop_unsigned_int_or(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_or:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%0 = load i32, i32* %p, align 4
%or = or i32 %0, %x
@@ -1198,6 +1307,7 @@ entry:
define void @memop_unsigned_int_and(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_and:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%0 = load i32, i32* %p, align 4
%and = and i32 %0, %x
@@ -1207,6 +1317,7 @@ entry:
define void @memop_unsigned_int_clrbit(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_clrbit:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%0 = load i32, i32* %p, align 4
%and = and i32 %0, -33
@@ -1216,6 +1327,7 @@ entry:
define void @memop_unsigned_int_setbit(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_setbit:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%0 = load i32, i32* %p, align 4
%or = or i32 %0, 128
@@ -1225,6 +1337,7 @@ entry:
define void @memop_unsigned_int_add5_index(i32* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_add5_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1235,6 +1348,7 @@ entry:
define void @memop_unsigned_int_add_index(i32* nocapture %p, i32 %i, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_add_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1245,6 +1359,7 @@ entry:
define void @memop_unsigned_int_sub_index(i32* nocapture %p, i32 %i, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_sub_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1255,6 +1370,7 @@ entry:
define void @memop_unsigned_int_or_index(i32* nocapture %p, i32 %i, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_or_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1265,6 +1381,7 @@ entry:
define void @memop_unsigned_int_and_index(i32* nocapture %p, i32 %i, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_and_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1275,6 +1392,7 @@ entry:
define void @memop_unsigned_int_clrbit_index(i32* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_clrbit_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1285,6 +1403,7 @@ entry:
define void @memop_unsigned_int_setbit_index(i32* nocapture %p, i32 %i) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_setbit_index:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i32, i32* %p, i32 %i
%0 = load i32, i32* %add.ptr, align 4
@@ -1295,6 +1414,7 @@ entry:
define void @memop_unsigned_int_add5_index5(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_add5_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1305,6 +1425,7 @@ entry:
define void @memop_unsigned_int_add_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_add_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1315,6 +1436,7 @@ entry:
define void @memop_unsigned_int_sub_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_sub_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}-={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1325,6 +1447,7 @@ entry:
define void @memop_unsigned_int_or_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_or_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1335,6 +1458,7 @@ entry:
define void @memop_unsigned_int_and_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_and_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1345,6 +1469,7 @@ entry:
define void @memop_unsigned_int_clrbit_index5(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_clrbit_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
@@ -1355,6 +1480,7 @@ entry:
define void @memop_unsigned_int_setbit_index5(i32* nocapture %p) nounwind {
entry:
+; CHECK-LABEL: memop_unsigned_int_setbit_index5:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i32, i32* %p, i32 5
%0 = load i32, i32* %add.ptr, align 4
diff --git a/test/CodeGen/Hexagon/misched-top-rptracker-sync.ll b/test/CodeGen/Hexagon/misched-top-rptracker-sync.ll
new file mode 100644
index 000000000000..5fe16db6f806
--- /dev/null
+++ b/test/CodeGen/Hexagon/misched-top-rptracker-sync.ll
@@ -0,0 +1,151 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+; Check that we no longer get this error:
+; void llvm::ScheduleDAGMILive::scheduleMI(llvm::SUnit *, bool):
+; Assertion `TopRPTracker.getPos() == CurrentTop && "out of sync"' failed.
+
+target triple = "hexagon"
+
+%struct.A = type { %struct.B*, %struct.B* }
+%struct.B = type { i8*, %struct.B*, %struct.B* }
+
+@.str.4 = external hidden unnamed_addr constant [41 x i8], align 1
+@__func__.fred = external hidden unnamed_addr constant [16 x i8], align 1
+@.str.5 = external hidden unnamed_addr constant [43 x i8], align 1
+
+; Function Attrs: nounwind
+declare void @_Assert(i8*, i8*) #0
+
+; Function Attrs: nounwind
+define void @fred(%struct.A* %pA, %struct.B* %p) #0 !dbg !6 {
+entry:
+ tail call void @llvm.dbg.value(metadata %struct.A* %pA, i64 0, metadata !26, metadata !28), !dbg !29
+ tail call void @llvm.dbg.value(metadata %struct.B* %p, i64 0, metadata !27, metadata !28), !dbg !30
+ %cmp = icmp eq %struct.B* %p, null, !dbg !31
+ br i1 %cmp, label %cond.false, label %cond.end, !dbg !31
+
+cond.false: ; preds = %entry
+ tail call void @_Assert(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @__func__.fred, i32 0, i32 0)) #0, !dbg !32
+ br label %cond.end, !dbg !32
+
+cond.end: ; preds = %cond.false, %entry
+ %cmp1 = icmp eq %struct.A* %pA, null, !dbg !34
+ br i1 %cmp1, label %cond.false3, label %cond.end4, !dbg !34
+
+cond.false3: ; preds = %cond.end
+ tail call void @_Assert(i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @__func__.fred, i32 0, i32 0)) #0, !dbg !35
+ br label %cond.end4, !dbg !35
+
+cond.end4: ; preds = %cond.false3, %cond.end
+ %p2 = getelementptr inbounds %struct.A, %struct.A* %pA, i32 0, i32 0, !dbg !36
+ %0 = load %struct.B*, %struct.B** %p2, align 4, !dbg !38, !tbaa !39
+ %cmp5 = icmp eq %struct.B* %0, null, !dbg !44
+ br i1 %cmp5, label %if.then, label %if.end, !dbg !45
+
+if.then: ; preds = %cond.end4
+ %p1 = getelementptr inbounds %struct.A, %struct.A* %pA, i32 0, i32 1, !dbg !46
+ store %struct.B* %p, %struct.B** %p1, align 4, !dbg !48, !tbaa !49
+ store %struct.B* %p, %struct.B** %p2, align 4, !dbg !50, !tbaa !39
+ %p4 = getelementptr inbounds %struct.B, %struct.B* %p, i32 0, i32 1, !dbg !51
+ store %struct.B* null, %struct.B** %p4, align 4, !dbg !52, !tbaa !53
+ %p5 = getelementptr inbounds %struct.B, %struct.B* %p, i32 0, i32 2, !dbg !55
+ store %struct.B* null, %struct.B** %p5, align 4, !dbg !56, !tbaa !57
+ br label %return, !dbg !58
+
+if.end: ; preds = %cond.end4
+ %1 = ptrtoint %struct.B* %0 to i32, !dbg !59
+ %p57 = getelementptr inbounds %struct.B, %struct.B* %p, i32 0, i32 2, !dbg !60
+ store %struct.B* null, %struct.B** %p57, align 4, !dbg !61, !tbaa !57
+ %p49 = getelementptr inbounds %struct.B, %struct.B* %p, i32 0, i32 1, !dbg !62
+ %2 = bitcast %struct.B** %p49 to i32*, !dbg !63
+ store i32 %1, i32* %2, align 4, !dbg !63, !tbaa !53
+ %p511 = getelementptr inbounds %struct.B, %struct.B* %0, i32 0, i32 2, !dbg !64
+ store %struct.B* %p, %struct.B** %p511, align 4, !dbg !65, !tbaa !57
+ store %struct.B* %p, %struct.B** %p2, align 4, !dbg !66, !tbaa !39
+ br label %return, !dbg !67
+
+return: ; preds = %if.end, %if.then
+ ret void, !dbg !68
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (http://llvm.org/git/clang.git 4b380bc1db8b0c72bdbdaf0e4697b1a84100a369) (http://llvm.org/git/llvm.git 6217a62bc009d55e160dbb694f2e94a22c80809f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "bug.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 3.9.0 (http://llvm.org/git/clang.git 4b380bc1db8b0c72bdbdaf0e4697b1a84100a369) (http://llvm.org/git/llvm.git 6217a62bc009d55e160dbb694f2e94a22c80809f)"}
+!6 = distinct !DISubprogram(name: "fred", scope: !1, file: !1, line: 138, type: !7, isLocal: false, isDefinition: true, scopeLine: 139, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !25)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null, !9, !15}
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
+!10 = !DIDerivedType(tag: DW_TAG_typedef, name: "A", file: !11, line: 57, baseType: !12)
+!11 = !DIFile(filename: "bug.h", directory: "/")
+!12 = distinct !DICompositeType(tag: DW_TAG_structure_type, file: !11, line: 54, size: 64, align: 32, elements: !13)
+!13 = !{!14, !24}
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "p2", scope: !12, file: !11, line: 55, baseType: !15, size: 32, align: 32)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32, align: 32)
+!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "B", file: !11, line: 50, baseType: !17)
+!17 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "B", file: !11, line: 45, size: 96, align: 32, elements: !18)
+!18 = !{!19, !21, !23}
+!19 = !DIDerivedType(tag: DW_TAG_member, name: "p3", scope: !17, file: !11, line: 47, baseType: !20, size: 32, align: 32)
+!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 32, align: 32)
+!21 = !DIDerivedType(tag: DW_TAG_member, name: "p4", scope: !17, file: !11, line: 48, baseType: !22, size: 32, align: 32, offset: 32)
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 32, align: 32)
+!23 = !DIDerivedType(tag: DW_TAG_member, name: "p5", scope: !17, file: !11, line: 49, baseType: !22, size: 32, align: 32, offset: 64)
+!24 = !DIDerivedType(tag: DW_TAG_member, name: "p1", scope: !12, file: !11, line: 56, baseType: !15, size: 32, align: 32, offset: 32)
+!25 = !{!26, !27}
+!26 = !DILocalVariable(name: "pA", arg: 1, scope: !6, file: !1, line: 138, type: !9)
+!27 = !DILocalVariable(name: "p", arg: 2, scope: !6, file: !1, line: 138, type: !15)
+!28 = !DIExpression()
+!29 = !DILocation(line: 138, column: 34, scope: !6)
+!30 = !DILocation(line: 138, column: 57, scope: !6)
+!31 = !DILocation(line: 140, column: 5, scope: !6)
+!32 = !DILocation(line: 140, column: 5, scope: !33)
+!33 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 2)
+!34 = !DILocation(line: 141, column: 5, scope: !6)
+!35 = !DILocation(line: 141, column: 5, scope: !33)
+!36 = !DILocation(line: 143, column: 30, scope: !37)
+!37 = distinct !DILexicalBlock(scope: !6, file: !1, line: 143, column: 9)
+!38 = !DILocation(line: 155, column: 18, scope: !6)
+!39 = !{!40, !41, i64 0}
+!40 = !{!"", !41, i64 0, !41, i64 4}
+!41 = !{!"any pointer", !42, i64 0}
+!42 = !{!"omnipotent char", !43, i64 0}
+!43 = !{!"Simple C/C++ TBAA"}
+!44 = !DILocation(line: 143, column: 14, scope: !37)
+!45 = !DILocation(line: 143, column: 9, scope: !6)
+!46 = !DILocation(line: 146, column: 26, scope: !47)
+!47 = distinct !DILexicalBlock(scope: !37, file: !1, line: 143, column: 41)
+!48 = !DILocation(line: 146, column: 36, scope: !47)
+!49 = !{!40, !41, i64 4}
+!50 = !DILocation(line: 145, column: 32, scope: !47)
+!51 = !DILocation(line: 147, column: 20, scope: !47)
+!52 = !DILocation(line: 147, column: 29, scope: !47)
+!53 = !{!54, !41, i64 4}
+!54 = !{!"B", !41, i64 0, !41, i64 4, !41, i64 8}
+!55 = !DILocation(line: 148, column: 20, scope: !47)
+!56 = !DILocation(line: 148, column: 29, scope: !47)
+!57 = !{!54, !41, i64 8}
+!58 = !DILocation(line: 149, column: 9, scope: !47)
+!59 = !DILocation(line: 154, column: 41, scope: !6)
+!60 = !DILocation(line: 153, column: 16, scope: !6)
+!61 = !DILocation(line: 153, column: 25, scope: !6)
+!62 = !DILocation(line: 154, column: 16, scope: !6)
+!63 = !DILocation(line: 154, column: 26, scope: !6)
+!64 = !DILocation(line: 155, column: 29, scope: !6)
+!65 = !DILocation(line: 155, column: 39, scope: !6)
+!66 = !DILocation(line: 156, column: 28, scope: !6)
+!67 = !DILocation(line: 157, column: 1, scope: !6)
+!68 = !DILocation(line: 157, column: 1, scope: !69)
+!69 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 1)
diff --git a/test/CodeGen/Hexagon/newvaluestore.ll b/test/CodeGen/Hexagon/newvaluestore.ll
index 13cbba2d08e1..cc1ff00ecdcd 100644
--- a/test/CodeGen/Hexagon/newvaluestore.ll
+++ b/test/CodeGen/Hexagon/newvaluestore.ll
@@ -1,22 +1,13 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
-; Check that we generate new value store packet in V4
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Check that we generate new value store.
@i = global i32 0, align 4
-@j = global i32 10, align 4
-@k = global i32 100, align 4
-define i32 @main() nounwind {
+define i32 @main(i32 %x, i32* %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}+#{{[0-9]+}}) = r{{[0-9]+}}.new
- %number1 = alloca i32, align 4
- %number2 = alloca i32, align 4
- %number3 = alloca i32, align 4
- %0 = load i32 , i32 * @i, align 4
- store i32 %0, i32* %number1, align 4
- %1 = load i32 , i32 * @j, align 4
- store i32 %1, i32* %number2, align 4
- %2 = load i32 , i32 * @k, align 4
- store i32 %2, i32* %number3, align 4
- ret i32 %0
+ %t0 = load i32, i32* @i, align 4
+ store i32 %t0, i32* %p, align 4
+ ret i32 %x
}
diff --git a/test/CodeGen/Hexagon/opt-addr-mode.ll b/test/CodeGen/Hexagon/opt-addr-mode.ll
new file mode 100644
index 000000000000..7cb437c327cf
--- /dev/null
+++ b/test/CodeGen/Hexagon/opt-addr-mode.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=0 -disable-hexagon-amodeopt < %s | FileCheck %s --check-prefix=CHECK-NO-AMODE
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=0 -disable-hexagon-amodeopt=0 -hexagon-amode-growth-limit=4 < %s | FileCheck %s --check-prefix=CHECK-AMODE
+
+; CHECK-NO-AMODE: [[REG0:(r[0-9]+)]] = ##global_2
+; CHECK-NO-AMODE: memw([[REG0]] + {{.*}}<<#2) =
+
+; CHECK-AMODE: [[REG1:(r[0-9]+)]] = memw(##global_1)
+; CHECK-AMODE: memw([[REG1]]<<#2 + ##global_2) =
+
+@global_1 = external global i32, align 4
+@global_2 = external global [128 x i32], align 8
+
+declare i32 @foo(i32, i32) #0
+
+define i32 @fred(i32 %a0, i32 %a1, i32* %p) #0 {
+entry:
+ %call24 = tail call i32 @foo(i32 %a0, i32 1) #0
+ %tobool26 = icmp eq i32 %call24, 0
+ br i1 %tobool26, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph: ; preds = %entry
+ %cmp3 = icmp sgt i32 %a1, 19
+ %sub = sub nsw i32 19, %a0
+ %xor = xor i32 %a0, 1
+ br i1 %cmp3, label %while.body.us.preheader, label %while.body.preheader
+
+while.body.preheader: ; preds = %while.body.lr.ph
+ br label %while.body
+
+while.body.us.preheader: ; preds = %while.body.lr.ph
+ br label %while.body.us
+
+while.body.us: ; preds = %while.body.us.preheader, %while.cond.backedge.us
+ %call27.us = phi i32 [ %call.us, %while.cond.backedge.us ], [ %call24, %while.body.us.preheader ]
+ %x0 = load i32, i32* %p, align 4, !tbaa !4
+ %cmp.us = icmp sgt i32 %x0, 0
+ br i1 %cmp.us, label %if.then.us, label %if.end.us
+
+if.then.us: ; preds = %while.body.us
+ %sext.us = shl i32 %call27.us, 24
+ %conv2.us = ashr i32 %sext.us, 24
+ %x10 = tail call i32 @foo(i32 %conv2.us, i32 %sext.us) #0
+ br label %if.end.us
+
+if.end.us: ; preds = %if.then.us, %while.body.us
+ %x1 = load i32, i32* %p, align 4, !tbaa !4
+ %call8.us = tail call i32 @foo(i32 %sub, i32 %a1) #0
+ %tobool11.us = icmp eq i32 %call8.us, 0
+ br i1 %tobool11.us, label %while.cond.backedge.us, label %if.then12.us
+
+if.then12.us: ; preds = %if.end.us
+ %x3 = load i32, i32* %p, align 4, !tbaa !4
+ %sub13.us = sub i32 %x3, %x1
+ %x4 = load i32, i32* @global_1, align 4, !tbaa !4
+ %arrayidx.us = getelementptr inbounds [128 x i32], [128 x i32]* @global_2, i32 0, i32 %x4
+ store i32 %sub13.us, i32* %arrayidx.us, align 4, !tbaa !4
+ br label %while.cond.backedge.us
+
+while.cond.backedge.us: ; preds = %if.then12.us, %if.end.us
+ %call.us = tail call i32 @foo(i32 %a0, i32 2) #0
+ %tobool.us = icmp eq i32 %call.us, 0
+ br i1 %tobool.us, label %while.end.loopexit, label %while.body.us
+
+while.body: ; preds = %while.body.preheader, %while.cond.backedge
+ %call27 = phi i32 [ %call, %while.cond.backedge ], [ %call24, %while.body.preheader ]
+ %x5 = load i32, i32* %p, align 4, !tbaa !4
+ %cmp = icmp sgt i32 %x5, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %while.body
+ %sext = shl i32 %call27, 24
+ %conv2 = ashr i32 %sext, 24
+ %x11 = tail call i32 @foo(i32 %conv2, i32 %sext) #0
+ br label %if.end
+
+if.end: ; preds = %if.then, %while.body
+ %tobool11 = icmp eq i32 %call27, 0
+ br i1 %tobool11, label %while.cond.backedge, label %if.then12
+
+if.then12: ; preds = %if.end
+ %x7 = load i32, i32* @global_1, align 4, !tbaa !4
+ %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @global_2, i32 0, i32 %x7
+ store i32 0, i32* %arrayidx, align 4, !tbaa !4
+ br label %while.cond.backedge
+
+while.cond.backedge: ; preds = %if.then12, %if.end
+ %call = tail call i32 @foo(i32 %a0, i32 3) #0
+ %tobool = icmp eq i32 %call, 0
+ br i1 %tobool, label %while.end.loopexit33, label %while.body
+
+while.end.loopexit: ; preds = %while.cond.backedge.us
+ br label %while.end
+
+while.end.loopexit33: ; preds = %while.cond.backedge
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit33, %while.end.loopexit, %entry
+ ret i32 0
+}
+
+attributes #0 = { nounwind }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !2, i64 0}
diff --git a/test/CodeGen/Hexagon/packetize-tailcall-arg.ll b/test/CodeGen/Hexagon/packetize-tailcall-arg.ll
new file mode 100644
index 000000000000..17afd7df94a3
--- /dev/null
+++ b/test/CodeGen/Hexagon/packetize-tailcall-arg.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; There should only be one packet:
+; {
+; jump free
+; r0 = memw(r0 + #-4)
+; }
+;
+; CHECK: {
+; CHECK-NOT: {
+
+define void @fred(i8* %p) nounwind {
+entry:
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 -4
+ %t0 = bitcast i8* %arrayidx to i8**
+ %t1 = load i8*, i8** %t0, align 4
+ tail call void @free(i8* %t1)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture) nounwind
+
diff --git a/test/CodeGen/Hexagon/peephole-op-swap.ll b/test/CodeGen/Hexagon/peephole-op-swap.ll
new file mode 100644
index 000000000000..32db7851fb8b
--- /dev/null
+++ b/test/CodeGen/Hexagon/peephole-op-swap.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+; The operand-swapping code in HexagonPeephole was not handling subregisters
+; correctly, resulting in a crash on this code.
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+@float_rounding_mode = external global i8, align 1
+@float_exception_flags = external global i8, align 1
+
+; Function Attrs: nounwind
+define i64 @fred(i32 %a) #0 {
+entry:
+ br i1 undef, label %cleanup, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %entry
+ %cmp3 = icmp eq i32 undef, 255
+ %tobool4 = icmp ne i32 undef, 0
+ %or.cond = and i1 %tobool4, %cmp3
+ %. = select i1 %or.cond, i64 9223372036854775807, i64 -9223372036854775808
+ br label %cleanup
+
+cleanup: ; preds = %lor.lhs.false, %entry
+ %retval.0 = phi i64 [ 9223372036854775807, %entry ], [ %., %lor.lhs.false ]
+ ret i64 %retval.0
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/Hexagon/pic-local.ll b/test/CodeGen/Hexagon/pic-local.ll
new file mode 100644
index 000000000000..48b0096aa652
--- /dev/null
+++ b/test/CodeGen/Hexagon/pic-local.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -relocation-model=pic < %s | FileCheck %s
+
+define private void @f1() {
+ ret void
+}
+
+define internal void @f2() {
+ ret void
+}
+
+define void()* @get_f1() {
+ ; CHECK: r0 = add(pc, ##.Lf1@PCREL)
+ ret void()* @f1
+}
+
+define void()* @get_f2() {
+ ; CHECK: r0 = add(pc, ##f2@PCREL)
+ ret void()* @f2
+}
diff --git a/test/CodeGen/Hexagon/pic-regusage.ll b/test/CodeGen/Hexagon/pic-regusage.ll
new file mode 100644
index 000000000000..53c4ba40cb9a
--- /dev/null
+++ b/test/CodeGen/Hexagon/pic-regusage.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=hexagon -relocation-model=pic < %s | FileCheck %s
+
+; Force the use of R14 (by clobbering everything else in the inline asm).
+; Make sure that R14 is not set before the __save call (which will clobber
+; R14, R15 and R28).
+; CHECK: call __save_r16_through_r27
+; CHECK: }
+; CHECK: r14{{ *}}=
+
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; Function Attrs: nounwind optsize
+define i32 @_Z7testR14Pi(i32* nocapture %res) #0 {
+entry:
+ %0 = load i32, i32* %res, align 4
+ %1 = tail call { i32, i32 } asm "r0=$2\0A\09$1=add(r0,#$3)\0A\09$0=add(r0,#$4)\0A\09", "=r,=r,r,i,i,~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27}"(i32 %0, i32 40, i32 50) #1
+ %asmresult = extractvalue { i32, i32 } %1, 0
+ %asmresult1 = extractvalue { i32, i32 } %1, 1
+ store i32 %asmresult, i32* %res, align 4
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %asmresult1) #2
+ %2 = load i32, i32* %res, align 4
+ %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %2) #2
+ ret i32 0
+}
+
+; Function Attrs: nounwind optsize
+declare i32 @printf(i8*, ...) #0
+
+; Same as above for R15.
+; CHECK: call __save_r16_through_r27
+; CHECK: }
+; CHECK: r15{{ *}}=
+
+; Function Attrs: nounwind optsize
+define i32 @_Z7testR15Pi(i32* nocapture %res) #0 {
+entry:
+ %0 = load i32, i32* %res, align 4
+ %1 = tail call { i32, i32 } asm "r0=$2\0A\09$1=add(r0,#$3)\0A\09$0=add(r0,#$4)\0A\09", "=r,=r,r,i,i,~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27}"(i32 %0, i32 40, i32 50) #1
+ %asmresult = extractvalue { i32, i32 } %1, 0
+ %asmresult1 = extractvalue { i32, i32 } %1, 1
+ store i32 %asmresult, i32* %res, align 4
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %asmresult1) #2
+ %2 = load i32, i32* %res, align 4
+ %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %2) #2
+ ret i32 0
+}
+
+; Same as above for R28.
+; CHECK: call __save_r16_through_r27
+; CHECK: }
+; CHECK: r28{{ *}}=
+
+; Function Attrs: nounwind optsize
+define i32 @_Z7testR28Pi(i32* nocapture %res) #0 {
+entry:
+ %0 = load i32, i32* %res, align 4
+ %1 = tail call { i32, i32 } asm "r0=$2\0A\09$1=add(r0,#$3)\0A\09$0=add(r0,#$4)\0A\09", "=r,=r,r,i,i,~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26}"(i32 %0, i32 40, i32 50) #1
+ %asmresult = extractvalue { i32, i32 } %1, 0
+ %asmresult1 = extractvalue { i32, i32 } %1, 1
+ store i32 %asmresult, i32* %res, align 4
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %asmresult1) #2
+ %2 = load i32, i32* %res, align 4
+ %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %2) #2
+ ret i32 0
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind optsize }
diff --git a/test/CodeGen/Hexagon/rdf-copy-undef2.ll b/test/CodeGen/Hexagon/rdf-copy-undef2.ll
new file mode 100644
index 000000000000..5f29d414ffc1
--- /dev/null
+++ b/test/CodeGen/Hexagon/rdf-copy-undef2.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+target triple = "hexagon"
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare signext i16 @cat(i16 signext) #1
+declare void @danny(i16 signext, i16 signext, i16 signext, i16* nocapture readonly, i16 signext, i16* nocapture) #1
+declare void @sammy(i16* nocapture readonly, i16* nocapture readonly, i16* nocapture readonly, i32* nocapture, i16* nocapture, i16 signext, i16 signext, i16 signext) #1
+declare i8* @llvm.stacksave() #2
+declare void @llvm.stackrestore(i8*) #2
+
+define i32 @fred(i16 signext %p0, i16 signext %p1, i16* nocapture readonly %p2, i16 signext %p3, i16* nocapture readonly %p4, i16* nocapture %p5) #1 {
+entry:
+ %0 = tail call i8* @llvm.stacksave()
+ %vla = alloca i16, i32 undef, align 8
+ %call17 = call signext i16 @cat(i16 signext 1) #1
+ br i1 undef, label %for.cond23.preheader, label %for.end47
+
+for.cond23.preheader: ; preds = %for.end40, %entry
+ %i.190 = phi i16 [ %inc46, %for.end40 ], [ 0, %entry ]
+ br i1 undef, label %for.body27, label %for.end40
+
+for.body27: ; preds = %for.body27, %for.cond23.preheader
+ %indvars.iv = phi i32 [ %indvars.iv.next, %for.body27 ], [ 0, %for.cond23.preheader ]
+ %call30 = call signext i16 @cat(i16 signext 7) #1
+ %arrayidx32 = getelementptr inbounds i16, i16* %vla, i32 %indvars.iv
+ store i16 %call30, i16* %arrayidx32, align 2
+ %arrayidx37 = getelementptr inbounds i16, i16* undef, i32 %indvars.iv
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i16 undef, %p3
+ br i1 %exitcond, label %for.end40, label %for.body27
+
+for.end40: ; preds = %for.body27, %for.cond23.preheader
+ call void @sammy(i16* nonnull undef, i16* undef, i16* %p4, i32* null, i16* undef, i16 signext undef, i16 signext undef, i16 signext undef) #1
+ %inc46 = add nuw nsw i16 %i.190, 1
+ %exitcond94 = icmp eq i16 %inc46, %call17
+ br i1 %exitcond94, label %for.end47.loopexit, label %for.cond23.preheader
+
+for.end47.loopexit: ; preds = %for.end40
+ %.pre = load i16, i16* undef, align 2
+ br label %for.end47
+
+for.end47: ; preds = %for.end47.loopexit, %entry
+ %1 = phi i16 [ %.pre, %for.end47.loopexit ], [ 0, %entry ]
+ call void @danny(i16 signext %1, i16 signext %p0, i16 signext %p1, i16* %p2, i16 signext %p3, i16* %p5) #1
+ call void @llvm.stackrestore(i8* %0)
+ ret i32 undef
+}
+
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { optsize }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/Hexagon/rdf-copy.ll b/test/CodeGen/Hexagon/rdf-copy.ll
index 96153ca31fa4..afb03a6315d7 100644
--- a/test/CodeGen/Hexagon/rdf-copy.ll
+++ b/test/CodeGen/Hexagon/rdf-copy.ll
@@ -16,7 +16,7 @@
; CHECK-LABEL: LBB0_1
; CHECK: [[DST:r[0-9]+]] = [[SRC:r[0-9]+]]
; CHECK-DAG: memw([[SRC]]
-; CHECK-DAG-NOT: memw([[DST]]
+; CHECK-NOT: memw([[DST]]
; CHECK-LABEL: LBB0_2
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
diff --git a/test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll b/test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll
new file mode 100644
index 000000000000..7adf7e8a5355
--- /dev/null
+++ b/test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: r0 = #24
+; CHECK-NEXT: r1 =
+; R2 should be assigned a value from R3+.
+; CHECK-NEXT: r2 = r{{[3-9]}}
+; CHECK-NEXT: trap0
+
+target datalayout = "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32"
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define i32 @foo(i32 %status) #0 {
+entry:
+ %arg1 = alloca i32, align 4
+ %0 = bitcast i32* %arg1 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #2
+ store i32 %status, i32* %arg1, align 4, !tbaa !1
+ %1 = call i32 asm sideeffect "r0 = #$1\0Ar1 = $2\0Ar2 = $4\0Atrap0 (#0)\0A$0 = r0", "=r,i,r,*m,r,~{r0},~{r1},~{r2}"(i32 24, i32* nonnull %arg1, i32* nonnull %arg1, i32 %status) #2, !srcloc !5
+ call void @llvm.lifetime.end(i64 4, i8* %0) #2
+ ret i32 %1
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv5" "target-features"="-hvx,-hvx-double" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{i32 110, i32 129, i32 146, i32 163, i32 183}
diff --git a/test/CodeGen/Hexagon/rdf-inline-asm.ll b/test/CodeGen/Hexagon/rdf-inline-asm.ll
new file mode 100644
index 000000000000..ae09062638dc
--- /dev/null
+++ b/test/CodeGen/Hexagon/rdf-inline-asm.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32"
+target triple = "hexagon"
+
+@x = common global i32* null, align 4
+
+; Function Attrs: nounwind
+define i32 @inotify_init() #0 {
+entry:
+ %0 = tail call i32 asm sideeffect "trap0(#1);\0A", "={r0},{r6},~{memory}"(i32 1043) #1, !srcloc !1
+ %cmp = icmp sgt i32 %0, -4096
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %sub = sub nsw i32 0, %0
+ %1 = load i32*, i32** @x, align 4, !tbaa !2
+ store i32 %sub, i32* %1, align 4, !tbaa !6
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %retval1.0 = phi i32 [ -1, %if.then ], [ %0, %entry ]
+ ret i32 %retval1.0
+}
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!1 = !{i32 155}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"any pointer", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"long", !4, i64 0}
diff --git a/test/CodeGen/Hexagon/rdf-reset-kills.ll b/test/CodeGen/Hexagon/rdf-reset-kills.ll
new file mode 100644
index 000000000000..37db8c5f64e6
--- /dev/null
+++ b/test/CodeGen/Hexagon/rdf-reset-kills.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+; This test used to crash in the register scavenger due to incorrectly set
+; kill flags.
+
+target triple = "hexagon"
+
+define void @foo(i64 %a) #0 {
+entry:
+ %conv.i = and i64 %a, 9218868437227405312
+ %cmp = icmp ne i64 %conv.i, 9218868437227405312
+ %and.i37 = and i64 %a, 4503599627370495
+ %tobool = icmp eq i64 %and.i37, 0
+ %or.cond = or i1 %cmp, %tobool
+ br i1 %or.cond, label %lor.lhs.false, label %if.then
+
+lor.lhs.false: ; preds = %entry
+ br i1 undef, label %return, label %if.then
+
+if.then: ; preds = %lor.lhs.false, %entry
+ br label %return
+
+return: ; preds = %if.then, %lor.lhs.false
+ ret void
+}
+
+attributes #0 = { norecurse nounwind }
diff --git a/test/CodeGen/Hexagon/reg-scavengebug-3.ll b/test/CodeGen/Hexagon/reg-scavengebug-3.ll
new file mode 100644
index 000000000000..db9ed55d2da6
--- /dev/null
+++ b/test/CodeGen/Hexagon/reg-scavengebug-3.ll
@@ -0,0 +1,80 @@
+; RUN: llc -O0 -march=hexagon -mcpu=hexagonv60 < %s | FileCheck %s
+
+; CHECK: vmem
+
+target triple = "hexagon"
+
+@vecpreds = external global [15 x <16 x i32>], align 64
+@vectors = external global [15 x <16 x i32>], align 64
+@vector_pairs = external global [15 x <32 x i32>], align 128
+@.str1 = external hidden unnamed_addr constant [20 x i8], align 1
+@.str2 = external hidden unnamed_addr constant [43 x i8], align 1
+@Q6VecPredResult = external global <16 x i32>, align 64
+@.str52 = external hidden unnamed_addr constant [57 x i8], align 1
+@.str54 = external hidden unnamed_addr constant [59 x i8], align 1
+@VectorResult = external global <16 x i32>, align 64
+@.str243 = external hidden unnamed_addr constant [60 x i8], align 1
+@.str251 = external hidden unnamed_addr constant [77 x i8], align 1
+@.str290 = external hidden unnamed_addr constant [65 x i8], align 1
+@VectorPairResult = external global <32 x i32>, align 128
+
+; Function Attrs: nounwind
+declare void @print_vector(i32, i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @printf(i8*, ...) #0
+
+; Function Attrs: nounwind
+declare void @print_vecpred(i32, i8*) #0
+
+; Function Attrs: nounwind readnone
+declare <16 x i32> @llvm.hexagon.V6.vandqrt(<512 x i1>, i32) #1
+
+; Function Attrs: nounwind
+declare void @init_vectors() #0
+
+; Function Attrs: nounwind readnone
+declare <512 x i1> @llvm.hexagon.V6.vandvrt(<16 x i32>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare <16 x i32> @llvm.hexagon.V6.lvsplatw(i32) #1
+
+; Function Attrs: nounwind
+declare void @init_addresses() #0
+
+; Function Attrs: nounwind
+declare <16 x i32> @llvm.hexagon.V6.vsubhnq(<512 x i1>, <16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+ %0 = load <16 x i32>, <16 x i32>* getelementptr inbounds ([15 x <16 x i32>], [15 x <16 x i32>]* @vecpreds, i32 0, i32 0), align 64
+ %1 = load <16 x i32>, <16 x i32>* getelementptr inbounds ([15 x <16 x i32>], [15 x <16 x i32>]* @vectors, i32 0, i32 1), align 64
+ call void @print_vecpred(i32 64, i8* bitcast (<16 x i32>* @Q6VecPredResult to i8*))
+ %2 = load <16 x i32>, <16 x i32>* getelementptr inbounds ([15 x <16 x i32>], [15 x <16 x i32>]* @vectors, i32 0, i32 1), align 64
+ %call50 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([57 x i8], [57 x i8]* @.str52, i32 0, i32 0)) #3
+ %3 = load <16 x i32>, <16 x i32>* getelementptr inbounds ([15 x <16 x i32>], [15 x <16 x i32>]* @vectors, i32 0, i32 1), align 64
+ %call52 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([59 x i8], [59 x i8]* @.str54, i32 0, i32 0)) #3
+ %4 = load <16 x i32>, <16 x i32>* getelementptr inbounds ([15 x <16 x i32>], [15 x <16 x i32>]* @vectors, i32 0, i32 1), align 64
+ %call300 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str290, i32 0, i32 0)) #3
+ %5 = load <16 x i32>, <16 x i32>* getelementptr inbounds ([15 x <16 x i32>], [15 x <16 x i32>]* @vectors, i32 0, i32 0), align 64
+ %6 = load <16 x i32>, <16 x i32>* getelementptr inbounds ([15 x <16 x i32>], [15 x <16 x i32>]* @vectors, i32 0, i32 1), align 64
+ %call1373 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str1, i32 0, i32 0), i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str2, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8], [60 x i8]* @.str243, i32 0, i32 0)) #3
+ %7 = call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 1)
+ %call1381 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str1, i32 0, i32 0), i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str2, i32 0, i32 0), i8* getelementptr inbounds ([77 x i8], [77 x i8]* @.str251, i32 0, i32 0)) #3
+ %8 = call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 1)
+ %9 = call <512 x i1> @llvm.hexagon.V6.vandvrt(<16 x i32> %8, i32 16843009)
+ call void @print_vector(i32 64, i8* bitcast (<16 x i32>* @VectorResult to i8*))
+ %10 = call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 1)
+ %11 = call <512 x i1> @llvm.hexagon.V6.vandvrt(<16 x i32> %10, i32 16843009)
+ %12 = bitcast <512 x i1> %11 to <16 x i32>
+ %13 = bitcast <16 x i32> %12 to <512 x i1>
+ %14 = call <16 x i32> @llvm.hexagon.V6.vsubhnq(<512 x i1> %13, <16 x i32> undef, <16 x i32> undef)
+ store <16 x i32> %14, <16 x i32>* @VectorResult, align 64
+ ret i32 0
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/Hexagon/reg-scavenger-valid-slot.ll b/test/CodeGen/Hexagon/reg-scavenger-valid-slot.ll
new file mode 100644
index 000000000000..78c4b989b7ac
--- /dev/null
+++ b/test/CodeGen/Hexagon/reg-scavenger-valid-slot.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+; This testcase tries to force spills of both vector and int registers
+; in a function where scavenging slots were reserved for both register
+; classes. The original problem was that the scavenger selected an int
+; slot (with size/alignment of 4) for a vector register (with size/
+; alignment of 64). This caused an assertion in the assembler printer
+; due to an offset in a vector store having unexpected low-order bits.
+
+; We cannot directly check whether the bits appear or not, since they will be
+; truncated off by the time we see the output, but we can check that
+; we got to the end of the function without crashing.
+
+; CHECK: endloop
+; CHECK: dealloc_return
+
+target triple = "hexagon"
+
+define void @foo(<16 x i32>* nocapture readnone %p) #0 {
+entry:
+ %0 = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r"() #1
+ %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 0
+ %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 1
+ %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 2
+ %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 3
+ %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 4
+ %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 5
+ %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 6
+ %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 7
+ %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 8
+ %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 9
+ %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 10
+ %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 11
+ %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 12
+ %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 13
+ %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 14
+ %asmresult15 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 15
+ %asmresult16 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 16
+ %asmresult17 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 17
+ %asmresult18 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 18
+ %asmresult19 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 19
+ %asmresult20 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 20
+ %asmresult21 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 21
+ %asmresult22 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 22
+ %asmresult23 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 23
+ %asmresult24 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 24
+ %asmresult25 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 25
+ %asmresult26 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 26
+ %asmresult27 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 27
+ %asmresult28 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %0, 28
+ %1 = tail call { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } asm "nop", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v"() #1
+ %asmresult29 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 0
+ %asmresult30 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 1
+ %asmresult31 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 2
+ %asmresult32 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 3
+ %asmresult33 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 4
+ %asmresult34 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 5
+ %asmresult35 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 6
+ %asmresult36 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 7
+ %asmresult37 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 8
+ %asmresult38 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 9
+ %asmresult39 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 10
+ %asmresult40 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 11
+ %asmresult41 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 12
+ %asmresult42 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 13
+ %asmresult43 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 14
+ %asmresult44 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 15
+ %asmresult45 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 16
+ %asmresult46 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 17
+ %asmresult47 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 18
+ %asmresult48 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 19
+ %asmresult49 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 20
+ %asmresult50 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 21
+ %asmresult51 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 22
+ %asmresult52 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 23
+ %asmresult53 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 24
+ %asmresult54 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 25
+ %asmresult55 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 26
+ %asmresult56 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 27
+ %asmresult57 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 28
+ %asmresult58 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 29
+ %asmresult59 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 30
+ %asmresult60 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %1, 31
+ %2 = tail call { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } asm "nop", "=q,=q,=q,=q"() #1
+ %asmresult61 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %2, 0
+ %asmresult62 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %2, 1
+ %asmresult63 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %2, 2
+ %asmresult64 = extractvalue { <16 x i32>, <16 x i32>, <16 x i32>, <16 x i32> } %2, 3
+ %3 = tail call <16 x i32> asm "nop", "=q,q,q,q,q"(<16 x i32> %asmresult61, <16 x i32> %asmresult62, <16 x i32> %asmresult63, <16 x i32> %asmresult64) #1
+ tail call void asm sideeffect "nop", "q,q,q"(<16 x i32> %asmresult61, <16 x i32> %asmresult62, <16 x i32> %asmresult63) #2
+ tail call void asm sideeffect "nop", "q,q"(<16 x i32> %asmresult64, <16 x i32> %3) #2
+ tail call void asm sideeffect "nop", "v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v"(<16 x i32> %asmresult29, <16 x i32> %asmresult30, <16 x i32> %asmresult31, <16 x i32> %asmresult32, <16 x i32> %asmresult33, <16 x i32> %asmresult34, <16 x i32> %asmresult35, <16 x i32> %asmresult36, <16 x i32> %asmresult37, <16 x i32> %asmresult38, <16 x i32> %asmresult39, <16 x i32> %asmresult40, <16 x i32> %asmresult41, <16 x i32> %asmresult42, <16 x i32> %asmresult43, <16 x i32> %asmresult44, <16 x i32> %asmresult45, <16 x i32> %asmresult46, <16 x i32> %asmresult47, <16 x i32> %asmresult48, <16 x i32> %asmresult49, <16 x i32> %asmresult50, <16 x i32> %asmresult51, <16 x i32> %asmresult52, <16 x i32> %asmresult53, <16 x i32> %asmresult54, <16 x i32> %asmresult55, <16 x i32> %asmresult56, <16 x i32> %asmresult57, <16 x i32> %asmresult58, <16 x i32> %asmresult59, <16 x i32> %asmresult60) #2
+ tail call void asm sideeffect "nop", "r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7, i32 %asmresult8, i32 %asmresult9, i32 %asmresult10, i32 %asmresult11, i32 %asmresult12, i32 %asmresult13, i32 %asmresult14, i32 %asmresult15, i32 %asmresult16, i32 %asmresult17, i32 %asmresult18, i32 %asmresult19, i32 %asmresult20, i32 %asmresult21, i32 %asmresult22, i32 %asmresult23, i32 %asmresult24, i32 %asmresult25, i32 %asmresult26, i32 %asmresult27, i32 %asmresult28) #2
+ ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/Hexagon/restore-single-reg.ll b/test/CodeGen/Hexagon/restore-single-reg.ll
new file mode 100644
index 000000000000..8abd4a855463
--- /dev/null
+++ b/test/CodeGen/Hexagon/restore-single-reg.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Generate the inline restore for a single register pair for functions
+; with the "optsize" attribute.
+
+; CHECK-LABEL: fred_os
+; CHECK-DAG: memd{{.*}} = r17:16
+; CHECK-DAG: r17:16 = memd{{.*}}
+; CHECK-DAG: deallocframe
+; CHECK-NOT: call __restore
+
+define i32 @fred_os(i32 %x) #0 {
+entry:
+ %call = tail call i32 @foo(i32 %x) #2
+ %call1 = tail call i32 @bar(i32 %x, i32 %call) #2
+ ret i32 %call1
+}
+
+; Generate the restoring call for a single register pair for functions
+; with the "minsize" attribute.
+
+; CHECK-LABEL: fred_oz
+; CHECK-DAG: memd{{.*}} = r17:16
+; CHECK-NOT: r17:16 = memd{{.*}}
+; CHECK-DAG: call __restore
+
+define i32 @fred_oz(i32 %x) #1 {
+entry:
+ %call = tail call i32 @foo(i32 %x) #2
+ %call1 = tail call i32 @bar(i32 %x, i32 %call) #2
+ ret i32 %call1
+}
+
+declare i32 @foo(i32) #2
+declare i32 @bar(i32, i32) #2
+
+attributes #0 = { nounwind optsize "disable-tail-calls"="false" }
+attributes #1 = { nounwind minsize "disable-tail-calls"="false" }
+attributes #2 = { nounwind optsize }
diff --git a/test/CodeGen/Hexagon/ret-struct-by-val.ll b/test/CodeGen/Hexagon/ret-struct-by-val.ll
new file mode 100644
index 000000000000..26ed2ff36f77
--- /dev/null
+++ b/test/CodeGen/Hexagon/ret-struct-by-val.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: r0 = add(r0, r1)
+
+; Allow simple structures to be returned by value.
+
+%s = type { i32, i32 }
+
+declare %s @foo() #0
+
+define i32 @fred() #0 {
+ %t0 = call %s @foo()
+ %x = extractvalue %s %t0, 0
+ %y = extractvalue %s %t0, 1
+ %r = add i32 %x, %y
+ ret i32 %r
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/Hexagon/runtime-stkchk.ll b/test/CodeGen/Hexagon/runtime-stkchk.ll
new file mode 100644
index 000000000000..a4e8f117679e
--- /dev/null
+++ b/test/CodeGen/Hexagon/runtime-stkchk.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv55 -enable-stackovf-sanitizer < %s | FileCheck %s
+
+; CHECK-LABEL: foo_1
+; CHECK: __runtime_stack_check
+define i32 @foo_1(i32 %n) #0 {
+entry:
+ %local = alloca [1024 x i32], align 8
+ %0 = bitcast [1024 x i32]* %local to i8*
+ call void @llvm.lifetime.start(i64 4096, i8* %0) #1
+ %arraydecay = getelementptr inbounds [1024 x i32], [1024 x i32]* %local, i32 0, i32 0
+ call void @baz_1(i32* %arraydecay) #3
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* %local, i32 0, i32 %n
+ %1 = load i32, i32* %arrayidx, align 4
+ call void @llvm.lifetime.end(i64 4096, i8* %0) #1
+ ret i32 %1
+}
+
+; CHECK-LABEL: foo_2
+; CHECK: __save_r16_through_r19_stkchk
+define i32 @foo_2(i32 %n, i32* %y) #0 {
+entry:
+ %local = alloca [2048 x i32], align 8
+ %0 = bitcast [2048 x i32]* %local to i8*
+ call void @llvm.lifetime.start(i64 8192, i8* %0) #1
+ %arraydecay = getelementptr inbounds [2048 x i32], [2048 x i32]* %local, i32 0, i32 0
+ call void @baz_2(i32* %y, i32* %arraydecay) #3
+ %1 = load i32, i32* %y, align 4
+ %add = add nsw i32 %n, %1
+ %arrayidx = getelementptr inbounds [2048 x i32], [2048 x i32]* %local, i32 0, i32 %add
+ %2 = load i32, i32* %arrayidx, align 4
+ call void @llvm.lifetime.end(i64 8192, i8* %0) #1
+ ret i32 %2
+}
+
+declare void @baz_1(i32*) #2
+declare void @baz_2(i32*, i32*) #2
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { optsize }
+
diff --git a/test/CodeGen/Hexagon/sdata-array.ll b/test/CodeGen/Hexagon/sdata-array.ll
new file mode 100644
index 000000000000..89ef46079f7c
--- /dev/null
+++ b/test/CodeGen/Hexagon/sdata-array.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; No arrays in sdata.
+; CHECK: memb(##foo)
+
+@foo = common global [4 x i8] zeroinitializer, align 1
+
+define void @set() nounwind {
+entry:
+ store i8 0, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @foo, i32 0, i32 0), align 1
+ ret void
+}
+
diff --git a/test/CodeGen/Hexagon/sdata-basic.ll b/test/CodeGen/Hexagon/sdata-basic.ll
new file mode 100644
index 000000000000..db7375417df9
--- /dev/null
+++ b/test/CodeGen/Hexagon/sdata-basic.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=hexagon -O2 < %s | FileCheck %s
+; CHECK-NOT: ##var
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+@var = external global i32
+
+define i32 @foo() nounwind readonly {
+entry:
+ %0 = load i32, i32* @var, align 4, !tbaa !0
+ ret i32 %0
+}
+
+!0 = !{!"int", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/section_7275.ll b/test/CodeGen/Hexagon/section_7275.ll
new file mode 100644
index 000000000000..c2b80ae3f69d
--- /dev/null
+++ b/test/CodeGen/Hexagon/section_7275.ll
@@ -0,0 +1,54 @@
+; The reason for the bug was that when deciding if a global
+; variable can be part of sdata, we were wrongly ignoring
+; the presence of any section specified for the variable
+; using the section attribute. If such a section is specified,
+; and that section is not sdata*/sbss*, then the variable
+; cannot use GPREL addressing, i.e. memw(#variablename).
+
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK-LABEL: foo
+; CHECK-DAG: memw(##b)
+; CHECK-DAG: memw(#d)
+; CHECK-DAG: memw(##g)
+; CHECK-DAG: memw(#h)
+; CHECK-DAG: memw(#f)
+; CHECK-DAG: memw(##e)
+; CHECK-DAG: memw(#a)
+; CHECK-DAG: memw(#c)
+; CHECK-LABEL: bar
+; CHECK: memw(##b)
+
+@b = global i32 0, section ".data.section", align 4
+@a = common global i32 0, align 4
+@d = global i32 0, section ".sbss", align 4
+@c = global i32 0, section ".sdata", align 4
+@f = global i32 0, section ".sbss.4", align 4
+@e = global i32 0, section ".sdatafoo", align 4
+@h = global i32 0, section ".sdata.4", align 4
+@g = global i32 0, section ".sbssfoo", align 4
+
+define void @foo() nounwind {
+entry:
+ %0 = load i32, i32* @b, align 4
+ store i32 %0, i32* @a, align 4
+ %1 = load i32, i32* @d, align 4
+ store i32 %1, i32* @c, align 4
+ %2 = load i32, i32* @f, align 4
+ store i32 %2, i32* @e, align 4
+ %3 = load i32, i32* @h, align 4
+ store i32 %3, i32* @g, align 4
+ ret void
+}
+
+define void @bar() nounwind section ".function.section" {
+entry:
+ %0 = load i32, i32* @a, align 4
+ store i32 %0, i32* @b, align 4
+ ret void
+}
+
+define i32 @main() nounwind readnone {
+entry:
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Hexagon/select-instr-align.ll b/test/CodeGen/Hexagon/select-instr-align.ll
new file mode 100644
index 000000000000..e3b2929d52f1
--- /dev/null
+++ b/test/CodeGen/Hexagon/select-instr-align.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=hexagon -enable-hexagon-hvx < %s | FileCheck %s
+; CHECK-LABEL: aligned_load:
+; CHECK: = vmem({{.*}})
+; CHECK-LABEL: aligned_store:
+; CHECK: vmem({{.*}}) =
+; CHECK-LABEL: unaligned_load:
+; CHECK: = vmemu({{.*}})
+; CHECK-LABEL: unaligned_store:
+; CHECK: vmemu({{.*}}) =
+
+define <16 x i32> @aligned_load(<16 x i32>* %p, <16 x i32> %a) {
+ %v = load <16 x i32>, <16 x i32>* %p, align 64
+ ret <16 x i32> %v
+}
+
+define void @aligned_store(<16 x i32>* %p, <16 x i32> %a) {
+ store <16 x i32> %a, <16 x i32>* %p, align 64
+ ret void
+}
+
+define <16 x i32> @unaligned_load(<16 x i32>* %p, <16 x i32> %a) {
+ %v = load <16 x i32>, <16 x i32>* %p, align 32
+ ret <16 x i32> %v
+}
+
+define void @unaligned_store(<16 x i32>* %p, <16 x i32> %a) {
+ store <16 x i32> %a, <16 x i32>* %p, align 32
+ ret void
+}
+
+
diff --git a/test/CodeGen/Hexagon/static.ll b/test/CodeGen/Hexagon/static.ll
index 760b8b559725..c3237b748881 100644
--- a/test/CodeGen/Hexagon/static.ll
+++ b/test/CodeGen/Hexagon/static.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched -disable-hexagon-misched < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
@num = external global i32
@acc = external global i32
@val = external global i32
-; CHECK: memw(##num)
-; CHECK: memw(##acc)
-; CHECK: memw(##val)
+; CHECK-DAG: memw(#num)
+; CHECK-DAG: memw(#acc)
+; CHECK-DAG: memw(#val)
define void @foo() nounwind {
entry:
diff --git a/test/CodeGen/Hexagon/store-shift.ll b/test/CodeGen/Hexagon/store-shift.ll
new file mode 100644
index 000000000000..866930990baa
--- /dev/null
+++ b/test/CodeGen/Hexagon/store-shift.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; CHECK-DAG: r[[BASE:[0-9]+]] += add
+; CHECK-DAG: r[[IDX0:[0-9]+]] = add(r2, #5)
+; CHECK-DAG: r[[IDX1:[0-9]+]] = add(r2, #6)
+; CHECK-DAG: memw(r0 + r[[IDX0]]<<#2) = r3
+; CHECK-DAG: memw(r0 + r[[IDX1]]<<#2) = r3
+; CHECK-DAG: memw(r[[BASE]] + r[[IDX0]]<<#2) = r[[IDX0]]
+; CHECK-DAG: memw(r[[BASE]] + r[[IDX1]]<<#2) = r[[IDX0]]
+
+target triple = "hexagon"
+
+@G = external global i32, align 4
+
+; Function Attrs: norecurse nounwind
+define void @fred(i32* nocapture %A, [50 x i32]* nocapture %B, i32 %N, i32 %M) #0 {
+entry:
+ %add = add nsw i32 %N, 5
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+ store i32 %M, i32* %arrayidx, align 4, !tbaa !1
+ %add2 = add nsw i32 %N, 6
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %add2
+ store i32 %M, i32* %arrayidx3, align 4, !tbaa !1
+ %add4 = add nsw i32 %N, 35
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 %add4
+ store i32 %add, i32* %arrayidx5, align 4, !tbaa !1
+ %arrayidx8 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 %add, i32 %add
+ store i32 %add, i32* %arrayidx8, align 4, !tbaa !1
+ %inc = add nsw i32 %N, 6
+ %arrayidx8.1 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 %add, i32 %inc
+ store i32 %add, i32* %arrayidx8.1, align 4, !tbaa !1
+ %sub = add nsw i32 %N, 4
+ %arrayidx10 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 %add, i32 %sub
+ %0 = load i32, i32* %arrayidx10, align 4, !tbaa !1
+ %add11 = add nsw i32 %0, 1
+ store i32 %add11, i32* %arrayidx10, align 4, !tbaa !1
+ %1 = load i32, i32* %arrayidx, align 4, !tbaa !1
+ %add13 = add nsw i32 %N, 25
+ %arrayidx15 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 %add13, i32 %add
+ store i32 %1, i32* %arrayidx15, align 4, !tbaa !1
+ store i32 5, i32* @G, align 4, !tbaa !1
+ ret void
+}
+
+attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/storerinewabs.ll b/test/CodeGen/Hexagon/storerinewabs.ll
new file mode 100644
index 000000000000..73e513a8bcee
--- /dev/null
+++ b/test/CodeGen/Hexagon/storerinewabs.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=0 < %s | FileCheck %s
+
+@global = external global i32, align 4
+
+; There was a bug causing ### to be printed. Make sure we print ## instead.
+; CHECK-LABEL: foo
+; CHECK: memw(##global) =
+
+define void @foo(i32 %x) #0 {
+entry:
+ %add = add nsw i32 %x, 1
+ store i32 %add, i32* @global, align 4
+ ret void
+}
+
+attributes #0 = { norecurse nounwind }
+
diff --git a/test/CodeGen/Hexagon/struct_args_large.ll b/test/CodeGen/Hexagon/struct_args_large.ll
index 1438d73eacf7..fb4780b0e5a9 100644
--- a/test/CodeGen/Hexagon/struct_args_large.ll
+++ b/test/CodeGen/Hexagon/struct_args_large.ll
@@ -1,4 +1,4 @@
-; XFAIL:
+; XFAIL: *
; RUN: llc -march=hexagon < %s | FileCheck %s
; CHECK: r[[T0:[0-9]+]] = CONST32(#s2)
; CHECK: memw(r29+#0) = r{{.}}
diff --git a/test/CodeGen/Hexagon/sube.ll b/test/CodeGen/Hexagon/sube.ll
index fab3dcaefa86..7bc00759303f 100644
--- a/test/CodeGen/Hexagon/sube.ll
+++ b/test/CodeGen/Hexagon/sube.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=hexagon -disable-hsdr -hexagon-expand-condsets=0 -hexagon-bit=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -disable-hsdr -hexagon-expand-condsets=0 -hexagon-bit=0 -disable-post-ra < %s | FileCheck %s
-; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #1)
; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #0)
+; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #1)
; CHECK: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
; CHECK: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}})
diff --git a/test/CodeGen/Hexagon/tail-dup-subreg-map.ll b/test/CodeGen/Hexagon/tail-dup-subreg-map.ll
new file mode 100644
index 000000000000..08dadeb9aaa4
--- /dev/null
+++ b/test/CodeGen/Hexagon/tail-dup-subreg-map.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: asserts
+
+; When tail-duplicating a block with PHI nodes that use subregisters, the
+; subregisters were dropped by the tail duplicator, resulting in invalid
+; COPY instructions being generated.
+
+; CHECK: = extractu(r{{[0-9]+}}, #15, #17)
+
+target triple = "hexagon"
+
+%struct.0 = type { i64, i16 }
+%struct.1 = type { i64, i64 }
+
+declare hidden fastcc void @foo(%struct.0* noalias nocapture, i8 signext, i8 zeroext, i32, i64, i64) unnamed_addr #0
+
+define void @fred(%struct.0* noalias nocapture sret %agg.result, %struct.1* byval nocapture readonly align 8 %a) #1 {
+entry:
+ %0 = load i64, i64* undef, align 8
+ switch i32 undef, label %if.else [
+ i32 32767, label %if.then
+ i32 0, label %if.then7
+ ]
+
+if.then: ; preds = %entry
+ ret void
+
+if.then7: ; preds = %entry
+ br i1 undef, label %if.then.i, label %if.else16.i
+
+if.then.i: ; preds = %if.then7
+ br i1 undef, label %if.then5.i, label %if.else.i
+
+if.then5.i: ; preds = %if.then.i
+ %shl.i21 = shl i64 %0, 0
+ br label %if.end.i
+
+if.else.i: ; preds = %if.then.i
+ %shl12.i = shl i64 %0, undef
+ br label %if.end.i
+
+if.end.i: ; preds = %if.else.i, %if.then5.i
+ %aSig0.0 = phi i64 [ undef, %if.then5.i ], [ %shl12.i, %if.else.i ]
+ %storemerge43.i = phi i64 [ %shl.i21, %if.then5.i ], [ 0, %if.else.i ]
+ %sub15.i = sub nsw i32 -63, undef
+ br label %if.end13
+
+if.else16.i: ; preds = %if.then7
+ br label %if.end13
+
+if.else: ; preds = %entry
+ %or12 = or i64 undef, 281474976710656
+ br label %if.end13
+
+if.end13: ; preds = %if.else, %if.else16.i, %if.end.i
+ %aSig1.1 = phi i64 [ %0, %if.else ], [ %storemerge43.i, %if.end.i ], [ undef, %if.else16.i ]
+ %aSig0.2 = phi i64 [ %or12, %if.else ], [ %aSig0.0, %if.end.i ], [ undef, %if.else16.i ]
+ %aExp.0 = phi i32 [ undef, %if.else ], [ %sub15.i, %if.end.i ], [ undef, %if.else16.i ]
+ %shl2.i = shl i64 %aSig0.2, 15
+ %shr.i = lshr i64 %aSig1.1, 49
+ %or.i = or i64 %shl2.i, %shr.i
+ tail call fastcc void @foo(%struct.0* noalias %agg.result, i8 signext 80, i8 zeroext undef, i32 %aExp.0, i64 %or.i, i64 undef)
+ unreachable
+}
+
+attributes #0 = { norecurse nounwind }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/Hexagon/tls_pic.ll b/test/CodeGen/Hexagon/tls_pic.ll
new file mode 100644
index 000000000000..190e1d71d39b
--- /dev/null
+++ b/test/CodeGen/Hexagon/tls_pic.ll
@@ -0,0 +1,37 @@
+; RUN: llc -O0 -march=hexagon -relocation-model=pic < %s | FileCheck %s
+
+@dst_ie = thread_local(initialexec) global i32 0, align 4
+@src_ie = thread_local(initialexec) global i32 0, align 4
+
+; CHECK-LABEL: test_initial_exec
+; CHECK-DAG: = add(pc, ##_GLOBAL_OFFSET_TABLE_@PCREL)
+; CHECK-DAG: = ##src_ie@IEGOT
+; CHECK-DAG: = ##dst_ie@IEGOT
+; CHECK-NOT: call
+define i32 @test_initial_exec() nounwind {
+entry:
+ %0 = load i32, i32* @src_ie, align 4
+ store i32 %0, i32* @dst_ie, align 4
+ ret i32 0
+}
+
+@dst_gd = external thread_local global i32
+@src_gd = external thread_local global i32
+
+; At the moment, the local-dynamic model uses the same code as the
+; general-dynamic model.
+
+; CHECK-LABEL: test_dynamic
+; CHECK-DAG: = add(pc, ##_GLOBAL_OFFSET_TABLE_@PCREL)
+; CHECK-DAG: = ##src_gd@GDGOT
+; CHECK-DAG: = ##dst_gd@GDGOT
+; CHECK-DAG: call src_gd@GDPLT
+; CHECK-DAG: call dst_gd@GDPLT
+
+define i32 @test_dynamic() nounwind {
+entry:
+ %0 = load i32, i32* @src_gd, align 4
+ store i32 %0, i32* @dst_gd, align 4
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Hexagon/tls_static.ll b/test/CodeGen/Hexagon/tls_static.ll
new file mode 100644
index 000000000000..ad2ca716b70d
--- /dev/null
+++ b/test/CodeGen/Hexagon/tls_static.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O0 -march=hexagon -relocation-model=static < %s | FileCheck %s
+
+@dst_le = thread_local global i32 0, align 4
+@src_le = thread_local global i32 0, align 4
+
+; CHECK-LABEL: test_local_exec
+; CHECK-DAG: = ##src_le@TPREL
+; CHECK-DAG: = ##dst_le@TPREL
+define i32 @test_local_exec() nounwind {
+entry:
+ %0 = load i32, i32* @src_le, align 4
+ store i32 %0, i32* @dst_le, align 4
+ ret i32 0
+}
+
+@dst_ie = external thread_local global i32
+@src_ie = external thread_local global i32
+
+; CHECK-LABEL: test_initial_exec:
+; CHECK-DAG: = memw(##src_ie@IE)
+; CHECK-DAG: = memw(##dst_ie@IE)
+define i32 @test_initial_exec() nounwind {
+entry:
+ %0 = load i32, i32* @src_ie, align 4
+ store i32 %0, i32* @dst_ie, align 4
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Hexagon/v60-cur.ll b/test/CodeGen/Hexagon/v60-cur.ll
new file mode 100644
index 000000000000..fe24309f5b87
--- /dev/null
+++ b/test/CodeGen/Hexagon/v60-cur.ll
@@ -0,0 +1,62 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Test that we generate a .cur
+
+; CHECK: v{{[0-9]*}}.cur{{ *}}
+; CHECK: v{{[0-9]*}}.cur{{ *}}
+
+define void @conv3x3_i(i8* noalias nocapture readonly %iptr0, i32 %shift, i32 %width) #0 {
+entry:
+ br i1 undef, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %iptr0.pn = phi i8* [ %iptr0, %for.body.lr.ph ], [ %iptr0.addr.0121, %for.body ]
+ %j.0115 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+ %sline000.0114 = phi <16 x i32> [ zeroinitializer, %for.body.lr.ph ], [ %1, %for.body ]
+ %sline100.0113 = phi <16 x i32> [ zeroinitializer, %for.body.lr.ph ], [ zeroinitializer, %for.body ]
+ %iptr0.addr.0121 = getelementptr inbounds i8, i8* %iptr0.pn, i32 64
+ %0 = bitcast i8* %iptr0.addr.0121 to <16 x i32>*
+ %1 = load <16 x i32>, <16 x i32>* %0, align 64, !tbaa !1
+ %2 = load <16 x i32>, <16 x i32>* null, align 64, !tbaa !1
+ %3 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %1, <16 x i32> %sline000.0114, i32 4)
+ %4 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> zeroinitializer, <16 x i32> %sline100.0113, i32 4)
+ %5 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %2, <16 x i32> zeroinitializer, i32 4)
+ %6 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %3, <16 x i32> %sline000.0114)
+ %7 = tail call <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32> %5, <16 x i32> zeroinitializer)
+ %8 = tail call <32 x i32> @llvm.hexagon.V6.vrmpybusi(<32 x i32> %6, i32 0, i32 0)
+ %9 = tail call <32 x i32> @llvm.hexagon.V6.vrmpybusi.acc(<32 x i32> %8, <32 x i32> zeroinitializer, i32 undef, i32 0)
+ %10 = tail call <32 x i32> @llvm.hexagon.V6.vrmpybusi.acc(<32 x i32> %9, <32 x i32> undef, i32 undef, i32 0)
+ %11 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %10)
+ %12 = tail call <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32> %11, <16 x i32> undef, i32 %shift)
+ %13 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> undef, <16 x i32> %12)
+ store <16 x i32> %13, <16 x i32>* undef, align 64, !tbaa !1
+ %14 = tail call <32 x i32> @llvm.hexagon.V6.vrmpybusi.acc(<32 x i32> zeroinitializer, <32 x i32> %7, i32 undef, i32 1)
+ %15 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %14)
+ %16 = tail call <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32> %15, <16 x i32> undef, i32 %shift)
+ %17 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %16, <16 x i32> undef)
+ store <16 x i32> %17, <16 x i32>* undef, align 64, !tbaa !1
+ %add = add nsw i32 %j.0115, 64
+ %cmp = icmp slt i32 %add, %width
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+declare <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32>, <16 x i32>, i32) #1
+declare <32 x i32> @llvm.hexagon.V6.vcombine(<16 x i32>, <16 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vrmpybusi(<32 x i32>, i32, i32) #1
+declare <32 x i32> @llvm.hexagon.V6.vrmpybusi.acc(<32 x i32>, <32 x i32>, i32, i32) #1
+declare <16 x i32> @llvm.hexagon.V6.vasrwh(<16 x i32>, <16 x i32>, i32) #1
+declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx" }
+attributes #1 = { nounwind readnone }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/v60Intrins.ll b/test/CodeGen/Hexagon/v60Intrins.ll
index 5f4f294c405b..d0064c50e71d 100644
--- a/test/CodeGen/Hexagon/v60Intrins.ll
+++ b/test/CodeGen/Hexagon/v60Intrins.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv60 -O2 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv60 -O2 -disable-post-ra < %s | FileCheck %s
; CHECK: q{{[0-3]}} = vand(v{{[0-9]*}},r{{[0-9]*}})
; CHECK: q{{[0-3]}} = vsetq(r{{[0-9]*}})
diff --git a/test/CodeGen/Hexagon/vec-pred-spill1.ll b/test/CodeGen/Hexagon/vec-pred-spill1.ll
new file mode 100644
index 000000000000..d120295fa52c
--- /dev/null
+++ b/test/CodeGen/Hexagon/vec-pred-spill1.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv60 -O2 -enable-hexagon-hvx < %s | FileCheck %s
+
+; CHECK: vmem(r{{[0-9]+}}+#3) = v{{[0-9]+}}
+; CHECK: call puts
+; CHECK: call print_vecpred
+; CHECK: v{{[0-9]+}}{{ *}}={{ *}}vmem(r{{[0-9]+}}+#3)
+
+target triple = "hexagon"
+
+@K = global i64 0, align 8
+@src = global i32 -1, align 4
+@Q6VecPredResult = common global <16 x i32> zeroinitializer, align 64
+@dst_addresses = common global [15 x i64] zeroinitializer, align 8
+@ptr_addresses = common global [15 x i8*] zeroinitializer, align 8
+@src_addresses = common global [15 x i8*] zeroinitializer, align 8
+@ptr = common global [32768 x i32] zeroinitializer, align 8
+@vecpreds = common global [15 x <16 x i32>] zeroinitializer, align 64
+@VectorResult = common global <16 x i32> zeroinitializer, align 64
+@vectors = common global [15 x <16 x i32>] zeroinitializer, align 64
+@VectorPairResult = common global <32 x i32> zeroinitializer, align 128
+@vector_pairs = common global [15 x <32 x i32>] zeroinitializer, align 128
+@str = private unnamed_addr constant [106 x i8] c"Q6VecPred4 : Q6_Q_vandor_QVR(Q6_Q_vand_VR(Q6_V_vsplat_R(1+1),(0x01010101)),Q6_V_vsplat_R(0+1),INT32_MIN)\00"
+@str3 = private unnamed_addr constant [99 x i8] c"Q6VecPred4 : Q6_Q_vandor_QVR(Q6_Q_vand_VR(Q6_V_vsplat_R(1+1),(0x01010101)),Q6_V_vsplat_R(0+1),-1)\00"
+@str4 = private unnamed_addr constant [98 x i8] c"Q6VecPred4 : Q6_Q_vandor_QVR(Q6_Q_vand_VR(Q6_V_vsplat_R(1+1),(0x01010101)),Q6_V_vsplat_R(0+1),0)\00"
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+ %call = tail call i32 bitcast (i32 (...)* @init_addresses to i32 ()*)() #3
+ %call1 = tail call i32 @acquire_vector_unit(i8 zeroext 0) #3
+ tail call void @init_vectors() #3
+ %0 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 2)
+ %1 = tail call <512 x i1> @llvm.hexagon.V6.vandvrt(<16 x i32> %0, i32 16843009)
+ %2 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 1)
+ %3 = tail call <512 x i1> @llvm.hexagon.V6.vandvrt.acc(<512 x i1> %1, <16 x i32> %2, i32 -2147483648)
+ %4 = bitcast <512 x i1> %3 to <16 x i32>
+ store <16 x i32> %4, <16 x i32>* @Q6VecPredResult, align 64, !tbaa !1
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([106 x i8], [106 x i8]* @str, i32 0, i32 0))
+ tail call void @print_vecpred(i32 512, i8* bitcast (<16 x i32>* @Q6VecPredResult to i8*)) #3
+ %5 = tail call <512 x i1> @llvm.hexagon.V6.vandvrt.acc(<512 x i1> %1, <16 x i32> %2, i32 -1)
+ %6 = bitcast <512 x i1> %5 to <16 x i32>
+ store <16 x i32> %6, <16 x i32>* @Q6VecPredResult, align 64, !tbaa !1
+ %puts5 = tail call i32 @puts(i8* getelementptr inbounds ([99 x i8], [99 x i8]* @str3, i32 0, i32 0))
+ tail call void @print_vecpred(i32 512, i8* bitcast (<16 x i32>* @Q6VecPredResult to i8*)) #3
+ %7 = tail call <512 x i1> @llvm.hexagon.V6.vandvrt.acc(<512 x i1> %1, <16 x i32> %2, i32 0)
+ %8 = bitcast <512 x i1> %7 to <16 x i32>
+ store <16 x i32> %8, <16 x i32>* @Q6VecPredResult, align 64, !tbaa !1
+ %puts6 = tail call i32 @puts(i8* getelementptr inbounds ([98 x i8], [98 x i8]* @str4, i32 0, i32 0))
+ tail call void @print_vecpred(i32 512, i8* bitcast (<16 x i32>* @Q6VecPredResult to i8*)) #3
+ ret i32 0
+}
+
+declare i32 @init_addresses(...) #1
+
+declare i32 @acquire_vector_unit(i8 zeroext) #1
+
+declare void @init_vectors() #1
+
+; Function Attrs: nounwind readnone
+declare <512 x i1> @llvm.hexagon.V6.vandvrt.acc(<512 x i1>, <16 x i32>, i32) #2
+
+; Function Attrs: nounwind readnone
+declare <512 x i1> @llvm.hexagon.V6.vandvrt(<16 x i32>, i32) #2
+
+; Function Attrs: nounwind readnone
+declare <16 x i32> @llvm.hexagon.V6.lvsplatw(i32) #2
+
+declare void @print_vecpred(i32, i8*) #1
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) #3
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/vector-align.ll b/test/CodeGen/Hexagon/vector-align.ll
new file mode 100644
index 000000000000..557ee3f97f2e
--- /dev/null
+++ b/test/CodeGen/Hexagon/vector-align.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx < %s \
+; RUN: | FileCheck %s
+
+; Check that the store to Q6VecPredResult does not get expanded into multiple
+; stores. There should be no memd's. This relies on the alignment specified
+; in the data layout string, so don't provide one here to make sure that the
+; default one from HexagonTargetMachine is correct.
+
+; CHECK-NOT: memd
+
+
+@Q6VecPredResult = common global <16 x i32> zeroinitializer, align 64
+
+; Function Attrs: nounwind
+define i32 @foo() #0 {
+entry:
+ %0 = tail call <16 x i32> @llvm.hexagon.V6.lvsplatw(i32 1)
+ %1 = tail call <512 x i1> @llvm.hexagon.V6.vandvrt(<16 x i32> %0, i32 -2147483648)
+ store <512 x i1> %1, <512 x i1>* bitcast (<16 x i32>* @Q6VecPredResult to <512 x i1>*), align 64, !tbaa !1
+ tail call void @print_vecpred(i32 64, i8* bitcast (<16 x i32>* @Q6VecPredResult to i8*)) #3
+ ret i32 0
+}
+
+; Function Attrs: nounwind readnone
+declare <512 x i1> @llvm.hexagon.V6.vandvrt(<16 x i32>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare <16 x i32> @llvm.hexagon.V6.lvsplatw(i32) #1
+
+declare void @print_vecpred(i32, i8*) #2
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/vload-postinc-sel.ll b/test/CodeGen/Hexagon/vload-postinc-sel.ll
new file mode 100644
index 000000000000..70ed3a9b1e8d
--- /dev/null
+++ b/test/CodeGen/Hexagon/vload-postinc-sel.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: = vmem(r{{[0-9]+}}++#1)
+
+target triple = "hexagon-unknown--elf"
+
+declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>) #0
+declare <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32>, <32 x i32>) #0
+declare <64 x i32> @llvm.hexagon.V6.vzb.128B(<32 x i32>) #0
+declare <32 x i32> @llvm.hexagon.V6.vsathub.128B(<32 x i32>, <32 x i32>) #0
+declare <64 x i32> @llvm.hexagon.V6.vaddh.dv.128B(<64 x i32>, <64 x i32>) #0
+declare <64 x i32> @llvm.hexagon.V6.vadduhsat.dv.128B(<64 x i32>, <64 x i32>) #0
+declare <32 x i32> @llvm.hexagon.V6.vabsdiffuh.128B(<32 x i32>, <32 x i32>) #0
+
+define void @fred() #1 {
+entry:
+ br i1 undef, label %b1, label %call_destructor.exit
+
+b1: ; preds = %entry
+ br label %b2
+
+b2: ; preds = %b1, %b2
+ %c2.host32.sroa.3.0 = phi <128 x i8> [ %5, %b2 ], [ undef, %b1 ]
+ %sobel_halide.s0.x.x = phi i32 [ %17, %b2 ], [ 0, %b1 ]
+ %0 = add nsw i32 %sobel_halide.s0.x.x, undef
+ %1 = shl i32 %0, 7
+ %2 = add nsw i32 %1, 128
+ %3 = getelementptr inbounds i8, i8* undef, i32 %2
+ %4 = bitcast i8* %3 to <128 x i8>*
+ %5 = load <128 x i8>, <128 x i8>* %4, align 128
+ %6 = bitcast <128 x i8> %c2.host32.sroa.3.0 to <32 x i32>
+ %7 = tail call <32 x i32> @llvm.hexagon.V6.valignbi.128B(<32 x i32> undef, <32 x i32> %6, i32 1)
+ %8 = tail call <64 x i32> @llvm.hexagon.V6.vzb.128B(<32 x i32> %7) #1
+ %9 = tail call <64 x i32> @llvm.hexagon.V6.vadduhsat.dv.128B(<64 x i32> undef, <64 x i32> %8) #1
+ %10 = tail call <64 x i32> @llvm.hexagon.V6.vadduhsat.dv.128B(<64 x i32> %9, <64 x i32> undef) #1
+ %11 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %10)
+ %12 = tail call <32 x i32> @llvm.hexagon.V6.vabsdiffuh.128B(<32 x i32> undef, <32 x i32> %11) #1
+ %13 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %12, <32 x i32> undef)
+ %14 = tail call <64 x i32> @llvm.hexagon.V6.vaddh.dv.128B(<64 x i32> undef, <64 x i32> %13) #1
+ %15 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %14) #1
+ %16 = tail call <32 x i32> @llvm.hexagon.V6.vsathub.128B(<32 x i32> %15, <32 x i32> undef) #1
+ store <32 x i32> %16, <32 x i32>* undef, align 128
+ %17 = add nuw nsw i32 %sobel_halide.s0.x.x, 1
+ br label %b2
+
+call_destructor.exit: ; preds = %entry
+ ret void
+}
+
+declare <32 x i32> @llvm.hexagon.V6.valignbi.128B(<32 x i32>, <32 x i32>, i32) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" }
diff --git a/test/CodeGen/Hexagon/vselect-pseudo.ll b/test/CodeGen/Hexagon/vselect-pseudo.ll
new file mode 100644
index 000000000000..ef86e47e3959
--- /dev/null
+++ b/test/CodeGen/Hexagon/vselect-pseudo.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define void @fred() #0 {
+entry:
+ br label %for.body9.us
+
+for.body9.us:
+ %cmp10.us = icmp eq i32 0, undef
+ %.h63h32.2.us = select i1 %cmp10.us, <16 x i32> zeroinitializer, <16 x i32> undef
+ %0 = tail call <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32> %.h63h32.2.us, <16 x i32> undef, i32 2)
+ %1 = tail call <32 x i32> @llvm.hexagon.V6.vswap(<512 x i1> undef, <16 x i32> undef, <16 x i32> %0)
+ %2 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %1)
+ %3 = tail call <32 x i32> @llvm.hexagon.V6.vshuffvdd(<16 x i32> undef, <16 x i32> %2, i32 62)
+ %4 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %3)
+ store <16 x i32> %4, <16 x i32>* undef, align 64
+ br i1 undef, label %for.body9.us, label %for.body43.us.preheader
+
+for.body43.us.preheader: ; preds = %for.body9.us
+ ret void
+}
+
+declare <16 x i32> @llvm.hexagon.V6.valignbi(<16 x i32>, <16 x i32>, i32) #1
+declare <32 x i32> @llvm.hexagon.V6.vswap(<512 x i1>, <16 x i32>, <16 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vshuffvdd(<16 x i32>, <16 x i32>, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/vsplat-isel.ll b/test/CodeGen/Hexagon/vsplat-isel.ll
new file mode 100644
index 000000000000..9c5e3e17c4e8
--- /dev/null
+++ b/test/CodeGen/Hexagon/vsplat-isel.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
+; CHECK: vsplatb
+
+declare i32 @llvm.hexagon.S2.vsplatrb(i32) #0
+
+define i32 @foo(i8 %x) {
+ %p0 = zext i8 %x to i32
+ %p1 = tail call i32 @llvm.hexagon.S2.vsplatrb(i32 %p0)
+ ret i32 %p1
+}
diff --git a/test/CodeGen/Hexagon/zextloadi1.ll b/test/CodeGen/Hexagon/zextloadi1.ll
index 9ce7bea9fce6..582120d0f355 100644
--- a/test/CodeGen/Hexagon/zextloadi1.ll
+++ b/test/CodeGen/Hexagon/zextloadi1.ll
@@ -1,23 +1,28 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-
-; CHECK: r{{[0-9]+}} = ##i129_l+16
-; CHECK: r{{[0-9]+}} = ##i129_s+16
-; CHECK: memd(##i129_s) = r{{[0-9]+:[0-9]+}}
-; CHECK: r{{[0-9]+}} = ##i65_l+8
-; CHECK: r{{[0-9]+}} = ##i65_s+8
-; CHECK: memd(##i65_s) = r{{[0-9]+:[0-9]+}}
+; RUN: llc -march=hexagon < %s | FileCheck %s
@i65_l = external global i65
@i65_s = external global i65
@i129_l = external global i129
@i129_s = external global i129
+; CHECK-LABEL: i129_ls
+; CHECK-DAG: r[[REG0:[0-9:]+]] = memd(##i129_l)
+; CHECK-DAG: r[[REG1:[0-9:]+]] = memd(##i129_l+8)
+; CHECK-DAG: r[[REG2:[0-9]+]] = memub(##i129_l+16)
+; CHECK-DAG: memb(##i129_s+16) = r[[REG2]]
+; CHECK-DAG: memd(##i129_s+8) = r[[REG1]]
+; CHECK-DAG: memd(##i129_s) = r[[REG0]]
define void @i129_ls() nounwind {
%tmp = load i129, i129* @i129_l
store i129 %tmp, i129* @i129_s
ret void
}
+; CHECK-LABEL: i65_ls
+; CHECK-DAG: r[[REG0:[0-9:]+]] = memd(##i65_l)
+; CHECK-DAG: r[[REG1:[0-9]+]] = memub(##i65_l+8)
+; CHECK-DAG: memd(##i65_s) = r[[REG0]]
+; CHECK-DAG: memb(##i65_s+8) = r[[REG1]]
define void @i65_ls() nounwind {
%tmp = load i65, i65* @i65_l
store i65 %tmp, i65* @i65_s
diff --git a/test/CodeGen/Inputs/DbgValueOtherTargets.ll b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
index 440073fea153..ce015c0727ca 100644
--- a/test/CodeGen/Inputs/DbgValueOtherTargets.ll
+++ b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
@@ -14,9 +14,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!13}
-!0 = distinct !DISubprogram(name: "main", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !12, scope: !1, type: !3)
+!0 = distinct !DISubprogram(name: "main", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, file: !12, scope: !1, type: !3)
!1 = !DIFile(filename: "/tmp/x.c", directory: "/Users/manav")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 120996)", isOptimized: false, emissionKind: 0, file: !12, enums: !6, retainedTypes: !6, subprograms: !11)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 120996)", isOptimized: false, emissionKind: FullDebug, file: !12, enums: !6, retainedTypes: !6)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -25,6 +25,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!8 = distinct !DILexicalBlock(line: 2, column: 12, file: !12, scope: !0)
!9 = !DILocation(line: 3, column: 11, scope: !8)
!10 = !DILocation(line: 4, column: 2, scope: !8)
-!11 = !{!0}
!12 = !DIFile(filename: "/tmp/x.c", directory: "/Users/manav")
!13 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/Lanai/codemodel.ll b/test/CodeGen/Lanai/codemodel.ll
new file mode 100644
index 000000000000..e5ec7265924e
--- /dev/null
+++ b/test/CodeGen/Lanai/codemodel.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=lanai < %s | FileCheck %s
+; RUN: llc -march=lanai < %s -code-model=small | FileCheck -check-prefix CHECK-SMALL %s
+
+@data = external global [0 x i32] ; <[0 x i32]*> [#uses=5]
+
+define i32 @foo() nounwind readonly {
+entry:
+; CHECK-SMALL-LABEL: foo:
+; CHECK-SMALL: ld [data], %rv
+; CHECK-LABEL: foo:
+; CHECK: mov hi(data), %r[[REGISTER:[0-9]+]]
+; CHECK: or %r[[REGISTER]], lo(data), %r[[REGISTER]]
+; CHECK: ld 0[%r[[REGISTER]]], %rv
+ %0 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @data, i64 0, i64 0), align 4 ; <i32> [#uses=1]
+ ret i32 %0
+}
+
+define i32 @foo1() nounwind readonly {
+entry:
+; CHECK-SMALL-LABEL: foo1:
+; CHECK-SMALL: mov data, %r[[REGISTER:[0-9]+]]
+; CHECK-SMALL: ld 40[%r[[REGISTER]]], %rv
+; CHECK-LABEL: foo1:
+; CHECK: mov hi(data), %r[[REGISTER:[0-9]+]]
+; CHECK: or %r[[REGISTER]], lo(data), %r[[REGISTER]]
+; CHECK: ld 40[%r[[REGISTER]]], %rv
+ %0 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @data, i32 0, i64 10), align 4 ; <i32> [#uses=1]
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/Lanai/comparisons_i32.ll b/test/CodeGen/Lanai/comparisons_i32.ll
new file mode 100644
index 000000000000..fd8ca725c4cb
--- /dev/null
+++ b/test/CodeGen/Lanai/comparisons_i32.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s | FileCheck %s
+
+; Test that basic 32-bit integer comparison operations assemble as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "lanai"
+
+; CHECK-LABEL: eq_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: seq
+define i32 @eq_i32(i32 %x, i32 %y) {
+ %a = icmp eq i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: ne_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: sne
+define i32 @ne_i32(i32 %x, i32 %y) {
+ %a = icmp ne i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: slt_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: slt
+define i32 @slt_i32(i32 %x, i32 %y) {
+ %a = icmp slt i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: sle_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: sle
+define i32 @sle_i32(i32 %x, i32 %y) {
+ %a = icmp sle i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: ult_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: sult
+define i32 @ult_i32(i32 %x, i32 %y) {
+ %a = icmp ult i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: ule_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: sule
+define i32 @ule_i32(i32 %x, i32 %y) {
+ %a = icmp ule i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: sgt_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: sgt
+define i32 @sgt_i32(i32 %x, i32 %y) {
+ %a = icmp sgt i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: sge_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: sge
+define i32 @sge_i32(i32 %x, i32 %y) {
+ %a = icmp sge i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: ugt_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: sugt
+define i32 @ugt_i32(i32 %x, i32 %y) {
+ %a = icmp ugt i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: uge_i32:
+; CHECK: sub.f %r{{[0-9]+}}, %r{{[0-9]+}}, %r0
+; CHECK-NEXT: suge
+define i32 @uge_i32(i32 %x, i32 %y) {
+ %a = icmp uge i32 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
diff --git a/test/CodeGen/Lanai/comparisons_i64.ll b/test/CodeGen/Lanai/comparisons_i64.ll
new file mode 100644
index 000000000000..853ac139b2d6
--- /dev/null
+++ b/test/CodeGen/Lanai/comparisons_i64.ll
@@ -0,0 +1,108 @@
+; RUN: llc < %s | FileCheck %s
+
+; Test that basic 64-bit integer comparison operations assemble as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "lanai"
+
+; CHECK-LABEL: eq_i64:
+; CHECK: xor
+; CHECK: xor
+; CHECK: or.f
+; CHECK-NEXT: seq
+define i32 @eq_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp eq i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: ne_i64:
+; CHECK: xor
+; CHECK: xor
+; CHECK: or.f
+; CHECK-NEXT: sne
+define i32 @ne_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp ne i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: slt_i64:
+; CHECK: sub.f %r7, %r19, %r3
+; CHECK: subb.f %r6, %r18, %r3
+; CHECK-NEXT: slt
+define i32 @slt_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp slt i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: sle_i64:
+; CHECK: sub.f %r19, %r7, %r3
+; CHECK: subb.f %r18, %r6, %r3
+; CHECK-NEXT: sge %rv
+define i32 @sle_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp sle i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: ult_i64:
+; CHECK: sub.f %r7, %r19, %r3
+; CHECK: subb.f %r6, %r18, %r3
+; CHECK-NEXT: sult %rv
+define i32 @ult_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp ult i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: ule_i64:
+; CHECK: sub.f %r19, %r7, %r3
+; CHECK: subb.f %r18, %r6, %r3
+; CHECK-NEXT: suge %rv
+define i32 @ule_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp ule i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: sgt_i64:
+; CHECK: sub.f %r19, %r7, %r3
+; CHECK: subb.f %r18, %r6, %r3
+; CHECK-NEXT: slt %rv
+define i32 @sgt_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp sgt i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: sge_i64:
+; CHECK: sub.f %r7, %r19, %r3
+; CHECK: subb.f %r6, %r18, %r3
+; CHECK-NEXT: sge %rv
+define i32 @sge_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp sge i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: ugt_i64:
+; CHECK: sub.f %r19, %r7, %r3
+; CHECK: subb.f %r18, %r6, %r3
+; CHECK-NEXT: sult %rv
+define i32 @ugt_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp ugt i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: uge_i64:
+; CHECK: sub.f %r7, %r19, %r3
+; CHECK: subb.f %r6, %r18, %r3
+; CHECK-NEXT: suge %rv
+define i32 @uge_i64(i64 inreg %x, i64 inreg %y) {
+ %a = icmp uge i64 %x, %y
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
diff --git a/test/CodeGen/Lanai/constant_multiply.ll b/test/CodeGen/Lanai/constant_multiply.ll
new file mode 100644
index 000000000000..77c9805e4419
--- /dev/null
+++ b/test/CodeGen/Lanai/constant_multiply.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s | FileCheck %s
+
+; Test custom lowering for 32-bit integer multiplication.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "lanai"
+
+; CHECK-LABEL: f6:
+; CHECK: sh %r6, 0x1, %r{{[0-9]+}}
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @f6(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, 6
+ ret i32 %1
+}
+
+; CHECK-LABEL: f7:
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r6, %rv
+define i32 @f7(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, 7
+ ret i32 %1
+}
+
+; CHECK-LABEL: f8:
+; CHECK: sh %r6, 0x3, %rv
+define i32 @f8(i32 inreg %a) #0 {
+ %1 = shl nsw i32 %a, 3
+ ret i32 %1
+}
+
+; CHECK-LABEL: f9:
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: add %r{{[0-9]+}}, %r6, %rv
+define i32 @f9(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, 9
+ ret i32 %1
+}
+
+; CHECK-LABEL: f10:
+; CHECK: sh %r6, 0x1, %r{{[0-9]+}}
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: add %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @f10(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, 10
+ ret i32 %1
+}
+
+; CHECK-LABEL: f1280:
+; CHECK: sh %r6, 0x8, %r{{[0-9]+}}
+; CHECK: sh %r6, 0xa, %r{{[0-9]+}}
+; CHECK: add %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @f1280(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, 1280
+ ret i32 %1
+}
+
+; CHECK-LABEL: fm6:
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sh %r6, 0x1, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @fm6(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, -6
+ ret i32 %1
+}
+
+; CHECK-LABEL: fm7:
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r6, %r{{[0-9]+}}, %rv
+define i32 @fm7(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, -7
+ ret i32 %1
+}
+
+; CHECK-LABEL: fm8:
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @fm8(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, -8
+ ret i32 %1
+}
+
+; CHECK-LABEL: fm9:
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r6, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @fm9(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, -9
+ ret i32 %1
+}
+
+; CHECK-LABEL: fm10:
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sh %r6, 0x1, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @fm10(i32 inreg %a) #0 {
+ %1 = mul nsw i32 %a, -10
+ ret i32 %1
+}
+
+; CHECK-LABEL: h1:
+; CHECK: __mulsi3
+define i32 @h1(i32 inreg %a) #0 {
+ %1 = mul i32 %a, -1431655765
+ ret i32 %1
+}
diff --git a/test/CodeGen/Lanai/delay_filler.ll b/test/CodeGen/Lanai/delay_filler.ll
new file mode 100644
index 000000000000..bb74276d46de
--- /dev/null
+++ b/test/CodeGen/Lanai/delay_filler.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=lanai < %s | FileCheck %s
+; RUN: llc -march=lanai --lanai-nop-delay-filler < %s | \
+; RUN: FileCheck %s --check-prefix=NOP
+
+; CHECK: bt f
+; CHECK-NEXT: or
+; NOP: bt f
+; NOP-NEXT: nop
+
+; ModuleID = 'delay_filler.c'
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "lanai"
+
+; Function Attrs: nounwind
+define i32 @g(i32 inreg %n) #0 {
+entry:
+ %cmp5 = icmp sgt i32 %n, 0
+ br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %call.lcssa = phi i32 [ %call, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %a.0.lcssa = phi i32 [ undef, %entry ], [ %call.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %a.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %a.06 = phi i32 [ %call, %for.body ], [ undef, %for.body.preheader ]
+ %call = tail call i32 @f(i32 inreg %a.06) #2
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32 @f(i32 inreg) #1
+
diff --git a/test/CodeGen/Lanai/i32.ll b/test/CodeGen/Lanai/i32.ll
new file mode 100644
index 000000000000..632cc467d681
--- /dev/null
+++ b/test/CodeGen/Lanai/i32.ll
@@ -0,0 +1,145 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+; Test that basic 32-bit integer operations assemble as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "lanai"
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ctpop.i32(i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ctlz.i32(i32, i1) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.cttz.i32(i32, i1) #1
+
+; CHECK-LABEL: add32:
+; CHECK: add %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @add32(i32 %x, i32 %y) {
+ %a = add i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: sub32:
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @sub32(i32 %x, i32 %y) {
+ %a = sub i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: mul32:
+; CHECK: bt __mulsi3
+define i32 @mul32(i32 %x, i32 %y) {
+ %a = mul i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: sdiv32:
+; CHECK: bt __divsi3
+define i32 @sdiv32(i32 %x, i32 %y) {
+ %a = sdiv i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: udiv32:
+; CHECK: bt __udivsi3
+define i32 @udiv32(i32 %x, i32 %y) {
+ %a = udiv i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: srem32:
+; CHECK: bt __modsi3
+define i32 @srem32(i32 %x, i32 %y) {
+ %a = srem i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: urem32:
+; CHECK: bt __umodsi3
+define i32 @urem32(i32 %x, i32 %y) {
+ %a = urem i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: and32:
+; CHECK: and %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @and32(i32 %x, i32 %y) {
+ %a = and i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: or32:
+; CHECK: or %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @or32(i32 %x, i32 %y) {
+ %a = or i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: xor32:
+; CHECK: xor %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @xor32(i32 %x, i32 %y) {
+ %a = xor i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: shl32:
+; CHECK: sh %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @shl32(i32 %x, i32 %y) {
+ %a = shl i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: shr32:
+; CHECK: sub %r0, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: sh %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @shr32(i32 %x, i32 %y) {
+ %a = lshr i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: sar32
+; CHECK: sub %r0, %r{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: sha %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+define i32 @sar32(i32 %x, i32 %y) {
+ %a = ashr i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: clz32:
+; CHECK: leadz %r{{[0-9]+}}, %rv
+define i32 @clz32(i32 %x) {
+ %a = call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+ ret i32 %a
+}
+
+; CHECK-LABEL: clz32_zero_undef:
+; CHECK-NOT: sub.f
+; CHECK: leadz %r{{[0-9]+}}, %rv
+define i32 @clz32_zero_undef(i32 %x) {
+ %a = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+ ret i32 %a
+}
+
+; CHECK-LABEL: ctz32:
+; CHECK: trailz %r{{[0-9]+}}, %rv
+define i32 @ctz32(i32 %x) {
+ %a = call i32 @llvm.cttz.i32(i32 %x, i1 false)
+ ret i32 %a
+}
+
+; CHECK-LABEL: ctz32_zero_undef:
+; CHECK-NOT: sub.f
+; CHECK: trailz %r{{[0-9]+}}, %rv
+define i32 @ctz32_zero_undef(i32 %x) {
+ %a = call i32 @llvm.cttz.i32(i32 %x, i1 true)
+ ret i32 %a
+}
+
+; CHECK-LABEL: popcnt32:
+; CHECK: popc %r{{[0-9]+}}, %rv
+define i32 @popcnt32(i32 %x) {
+ %a = call i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %a
+}
diff --git a/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll b/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll
new file mode 100644
index 000000000000..619417f02c15
--- /dev/null
+++ b/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll
@@ -0,0 +1,55 @@
+; RUN: llc %s -mtriple=lanai-unknown-unknown -debug-only=misched -o /dev/null 2>&1 | FileCheck %s
+
+; Make sure there are no control dependencies between memory operations that
+; are trivially disjoint.
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @foo(i8* inreg nocapture %x) {
+entry:
+ %0 = bitcast i8* %x to i32*
+ store i32 1, i32* %0, align 4
+ %arrayidx1 = getelementptr inbounds i8, i8* %x, i32 4
+ %1 = bitcast i8* %arrayidx1 to i32*
+ store i32 2, i32* %1, align 4
+ %arrayidx2 = getelementptr inbounds i8, i8* %x, i32 12
+ %2 = bitcast i8* %arrayidx2 to i32*
+ %3 = load i32, i32* %2, align 4
+ %arrayidx3 = getelementptr inbounds i8, i8* %x, i32 10
+ %4 = bitcast i8* %arrayidx3 to i16*
+ store i16 3, i16* %4, align 2
+ %5 = bitcast i8* %arrayidx2 to i16*
+ store i16 4, i16* %5, align 2
+ %arrayidx5 = getelementptr inbounds i8, i8* %x, i32 14
+ store i8 5, i8* %arrayidx5, align 1
+ %arrayidx6 = getelementptr inbounds i8, i8* %x, i32 15
+ store i8 6, i8* %arrayidx6, align 1
+ %arrayidx7 = getelementptr inbounds i8, i8* %x, i32 16
+ store i8 7, i8* %arrayidx7, align 1
+ ret i32 %3
+}
+
+; CHECK-LABEL: foo
+; CHECK-LABEL: SU({{.*}}): SW_RI{{.*}}, 0,
+; CHECK: # preds left : 2
+; CHECK: # succs left : 0
+; CHECK-LABEL: SU({{.*}}): SW_RI{{.*}}, 4,
+; CHECK: # preds left : 2
+; CHECK: # succs left : 0
+; CHECK-LABEL: SU({{.*}}): %vreg{{.*}}<def> = LDW_RI{{.*}}, 12,
+; CHECK: # preds left : 1
+; CHECK: # succs left : 4
+; CHECK-LABEL: SU({{.*}}): STH_RI{{.*}}, 10,
+; CHECK: # preds left : 2
+; CHECK: # succs left : 0
+; CHECK-LABEL: SU({{.*}}): STH_RI{{.*}}, 12,
+; CHECK: # preds left : 3
+; CHECK: # succs left : 0
+; CHECK-LABEL: SU({{.*}}): STB_RI{{.*}}, 14,
+; CHECK: # preds left : 3
+; CHECK: # succs left : 0
+; CHECK-LABEL: SU({{.*}}): STB_RI{{.*}}, 15,
+; CHECK: # preds left : 3
+; CHECK: # succs left : 0
+; CHECK-LABEL: SU({{.*}}): STB_RI{{.*}}, 16,
+; CHECK: # preds left : 2
+; CHECK: # succs left : 0
\ No newline at end of file
diff --git a/test/CodeGen/Lanai/lit.local.cfg b/test/CodeGen/Lanai/lit.local.cfg
new file mode 100644
index 000000000000..3f30d055364c
--- /dev/null
+++ b/test/CodeGen/Lanai/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'Lanai' in config.root.targets:
+ config.unsupported = True
+
diff --git a/test/CodeGen/Lanai/mem_alu_combiner.ll b/test/CodeGen/Lanai/mem_alu_combiner.ll
new file mode 100644
index 000000000000..4e6e77361444
--- /dev/null
+++ b/test/CodeGen/Lanai/mem_alu_combiner.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -march=lanai | FileCheck %s
+; RUN: llc < %s -march=lanai -disable-lanai-mem-alu-combiner | \
+; RUN: FileCheck %s -check-prefix=CHECK-DIS
+
+; CHECK-LABEL: sum,
+; CHECK: ld [%r{{[0-9]+}}++], %r{{[0-9]+}}{{$}}
+; CHECK-DIS-LABEL: sum,
+; CHECK-DIS-NOT: ++],
+
+define i32 @sum(i32* inreg nocapture readonly %data, i32 inreg %n) {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum_.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %sum_.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %sum_.07 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %data, i32 %i.08
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %sum_.07
+ %inc = add nuw nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/test/CodeGen/Lanai/multiply.ll b/test/CodeGen/Lanai/multiply.ll
new file mode 100644
index 000000000000..c92a06c3f017
--- /dev/null
+++ b/test/CodeGen/Lanai/multiply.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=lanai < %s | FileCheck %s
+
+; Test the in-place lowering of mul i32.
+
+define i32 @f6(i32 inreg %a) #0 {
+entry:
+ %mul = mul nsw i32 %a, 6
+ ret i32 %mul
+}
+; CHECK: sh %r6, 0x1, %r{{[0-9]+}}
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+
+define i32 @f7(i32 inreg %a) #0 {
+entry:
+ %mul = mul nsw i32 %a, 7
+ ret i32 %mul
+}
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r6, %rv
+
+define i32 @f8(i32 inreg %a) #0 {
+entry:
+ %mul = shl nsw i32 %a, 3
+ ret i32 %mul
+}
+; CHECK: sh %r6, 0x3, %rv
+
+define i32 @fm6(i32 inreg %a) #0 {
+entry:
+ %mul = mul nsw i32 %a, -6
+ ret i32 %mul
+}
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sh %r6, 0x1, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+
+define i32 @fm7(i32 inreg %a) #0 {
+entry:
+ %mul = mul nsw i32 %a, -7
+ ret i32 %mul
+}
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r6, %r{{[0-9]+}}, %rv
+
+define i32 @fm8(i32 inreg %a) #0 {
+entry:
+ %mul = mul nsw i32 %a, -8
+ ret i32 %mul
+}
+; CHECK: sh %r6, 0x3, %r{{[0-9]+}}
+; CHECK: sub %r{{[0-9]+}}, %r{{[0-9]+}}, %rv
+
+define i32 @h1(i32 inreg %a) #0 {
+entry:
+ %mul = mul i32 %a, -1431655765
+ ret i32 %mul
+}
+; CHECK: h1
+; CHECK: mulsi3
diff --git a/test/CodeGen/Lanai/rshift64.ll b/test/CodeGen/Lanai/rshift64.ll
new file mode 100644
index 000000000000..2009edd001c8
--- /dev/null
+++ b/test/CodeGen/Lanai/rshift64.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=lanai-unknown-unknown | FileCheck %s
+
+; Test that i64 right-shift lowering does not result in a call being inserted.
+
+; CHECK-LABEL: shift
+; CHECK-NOT: bt __lshrdi3
+; CHECK: %rv
+define i64 @shift(i64 inreg, i32 inreg) {
+ %3 = zext i32 %1 to i64
+ %4 = lshr i64 %0, %3
+ ret i64 %4
+}
diff --git a/test/CodeGen/Lanai/select.ll b/test/CodeGen/Lanai/select.ll
new file mode 100644
index 000000000000..0c5b2307ab2c
--- /dev/null
+++ b/test/CodeGen/Lanai/select.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s | FileCheck %s
+
+; Test that the Lanai select instruction is selected from the LLVM select instruction.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "lanai"
+
+; CHECK-LABEL: select_i32_bool:
+; CHECK: sub.f %r6, 0x0, %r0
+; CHECK: sel.ne %r7, %r18, %rv
+define i32 @select_i32_bool(i1 zeroext inreg %a, i32 inreg %b, i32 inreg %c) {
+ %cond = select i1 %a, i32 %b, i32 %c
+ ret i32 %cond
+}
+
+; CHECK-LABEL: select_i32_eq:
+; CHECK: sub.f %r6, 0x0, %r0
+; CHECK: sel.eq %r7, %r18, %rv
+define i32 @select_i32_eq(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+ %cmp = icmp eq i32 %a, 0
+ %cond = select i1 %cmp, i32 %b, i32 %c
+ ret i32 %cond
+}
+
+; CHECK-LABEL: select_i32_ne:
+; CHECK: sub.f %r6, 0x0, %r0
+; CHECK: sel.ne %r7, %r18, %rv
+define i32 @select_i32_ne(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+ %cmp = icmp ne i32 %a, 0
+ %cond = select i1 %cmp, i32 %b, i32 %c
+ ret i32 %cond
+}
+
+; CHECK-LABEL: select_i32_lt:
+; CHECK: sub.f %r6, %r7, %r0
+; CHECK: sel.lt %r6, %r7, %rv
+define i32 @select_i32_lt(i32 inreg %x, i32 inreg %y) #0 {
+ %1 = icmp slt i32 %x, %y
+ %2 = select i1 %1, i32 %x, i32 %y
+ ret i32 %2
+}
diff --git a/test/CodeGen/Lanai/set_and_hi.ll b/test/CodeGen/Lanai/set_and_hi.ll
new file mode 100644
index 000000000000..bfce094050cb
--- /dev/null
+++ b/test/CodeGen/Lanai/set_and_hi.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s | FileCheck %s
+
+; Test matching of and_hi.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "lanai"
+
+@x = common global i32 0, align 4
+
+; CHECK-LABEL: setandhi:
+; CHECK: mov 0xfffffe4a, %r{{[0-9]+}}
+define void @setandhi() #0 {
+ store volatile i32 -438, i32* @x, align 4
+ ret void
+}
diff --git a/test/CodeGen/Lanai/shift.ll b/test/CodeGen/Lanai/shift.ll
new file mode 100644
index 000000000000..df5f91122ed8
--- /dev/null
+++ b/test/CodeGen/Lanai/shift.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=lanai | FileCheck %s
+
+; Test lowering of shifts.
+
+define i32 @irs(i32 inreg %a) #0 {
+entry:
+ %shr = ashr i32 %a, 13
+ ret i32 %shr
+}
+; CHECK-LABEL: irs
+; CHECK: sha %r6, -0xd, %rv
+
+define i32 @urs(i32 inreg %a) #0 {
+entry:
+ %shr = lshr i32 %a, 13
+ ret i32 %shr
+}
+; CHECK-LABEL: urs
+; CHECK: sh %r6, -0xd, %rv
+
+define i32 @ls(i32 inreg %a) #0 {
+entry:
+ %shl = shl i32 %a, 13
+ ret i32 %shl
+}
+; CHECK-LABEL: ls
+; CHECK: sh %r6, 0xd, %rv
+
diff --git a/test/CodeGen/Lanai/stack-frame.ll b/test/CodeGen/Lanai/stack-frame.ll
new file mode 100644
index 000000000000..3564658fa0fd
--- /dev/null
+++ b/test/CodeGen/Lanai/stack-frame.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=lanai < %s -o - | FileCheck %s
+
+define void @f1() {
+ %c = alloca i8, align 1
+ ret void
+}
+; CHECK-LABEL: f1:
+; CHECK: sub %sp, 0x10
+
+define i32 @f2() {
+ ret i32 1
+}
+; CHECK-LABEL: f2:
+; CHECK: sub %sp, 0x8
diff --git a/test/CodeGen/Lanai/sub-cmp-peephole.ll b/test/CodeGen/Lanai/sub-cmp-peephole.ll
new file mode 100644
index 000000000000..7e88364f273a
--- /dev/null
+++ b/test/CodeGen/Lanai/sub-cmp-peephole.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=lanai | FileCheck %s
+
+define i32 @f(i32 inreg %a, i32 inreg %b) nounwind ssp {
+entry:
+; CHECK-LABEL: f:
+; CHECK: sub.f %r6, %r7, [[IN:%.*]]
+; CHECK: sel.gt [[IN]], %r0, %rv
+ %cmp = icmp sgt i32 %a, %b
+ %sub = sub nsw i32 %a, %b
+ %sub. = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %sub.
+}
+
+define i32 @g(i32 inreg %a, i32 inreg %b) nounwind ssp {
+entry:
+; CHECK-LABEL: g:
+; CHECK: sub.f %r7, %r6, [[IN:%.*]]
+; CHECK: sel.lt [[IN]], %r0, %rv
+ %cmp = icmp slt i32 %a, %b
+ %sub = sub nsw i32 %b, %a
+ %sub. = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %sub.
+}
+
+define i32 @h(i32 inreg %a, i32 inreg %b) nounwind ssp {
+entry:
+; CHECK-LABEL: h:
+; CHECK: sub.f %r6, 0x3, [[IN:%.*]]
+; CHECK: sel.gt [[IN]], %r7, %rv
+ %cmp = icmp sgt i32 %a, 3
+ %sub = sub nsw i32 %a, 3
+ %sub. = select i1 %cmp, i32 %sub, i32 %b
+ ret i32 %sub.
+}
+
+define i32 @i(i32 inreg %a, i32 inreg %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: i:
+; CHECK: sub.f %r7, %r6, [[IN:%.*]]
+; CHECK: sel.ult [[IN]], %r0, %rv
+ %cmp = icmp ult i32 %a, %b
+ %sub = sub i32 %b, %a
+ %sub. = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %sub.
+}
+; If SR is live-out, we can't remove cmp if there exists a swapped sub.
+define i32 @j(i32 inreg %a, i32 inreg %b) nounwind {
+entry:
+; CHECK-LABEL: j:
+; CHECK: sub.f %r7, %r6, %r0
+; CHECK: sub %r6, %r7, %rv
+ %cmp = icmp eq i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = icmp sgt i32 %b, %a
+ %sel = select i1 %cmp2, i32 %sub, i32 %a
+ ret i32 %sel
+
+if.else:
+ ret i32 %sub
+}
+
+declare void @abort()
+declare void @exit(i32)
+@t = common global i32 0
+
+; If the comparison uses the C bit (signed overflow/underflow), we can't
+; omit the comparison.
+define i32 @cmp_ult0(i32 inreg %a, i32 inreg %b, i32 inreg %x, i32 inreg %y) {
+entry:
+; CHECK-LABEL: cmp_ult0
+; CHECK: sub {{.*}}, 0x11, [[IN:%.*]]
+; CHECK: sub.f [[IN]], 0x0, %r0
+ %load = load i32, i32* @t, align 4
+ %sub = sub i32 %load, 17
+ %cmp = icmp ult i32 %sub, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ call void @abort()
+ unreachable
+
+if.else:
+ call void @exit(i32 0)
+ unreachable
+}
+
+; Same for the V bit.
+; TODO: add test that exercises V bit individually (VC/VS).
+define i32 @cmp_gt0(i32 inreg %a, i32 inreg %b, i32 inreg %x, i32 inreg %y) {
+entry:
+; CHECK-LABEL: cmp_gt0
+; CHECK: sub {{.*}}, 0x11, [[IN:%.*]]
+; CHECK: sub.f [[IN]], 0x1, %r0
+ %load = load i32, i32* @t, align 4
+ %sub = sub i32 %load, 17
+ %cmp = icmp sgt i32 %sub, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ call void @abort()
+ unreachable
+
+if.else:
+ call void @exit(i32 0)
+ unreachable
+}
diff --git a/test/CodeGen/Lanai/subword.ll b/test/CodeGen/Lanai/subword.ll
new file mode 100644
index 000000000000..c0e1eaf6ad36
--- /dev/null
+++ b/test/CodeGen/Lanai/subword.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=lanai-unknown-unknown | FileCheck %s
+
+; Test scheduling of subwords.
+
+%struct.X = type { i16, i16 }
+
+define void @f(%struct.X* inreg nocapture %c) #0 {
+entry:
+ %a = getelementptr inbounds %struct.X, %struct.X* %c, i32 0, i32 0
+ %0 = load i16, i16* %a, align 2
+ %inc = add i16 %0, 1
+ store i16 %inc, i16* %a, align 2
+ %b = getelementptr inbounds %struct.X, %struct.X* %c, i32 0, i32 1
+ %1 = load i16, i16* %b, align 2
+ %dec = add i16 %1, -1
+ store i16 %dec, i16* %b, align 2
+ ret void
+}
+
+; Verify that the two loads occur before the stores. Without memory
+; disambiguation and subword scheduling, the resulting code was a per-subword
+; load-modify-store sequence instead of the more optimal schedule where all
+; loads occur before the modifications and stores.
+; CHECK: uld.h
+; CHECK-NEXT: uld.h
+; CHECK-NEXT: add
+; CHECK-NEXT: st.h
+; CHECK-NEXT: sub
+; CHECK-NEXT: st.h
diff --git a/test/CodeGen/MIR/AArch64/cfi-def-cfa.mir b/test/CodeGen/MIR/AArch64/cfi-def-cfa.mir
index cf7572ecad37..9a6f8dbafa00 100644
--- a/test/CodeGen/MIR/AArch64/cfi-def-cfa.mir
+++ b/test/CodeGen/MIR/AArch64/cfi-def-cfa.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-none-linux-gnu -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the .cfi_def_cfa operands
# correctly.
diff --git a/test/CodeGen/MIR/AArch64/expected-target-flag-name.mir b/test/CodeGen/MIR/AArch64/expected-target-flag-name.mir
index b7bac2682c70..f94f09a485d9 100644
--- a/test/CodeGen/MIR/AArch64/expected-target-flag-name.mir
+++ b/test/CodeGen/MIR/AArch64/expected-target-flag-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -mtriple=aarch64-none-linux-gnu -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -mtriple=aarch64-none-linux-gnu -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/AArch64/generic-virtual-registers-error.mir b/test/CodeGen/MIR/AArch64/generic-virtual-registers-error.mir
new file mode 100644
index 000000000000..b3d8c5c3d361
--- /dev/null
+++ b/test/CodeGen/MIR/AArch64/generic-virtual-registers-error.mir
@@ -0,0 +1,43 @@
+# RUN: not llc -mtriple=aarch64-apple-ios -run-pass none -o - %s 2> %t.log \
+# RUN: | FileCheck %s --check-prefix=CHECK
+# RUN: FileCheck %s -input-file=%t.log --check-prefix=ERR
+# RUN: rm -f %t.log
+# REQUIRES: global-isel
+# This test ensures that the MIR parser errors out when
+# generic virtual register definitions are not correct.
+
+--- |
+ define void @bar() { ret void }
+
+ define void @baz() { ret void }
+...
+
+---
+name: bar
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+registers:
+ - { id: 0, class: gpr }
+body: |
+ bb.0:
+ liveins: %w0
+ ; ERR: generic virtual registers must have a size
+ ; ERR-NEXT: %0
+ %0 = G_ADD i32 %w0, %w0
+...
+
+---
+name: baz
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: _ }
+registers:
+ - { id: 0, class: _ }
+body: |
+ bb.0:
+ liveins: %w0
+ ; ERR: generic virtual registers must have a size
+ ; ERR-NEXT: %0
+ %0 = G_ADD i32 %w0, %w0
+...
diff --git a/test/CodeGen/MIR/AArch64/invalid-target-flag-name.mir b/test/CodeGen/MIR/AArch64/invalid-target-flag-name.mir
index d4145b8961df..e2a257535314 100644
--- a/test/CodeGen/MIR/AArch64/invalid-target-flag-name.mir
+++ b/test/CodeGen/MIR/AArch64/invalid-target-flag-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -mtriple=aarch64-none-linux-gnu -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -mtriple=aarch64-none-linux-gnu -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/AArch64/machine-dead-copy.mir b/test/CodeGen/MIR/AArch64/machine-dead-copy.mir
new file mode 100644
index 000000000000..90f2f3c09993
--- /dev/null
+++ b/test/CodeGen/MIR/AArch64/machine-dead-copy.mir
@@ -0,0 +1,71 @@
+
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s
+
+--- |
+ define i32 @copyprop1(i32 %a, i32 %b) { ret i32 %a }
+ define i32 @copyprop2(i32 %a, i32 %b) { ret i32 %a }
+ define i32 @copyprop3(i32 %a, i32 %b) { ret i32 %a }
+ define i32 @copyprop4(i32 %a, i32 %b) { ret i32 %a }
+ declare i32 @foo(i32)
+...
+---
+# The first copy is a dead copy which is not used.
+# CHECK-LABEL: name: copyprop1
+# CHECK: bb.0:
+# CHECK-NOT: %w20 = COPY
+name: copyprop1
+allVRegsAllocated: true
+body: |
+ bb.0:
+ liveins: %w0, %w1
+ %w20 = COPY %w1
+ BL @foo, csr_aarch64_aapcs, implicit %w0, implicit-def %w0
+ RET_ReallyLR implicit %w0
+...
+---
+# The first copy is not a dead copy; it is used by the second copy after the
+# call.
+# CHECK-LABEL: name: copyprop2
+# CHECK: bb.0:
+# CHECK: %w20 = COPY
+name: copyprop2
+allVRegsAllocated: true
+body: |
+ bb.0:
+ liveins: %w0, %w1
+ %w20 = COPY %w1
+ BL @foo, csr_aarch64_aapcs, implicit %w0, implicit-def %w0
+ %w0 = COPY %w20
+ RET_ReallyLR implicit %w0
+...
+---
+# Both the first and second copy are dead copies which are not used.
+# CHECK-LABEL: name: copyprop3
+# CHECK: bb.0:
+# CHECK-NOT: COPY
+name: copyprop3
+allVRegsAllocated: true
+body: |
+ bb.0:
+ liveins: %w0, %w1
+ %w20 = COPY %w1
+ BL @foo, csr_aarch64_aapcs, implicit %w0, implicit-def %w0
+ %w20 = COPY %w0
+ RET_ReallyLR implicit %w0
+...
+# The second copy is removed as a NOP copy; after that, the first copy becomes
+# dead and should be removed as well.
+# CHECK-LABEL: name: copyprop4
+# CHECK: bb.0:
+# CHECK-NOT: COPY
+name: copyprop4
+allVRegsAllocated: true
+body: |
+ bb.0:
+ liveins: %w0, %w1
+ %w20 = COPY %w0
+ %w0 = COPY %w20
+ BL @foo, csr_aarch64_aapcs, implicit %w0, implicit-def %w0
+ RET_ReallyLR implicit %w0
+...
+
diff --git a/test/CodeGen/MIR/AArch64/machine-scheduler.mir b/test/CodeGen/MIR/AArch64/machine-scheduler.mir
new file mode 100644
index 000000000000..9ea5c6811b65
--- /dev/null
+++ b/test/CodeGen/MIR/AArch64/machine-scheduler.mir
@@ -0,0 +1,35 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machine-scheduler -verify-machineinstrs -o - %s | FileCheck %s
+
+--- |
+ define i64 @load_imp-def(i64* nocapture %P, i32 %v) {
+ entry:
+ %0 = bitcast i64* %P to i32*
+ %1 = load i32, i32* %0
+ %conv = zext i32 %1 to i64
+ %arrayidx19 = getelementptr inbounds i64, i64* %P, i64 1
+ %arrayidx1 = bitcast i64* %arrayidx19 to i32*
+ store i32 %v, i32* %arrayidx1
+ %2 = load i64, i64* %arrayidx19
+ %and = and i64 %2, 4294967295
+ %add = add nuw nsw i64 %and, %conv
+ ret i64 %add
+ }
+...
+---
+# CHECK-LABEL: name: load_imp-def
+# CHECK: bb.0.entry:
+# CHECK: LDRWui %x0, 0
+# CHECK: LDRWui %x0, 1
+# CHECK: STRWui %w1, %x0, 2
+name: load_imp-def
+isSSA: true
+body: |
+ bb.0.entry:
+ liveins: %w1, %x0
+ %w8 = LDRWui %x0, 1, implicit-def %x8 :: (load 4 from %ir.0)
+ STRWui killed %w1, %x0, 2 :: (store 4 into %ir.arrayidx1)
+ %w9 = LDRWui killed %x0, 0, implicit-def %x9 :: (load 4 from %ir.arrayidx19, align 8)
+ %x0 = ADDXrr killed %x9, killed %x8
+ RET_ReallyLR implicit %x0
+...
+
diff --git a/test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir b/test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir
index e23a352dff21..e19b618123de 100644
--- a/test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir
+++ b/test/CodeGen/MIR/AArch64/multiple-lhs-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-none-linux-gnu -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser can parse multiple register machine
# operands before '='.
diff --git a/test/CodeGen/MIR/AArch64/stack-object-local-offset.mir b/test/CodeGen/MIR/AArch64/stack-object-local-offset.mir
index 9471516db647..a2ad2092cb0e 100644
--- a/test/CodeGen/MIR/AArch64/stack-object-local-offset.mir
+++ b/test/CodeGen/MIR/AArch64/stack-object-local-offset.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-none-linux-gnu -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
--- |
@var = global i64 0
@@ -26,7 +26,7 @@ frameInfo:
maxAlignment: 8
# CHECK-LABEL: stack_local
# CHECK: stack:
-# CHECK_NEXT: { id:0, name:local_var, offset:0, size:8, alignment:8, local-offset: -8 }
+# CHECK-NEXT: { id: 0, name: local_var, offset: 0, size: 8, alignment: 8, local-offset: -8 }
stack:
- { id: 0,name: local_var,offset: 0,size: 8,alignment: 8, local-offset: -8 }
body: |
diff --git a/test/CodeGen/MIR/AArch64/target-flags.mir b/test/CodeGen/MIR/AArch64/target-flags.mir
index e96fce7c2f2b..e0a41015531b 100644
--- a/test/CodeGen/MIR/AArch64/target-flags.mir
+++ b/test/CodeGen/MIR/AArch64/target-flags.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-none-linux-gnu -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir b/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
index 34793880a60b..d4d2ae15af96 100644
--- a/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
+++ b/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=amdgcn -mcpu=SI -start-after postrapseudos -stop-after postrapseudos -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=amdgcn -mcpu=SI -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir b/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
index e20cf376414a..1b67edc6bb43 100644
--- a/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
+++ b/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=amdgcn -mcpu=SI -start-after postrapseudos -stop-after postrapseudos -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=amdgcn -mcpu=SI -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/AMDGPU/target-index-operands.mir b/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
index 839fd3212c61..b0b7ea4eabd0 100644
--- a/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
+++ b/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=SI -start-after postrapseudos -stop-after postrapseudos -o /dev/null %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=SI -run-pass none -o - %s | FileCheck %s
# This test verifies that the MIR parser can parse target index operands.
--- |
diff --git a/test/CodeGen/MIR/ARM/bundled-instructions.mir b/test/CodeGen/MIR/ARM/bundled-instructions.mir
index 814c4e188ea5..56e21e362707 100644
--- a/test/CodeGen/MIR/ARM/bundled-instructions.mir
+++ b/test/CodeGen/MIR/ARM/bundled-instructions.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple thumbv7-apple-ios -start-after block-placement -stop-after block-placement -o /dev/null %s | FileCheck %s
+# RUN: llc -mtriple thumbv7-apple-ios -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the bundled machine instructions
# and 'internal' register flags correctly.
diff --git a/test/CodeGen/MIR/ARM/cfi-same-value.mir b/test/CodeGen/MIR/ARM/cfi-same-value.mir
index f9850abe0463..32d0a85b5484 100644
--- a/test/CodeGen/MIR/ARM/cfi-same-value.mir
+++ b/test/CodeGen/MIR/ARM/cfi-same-value.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=arm-linux-unknown-gnueabi -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -mtriple=arm-linux-unknown-gnueabi -run-pass none -o - %s | FileCheck %s
--- |
declare void @dummy_use(i32*, i32)
diff --git a/test/CodeGen/MIR/ARM/expected-closing-brace.mir b/test/CodeGen/MIR/ARM/expected-closing-brace.mir
index 78d91aead247..4304935067ad 100644
--- a/test/CodeGen/MIR/ARM/expected-closing-brace.mir
+++ b/test/CodeGen/MIR/ARM/expected-closing-brace.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -mtriple thumbv7-apple-ios -start-after block-placement -stop-after block-placement -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -mtriple thumbv7-apple-ios -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
@G = external global i32
diff --git a/test/CodeGen/MIR/ARM/extraneous-closing-brace-error.mir b/test/CodeGen/MIR/ARM/extraneous-closing-brace-error.mir
index a069dd307936..fcd938efbb91 100644
--- a/test/CodeGen/MIR/ARM/extraneous-closing-brace-error.mir
+++ b/test/CodeGen/MIR/ARM/extraneous-closing-brace-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -mtriple thumbv7-apple-ios -start-after block-placement -stop-after block-placement -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -mtriple thumbv7-apple-ios -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @test1(i32 %a) {
diff --git a/test/CodeGen/MIR/ARM/imm-peephole-arm.mir b/test/CodeGen/MIR/ARM/imm-peephole-arm.mir
new file mode 100644
index 000000000000..cd30bdb74d57
--- /dev/null
+++ b/test/CodeGen/MIR/ARM/imm-peephole-arm.mir
@@ -0,0 +1,60 @@
+# RUN: llc -run-pass=peephole-opt %s -o - | FileCheck %s
+
+# CHECK: [[IN:%.*]] = COPY %r0
+# CHECK: [[SUM1TMP:%.*]] = ADDri [[IN]], 133
+# CHECK: [[SUM1:%.*]] = ADDri killed [[SUM1TMP]], 25600
+
+# CHECK: [[SUM2TMP:%.*]] = SUBri [[IN]], 133
+# CHECK: [[SUM2:%.*]] = SUBri killed [[SUM2TMP]], 25600
+
+# CHECK: [[SUM3TMP:%.*]] = SUBri [[IN]], 133
+# CHECK: [[SUM3:%.*]] = SUBri killed [[SUM3TMP]], 25600
+
+# CHECK: [[SUM4TMP:%.*]] = ADDri killed [[IN]], 133
+# CHECK: [[SUM4:%.*]] = ADDri killed [[SUM4TMP]], 25600
+
+
+--- |
+ target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+ target triple = "armv7-apple-ios"
+
+ define i32 @foo(i32 %in) {
+ ret i32 undef
+ }
+...
+---
+name: foo
+registers:
+ - { id: 0, class: gprnopc }
+ - { id: 1, class: rgpr }
+ - { id: 2, class: rgpr }
+ - { id: 3, class: rgpr }
+ - { id: 4, class: rgpr }
+ - { id: 5, class: rgpr }
+ - { id: 6, class: rgpr }
+ - { id: 7, class: rgpr }
+ - { id: 8, class: rgpr }
+liveins:
+ - { reg: '%r0', virtual-reg: '%0' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %r0
+
+ %0 = COPY %r0
+ %1 = MOVi32imm -25733
+ %2 = SUBrr %0, killed %1, 14, _, _
+
+ %3 = MOVi32imm 25733
+ %4 = SUBrr %0, killed %3, 14, _, _
+
+ %5 = MOVi32imm -25733
+ %6 = ADDrr %0, killed %5, 14, _, _
+
+ %7 = MOVi32imm 25733
+ %8 = ADDrr killed %0, killed %7, 14, _, _
+
+ %r0 = COPY killed %8
+ BX_RET 14, _, implicit %r0
+
+...
+
diff --git a/test/CodeGen/MIR/ARM/imm-peephole-thumb.mir b/test/CodeGen/MIR/ARM/imm-peephole-thumb.mir
new file mode 100644
index 000000000000..3d342902d80d
--- /dev/null
+++ b/test/CodeGen/MIR/ARM/imm-peephole-thumb.mir
@@ -0,0 +1,59 @@
+# RUN: llc -run-pass=peephole-opt %s -o - | FileCheck %s
+
+# CHECK: [[IN:%.*]] = COPY %r0
+# CHECK: [[SUM1TMP:%.*]] = t2ADDri [[IN]], 25600
+# CHECK: [[SUM1:%.*]] = t2ADDri killed [[SUM1TMP]], 133
+
+# CHECK: [[SUM2TMP:%.*]] = t2SUBri [[IN]], 25600
+# CHECK: [[SUM2:%.*]] = t2SUBri killed [[SUM2TMP]], 133
+
+# CHECK: [[SUM3TMP:%.*]] = t2SUBri [[IN]], 25600
+# CHECK: [[SUM3:%.*]] = t2SUBri killed [[SUM3TMP]], 133
+
+# CHECK: [[SUM4TMP:%.*]] = t2ADDri killed [[IN]], 25600
+# CHECK: [[SUM4:%.*]] = t2ADDri killed [[SUM4TMP]], 133
+
+
+--- |
+ target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+ target triple = "thumbv7-apple-ios"
+
+ define i32 @foo(i32 %in) {
+ ret i32 undef
+ }
+...
+---
+name: foo
+registers:
+ - { id: 0, class: gprnopc }
+ - { id: 1, class: rgpr }
+ - { id: 2, class: rgpr }
+ - { id: 3, class: rgpr }
+ - { id: 4, class: rgpr }
+ - { id: 5, class: rgpr }
+ - { id: 6, class: rgpr }
+ - { id: 7, class: rgpr }
+ - { id: 8, class: rgpr }
+liveins:
+ - { reg: '%r0', virtual-reg: '%0' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %r0
+ %0 = COPY %r0
+ %1 = t2MOVi32imm -25733
+ %2 = t2SUBrr %0, killed %1, 14, _, _
+
+ %3 = t2MOVi32imm 25733
+ %4 = t2SUBrr %0, killed %3, 14, _, _
+
+ %5 = t2MOVi32imm -25733
+  %6 = t2ADDrr %0, killed %5, 14, _, _
+
+ %7 = t2MOVi32imm 25733
+ %8 = t2ADDrr killed %0, killed %7, 14, _, _
+
+ %r0 = COPY killed %8
+ tBX_RET 14, _, implicit %r0
+
+...
+
diff --git a/test/CodeGen/MIR/ARM/nested-instruction-bundle-error.mir b/test/CodeGen/MIR/ARM/nested-instruction-bundle-error.mir
index b93697857e79..63b997046d02 100644
--- a/test/CodeGen/MIR/ARM/nested-instruction-bundle-error.mir
+++ b/test/CodeGen/MIR/ARM/nested-instruction-bundle-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -mtriple thumbv7-apple-ios -start-after block-placement -stop-after block-placement -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -mtriple thumbv7-apple-ios -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @test1(i32 %a) {
diff --git a/test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir b/test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir
index 5b5750b8d1e8..eb4a44b7e175 100644
--- a/test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir
+++ b/test/CodeGen/MIR/ARM/sched-it-debug-nodes.mir
@@ -1,11 +1,11 @@
-# RUN: llc -mtriple thumbv7 -start-after if-converter -print-before=post-RA-sched -print-after=post-RA-sched %s -o /dev/null 2>&1 | FileCheck %s
+# RUN: llc -mtriple thumbv7 -verify-machineinstrs -start-after if-converter -print-before post-RA-sched -print-after post-RA-sched %s -o /dev/null 2>&1 | FileCheck %s
--- |
; ModuleID = '/Volumes/Data/llvm/test/CodeGen/ARM/sched-it-debug-nodes.ll'
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv7"
-
+
%struct.s = type opaque
-
+
; Function Attrs: nounwind
define arm_aapcscc i32 @f(%struct.s* %s, i32 %u, i8* %b, i32 %n) #0 !dbg !4 {
entry:
@@ -15,11 +15,11 @@
tail call void @llvm.dbg.value(metadata i32 %n, i64 0, metadata !21, metadata !27), !dbg !28
%cmp = icmp ult i32 %n, 4, !dbg !29
br i1 %cmp, label %return, label %if.end, !dbg !31
-
+
if.end: ; preds = %entry
tail call arm_aapcscc void @g(%struct.s* %s, i8* %b, i32 %n) #3, !dbg !32
br label %return, !dbg !33
-
+
return: ; preds = %if.end, %entry
%retval.0 = phi i32 [ 0, %if.end ], [ -1, %entry ]
ret i32 %retval.0, !dbg !34
@@ -31,30 +31,29 @@
; attempts to schedule the Machine Instr, and tries to tag the register in the
; debug value as KILL'ed, resulting in a DEBUG_VALUE node changing codegen! (or
; hopefully, triggering an assert).
-
+
; CHECK: BUNDLE %ITSTATE<imp-def,dead>
; CHECK: * DBG_VALUE %R1, %noreg, !"u"
; CHECK-NOT: * DBG_VALUE %R1<kill>, %noreg, !"u"
-
+
declare arm_aapcscc void @g(%struct.s*, i8*, i32) #1
-
+
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
-
+
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
-
+
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24, !25}
!llvm.ident = !{!26}
-
- !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (llvm/trunk 237059)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (llvm/trunk 237059)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "<stdin>", directory: "/Users/compnerd/Source/llvm")
!2 = !{}
- !3 = !{!4}
- !4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 9, type: !5, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, variables: !17)
+ !4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 9, type: !5, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !17)
!5 = !DISubroutineType(types: !6)
!6 = !{!7, !8, !11, !12, !16}
!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -92,28 +91,30 @@ name: f
alignment: 1
exposesReturnsTwice: false
hasInlineAsm: false
+allVRegsAllocated: true
isSSA: false
tracksRegLiveness: true
tracksSubRegLiveness: false
-liveins:
+liveins:
- { reg: '%r0' }
+ - { reg: '%r1' }
- { reg: '%r2' }
- { reg: '%r3' }
-calleeSavedRegisters: [ '%lr', '%d8', '%d9', '%d10', '%d11', '%d12', '%d13',
- '%d14', '%d15', '%q4', '%q5', '%q6', '%q7', '%r4',
- '%r5', '%r6', '%r7', '%r8', '%r9', '%r10', '%r11',
- '%s16', '%s17', '%s18', '%s19', '%s20', '%s21',
- '%s22', '%s23', '%s24', '%s25', '%s26', '%s27',
- '%s28', '%s29', '%s30', '%s31', '%d8_d10', '%d9_d11',
- '%d10_d12', '%d11_d13', '%d12_d14', '%d13_d15',
- '%q4_q5', '%q5_q6', '%q6_q7', '%q4_q5_q6_q7', '%r4_r5',
- '%r6_r7', '%r8_r9', '%r10_r11', '%d8_d9_d10', '%d9_d10_d11',
- '%d10_d11_d12', '%d11_d12_d13', '%d12_d13_d14',
- '%d13_d14_d15', '%d8_d10_d12', '%d9_d11_d13', '%d10_d12_d14',
- '%d11_d13_d15', '%d8_d10_d12_d14', '%d9_d11_d13_d15',
- '%d9_d10', '%d11_d12', '%d13_d14', '%d9_d10_d11_d12',
+calleeSavedRegisters: [ '%lr', '%d8', '%d9', '%d10', '%d11', '%d12', '%d13',
+ '%d14', '%d15', '%q4', '%q5', '%q6', '%q7', '%r4',
+ '%r5', '%r6', '%r7', '%r8', '%r9', '%r10', '%r11',
+ '%s16', '%s17', '%s18', '%s19', '%s20', '%s21',
+ '%s22', '%s23', '%s24', '%s25', '%s26', '%s27',
+ '%s28', '%s29', '%s30', '%s31', '%d8_d10', '%d9_d11',
+ '%d10_d12', '%d11_d13', '%d12_d14', '%d13_d15',
+ '%q4_q5', '%q5_q6', '%q6_q7', '%q4_q5_q6_q7', '%r4_r5',
+ '%r6_r7', '%r8_r9', '%r10_r11', '%d8_d9_d10', '%d9_d10_d11',
+ '%d10_d11_d12', '%d11_d12_d13', '%d12_d13_d14',
+ '%d13_d14_d15', '%d8_d10_d12', '%d9_d11_d13', '%d10_d12_d14',
+ '%d11_d13_d15', '%d8_d10_d12_d14', '%d9_d11_d13_d15',
+ '%d9_d10', '%d11_d12', '%d13_d14', '%d9_d10_d11_d12',
'%d11_d12_d13_d14' ]
-frameInfo:
+frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
@@ -127,13 +128,13 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
-stack:
+stack:
- { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '%lr' }
- { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '%r7' }
body: |
bb.0.entry:
- liveins: %r0, %r2, %r3, %lr, %r7
-
+ liveins: %r0, %r1, %r2, %r3, %lr, %r7
+
DBG_VALUE debug-use %r0, debug-use _, !18, !27, debug-location !28
DBG_VALUE debug-use %r1, debug-use _, !19, !27, debug-location !28
DBG_VALUE debug-use %r2, debug-use _, !20, !27, debug-location !28
diff --git a/test/CodeGen/MIR/Generic/basic-blocks.mir b/test/CodeGen/MIR/Generic/basic-blocks.mir
index 22f8d28290db..0df7a9c8c633 100644
--- a/test/CodeGen/MIR/Generic/basic-blocks.mir
+++ b/test/CodeGen/MIR/Generic/basic-blocks.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses machine functions correctly.
--- |
diff --git a/test/CodeGen/MIR/Generic/expected-colon-after-basic-block.mir b/test/CodeGen/MIR/Generic/expected-colon-after-basic-block.mir
index 892258666d10..040ab7c44c4a 100644
--- a/test/CodeGen/MIR/Generic/expected-colon-after-basic-block.mir
+++ b/test/CodeGen/MIR/Generic/expected-colon-after-basic-block.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/Generic/expected-mbb-reference-for-successor-mbb.mir b/test/CodeGen/MIR/Generic/expected-mbb-reference-for-successor-mbb.mir
index a5e04f86c6d1..42996568fe27 100644
--- a/test/CodeGen/MIR/Generic/expected-mbb-reference-for-successor-mbb.mir
+++ b/test/CodeGen/MIR/Generic/expected-mbb-reference-for-successor-mbb.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/Generic/frame-info.mir b/test/CodeGen/MIR/Generic/frame-info.mir
index 6e4e3955cb17..71448c8a71ba 100644
--- a/test/CodeGen/MIR/Generic/frame-info.mir
+++ b/test/CodeGen/MIR/Generic/frame-info.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses machine frame info properties
# correctly.
diff --git a/test/CodeGen/MIR/Generic/function-missing-machine-function.mir b/test/CodeGen/MIR/Generic/function-missing-machine-function.mir
index 71b5b2845340..f3a834801671 100644
--- a/test/CodeGen/MIR/Generic/function-missing-machine-function.mir
+++ b/test/CodeGen/MIR/Generic/function-missing-machine-function.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test verifies that an error is reported when a MIR file has some
# function but is missing a corresponding machine function.
diff --git a/test/CodeGen/MIR/Generic/invalid-jump-table-kind.mir b/test/CodeGen/MIR/Generic/invalid-jump-table-kind.mir
index 576de4bd9dc7..5f96d2d57e7c 100644
--- a/test/CodeGen/MIR/Generic/invalid-jump-table-kind.mir
+++ b/test/CodeGen/MIR/Generic/invalid-jump-table-kind.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/Generic/llvm-ir-error-reported.mir b/test/CodeGen/MIR/Generic/llvm-ir-error-reported.mir
index 3508c341c44d..15824cb2ca6d 100644
--- a/test/CodeGen/MIR/Generic/llvm-ir-error-reported.mir
+++ b/test/CodeGen/MIR/Generic/llvm-ir-error-reported.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures an error is reported if the embedded LLVM IR contains an
# error.
diff --git a/test/CodeGen/MIR/Generic/llvmIR.mir b/test/CodeGen/MIR/Generic/llvmIR.mir
index c7a220afa505..432b18ff939d 100644
--- a/test/CodeGen/MIR/Generic/llvmIR.mir
+++ b/test/CodeGen/MIR/Generic/llvmIR.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass none -o - %s | FileCheck %s
# This test ensures that the LLVM IR that's embedded with MIR is parsed
# correctly.
diff --git a/test/CodeGen/MIR/Generic/llvmIRMissing.mir b/test/CodeGen/MIR/Generic/llvmIRMissing.mir
index afa96010f297..9f361e8d3fe4 100644
--- a/test/CodeGen/MIR/Generic/llvmIRMissing.mir
+++ b/test/CodeGen/MIR/Generic/llvmIRMissing.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass none -o - %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser accepts files without the LLVM IR.
---
diff --git a/test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir b/test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir
index d6ecd5dc8514..a5737c2c1526 100644
--- a/test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir
+++ b/test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass none -o - %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser preserves unnamed LLVM IR block
# references.
diff --git a/test/CodeGen/MIR/Generic/machine-basic-block-redefinition-error.mir b/test/CodeGen/MIR/Generic/machine-basic-block-redefinition-error.mir
index 41747535c351..538c3f456b0b 100644
--- a/test/CodeGen/MIR/Generic/machine-basic-block-redefinition-error.mir
+++ b/test/CodeGen/MIR/Generic/machine-basic-block-redefinition-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/Generic/machine-basic-block-undefined-ir-block.mir b/test/CodeGen/MIR/Generic/machine-basic-block-undefined-ir-block.mir
index df559f852ec0..ac9a12b3e44d 100644
--- a/test/CodeGen/MIR/Generic/machine-basic-block-undefined-ir-block.mir
+++ b/test/CodeGen/MIR/Generic/machine-basic-block-undefined-ir-block.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/Generic/machine-basic-block-unknown-name.mir b/test/CodeGen/MIR/Generic/machine-basic-block-unknown-name.mir
index 876947b868b0..98d68f7cd46c 100644
--- a/test/CodeGen/MIR/Generic/machine-basic-block-unknown-name.mir
+++ b/test/CodeGen/MIR/Generic/machine-basic-block-unknown-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported whenever the MIR parser can't find
# a basic block with the machine basic block's name.
diff --git a/test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir b/test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir
index 0dc7477f6275..1896371db36a 100644
--- a/test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir
+++ b/test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser reports an error when it encounters a
# machine function with an empty body.
diff --git a/test/CodeGen/MIR/Generic/machine-function-missing-function.mir b/test/CodeGen/MIR/Generic/machine-function-missing-function.mir
index 6800f8724324..c547bb25d753 100644
--- a/test/CodeGen/MIR/Generic/machine-function-missing-function.mir
+++ b/test/CodeGen/MIR/Generic/machine-function-missing-function.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when the mir file has LLVM IR and
# one of the machine functions has a name that doesn't match any function in
# the LLVM IR.
diff --git a/test/CodeGen/MIR/Generic/machine-function-missing-name.mir b/test/CodeGen/MIR/Generic/machine-function-missing-name.mir
index f65b77880e97..30f0e51b3b66 100644
--- a/test/CodeGen/MIR/Generic/machine-function-missing-name.mir
+++ b/test/CodeGen/MIR/Generic/machine-function-missing-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when a machine function doesn't
# have a name attribute.
diff --git a/test/CodeGen/MIR/Generic/machine-function-redefinition-error.mir b/test/CodeGen/MIR/Generic/machine-function-redefinition-error.mir
index be84161b5630..a05d5357182e 100644
--- a/test/CodeGen/MIR/Generic/machine-function-redefinition-error.mir
+++ b/test/CodeGen/MIR/Generic/machine-function-redefinition-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the machine function errors are reported correctly.
---
diff --git a/test/CodeGen/MIR/Generic/machine-function.mir b/test/CodeGen/MIR/Generic/machine-function.mir
index 1c4ca3d07d2a..64802a13060e 100644
--- a/test/CodeGen/MIR/Generic/machine-function.mir
+++ b/test/CodeGen/MIR/Generic/machine-function.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses machine functions correctly.
--- |
diff --git a/test/CodeGen/MIR/Generic/multiRunPass.mir b/test/CodeGen/MIR/Generic/multiRunPass.mir
new file mode 100644
index 000000000000..bca007de80b7
--- /dev/null
+++ b/test/CodeGen/MIR/Generic/multiRunPass.mir
@@ -0,0 +1,20 @@
+# RUN: llc -run-pass expand-isel-pseudos -run-pass peephole-opt -debug-pass=Arguments -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=PSEUDO_PEEPHOLE
+# RUN: llc -run-pass expand-isel-pseudos,peephole-opt -debug-pass=Arguments -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=PSEUDO_PEEPHOLE
+# RUN: llc -run-pass peephole-opt -run-pass expand-isel-pseudos -debug-pass=Arguments -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=PEEPHOLE_PSEUDO
+# RUN: llc -run-pass peephole-opt,expand-isel-pseudos -debug-pass=Arguments -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=PEEPHOLE_PSEUDO
+# REQUIRES: asserts
+
+# This test ensures that the command line accepts
+# several run passes on the same command line and
+# actually creates the proper pipeline for them.
+# PSEUDO_PEEPHOLE: -expand-isel-pseudos -peephole-opt
+# PEEPHOLE_PSEUDO: -peephole-opt -expand-isel-pseudos
+
+# Make sure there are no other passes happening after what we asked.
+# CHECK-NEXT: --- |
+---
+# CHECK: name: foo
+name: foo
+body: |
+ bb.0:
+...
diff --git a/test/CodeGen/MIR/Generic/register-info.mir b/test/CodeGen/MIR/Generic/register-info.mir
index 229cf0f9130f..bf90196b3e6e 100644
--- a/test/CodeGen/MIR/Generic/register-info.mir
+++ b/test/CodeGen/MIR/Generic/register-info.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses machine register info properties
# correctly.
diff --git a/test/CodeGen/MIR/Hexagon/anti-dep-partial.mir b/test/CodeGen/MIR/Hexagon/anti-dep-partial.mir
new file mode 100644
index 000000000000..a83c53e57cd3
--- /dev/null
+++ b/test/CodeGen/MIR/Hexagon/anti-dep-partial.mir
@@ -0,0 +1,35 @@
+# RUN: llc -march=hexagon -post-RA-scheduler -run-pass post-RA-sched %s -o - | FileCheck %s
+
+--- |
+ declare void @check(i64, i32, i32, i64)
+ define void @foo() {
+ ret void
+ }
+...
+
+---
+name: foo
+tracksRegLiveness: true
+allVRegsAllocated: true
+body: |
+ bb.0:
+ successors:
+ liveins: %r0, %r1, %d1, %d2, %r16, %r17, %r19, %r22, %r23
+ %r2 = A2_add %r23, killed %r17
+ %r6 = M2_mpyi %r16, %r16
+ %r22 = M2_accii %r22, killed %r2, 2
+ %r7 = A2_tfrsi 12345678
+ %r3 = A2_tfr killed %r16
+ %d2 = A2_tfrp killed %d0
+ %r2 = L2_loadri_io %r29, 28
+ %r2 = M2_mpyi killed %r6, killed %r2
+ %r23 = S2_asr_i_r %r22, 31
+ S2_storeri_io killed %r29, 0, killed %r7
+ ; The anti-dependency on r23 between the first A2_add and the
+ ; S2_asr_i_r was causing d11 to be renamed, while r22 remained
+ ; unchanged. Check that the renaming of d11 does not happen.
+ ; CHECK: d11
+ %d0 = A2_tfrp killed %d11
+ J2_call @check, implicit-def %d0, implicit-def %d1, implicit-def %d2, implicit %d0, implicit %d1, implicit %d2
+...
+
diff --git a/test/CodeGen/MIR/Hexagon/lit.local.cfg b/test/CodeGen/MIR/Hexagon/lit.local.cfg
new file mode 100644
index 000000000000..cc6a7edf05f3
--- /dev/null
+++ b/test/CodeGen/MIR/Hexagon/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'Hexagon' in config.root.targets:
+ config.unsupported = True
diff --git a/test/CodeGen/MIR/Lanai/lit.local.cfg b/test/CodeGen/MIR/Lanai/lit.local.cfg
new file mode 100644
index 000000000000..f1b8b4f4e21f
--- /dev/null
+++ b/test/CodeGen/MIR/Lanai/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'Lanai' in config.root.targets:
+ config.unsupported = True
diff --git a/test/CodeGen/MIR/Lanai/peephole-compare.mir b/test/CodeGen/MIR/Lanai/peephole-compare.mir
new file mode 100644
index 000000000000..763fe2b9b961
--- /dev/null
+++ b/test/CodeGen/MIR/Lanai/peephole-compare.mir
@@ -0,0 +1,714 @@
+# RUN: llc -run-pass=peephole-opt %s -o - | FileCheck %s
+
+# Test the compare fold peephole.
+
+# CHECK-LABEL: name: test0a
+# TODO: Enhance combiner to handle this case. This expands into:
+# sub %r7, %r6, %r3
+# sub.f %r7, %r6, %r0
+# sel.eq %r18, %r3, %rv
+# This is different from the pattern currently matched. If the lowered form had
+# been sub.f %r3, 0, %r0, then it would have matched.
+
+# CHECK-LABEL: name: test1a
+# CHECK: [[IN1:%.*]] = COPY %r7
+# CHECK: [[IN2:%.*]] = COPY %r6
+# CHECK: SUB_F_R [[IN1]], [[IN2]], 0, implicit-def %sr
+
+# CHECK-LABEL: name: test1b
+# CHECK: [[IN1:%.*]] = COPY %r7
+# CHECK: [[IN2:%.*]] = COPY %r6
+# CHECK: SUB_F_R [[IN1]], [[IN2]], 0, implicit-def %sr
+
+# CHECK-LABEL: name: test2a
+# CHECK: [[IN1:%.*]] = COPY %r7
+# CHECK: [[IN2:%.*]] = COPY %r6
+# CHECK: SUB_F_R [[IN1]], [[IN2]], 0, implicit-def %sr
+
+# CHECK-LABEL: name: test2b
+# CHECK: [[IN1:%.*]] = COPY %r7
+# CHECK: [[IN2:%.*]] = COPY %r6
+# CHECK: SUB_F_R [[IN1]], [[IN2]], 0, implicit-def %sr
+
+# CHECK-LABEL: name: test3
+# CHECK: AND_F_R
+# CHECK: AND_F_R
+# CHECK: AND_F_R
+
+--- |
+ target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+ target triple = "lanai-unknown-unknown"
+
+ @a = global i32 -1, align 4
+ @b = global i32 0, align 4
+
+ define i32 @test0a(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+ entry:
+ %sub = sub i32 %b, %a
+ %cmp = icmp eq i32 %sub, 0
+ %cond = select i1 %cmp, i32 %c, i32 %sub
+ ret i32 %cond
+ }
+
+ define i32 @test0b(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+ entry:
+ %cmp = icmp eq i32 %b, %a
+ %cond = select i1 %cmp, i32 %c, i32 %b
+ ret i32 %cond
+ }
+
+ define i32 @test1a(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+ entry:
+ %sub = sub i32 %b, %a
+ %cmp = icmp slt i32 %sub, 0
+ %cond = select i1 %cmp, i32 %c, i32 %d
+ ret i32 %cond
+ }
+
+ define i32 @test1b(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+ entry:
+ %sub = sub i32 %b, %a
+ %cmp = icmp slt i32 %sub, 0
+ %cond = select i1 %cmp, i32 %c, i32 %d
+ ret i32 %cond
+ }
+
+ define i32 @test2a(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+ entry:
+ %sub = sub i32 %b, %a
+ %cmp = icmp sgt i32 %sub, -1
+ %cond = select i1 %cmp, i32 %c, i32 %d
+ ret i32 %cond
+ }
+
+ define i32 @test2b(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+ entry:
+ %sub = sub i32 %b, %a
+ %cmp = icmp sgt i32 %sub, -1
+ %cond = select i1 %cmp, i32 %c, i32 %d
+ ret i32 %cond
+ }
+
+ define i32 @test3(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+ entry:
+ %sub = sub i32 %b, %a
+ %cmp = icmp slt i32 %sub, 1
+ %cond = select i1 %cmp, i32 %c, i32 %d
+ ret i32 %cond
+ }
+
+ define i32 @test4(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+ entry:
+ %cmp = icmp ne i32 %a, 0
+ %cmp1 = icmp ult i32 %a, %b
+ %or.cond = and i1 %cmp, %cmp1
+ br i1 %or.cond, label %return, label %if.end
+
+ if.end: ; preds = %entry
+ %cmp2 = icmp ne i32 %b, 0
+ %cmp4 = icmp ult i32 %b, %c
+ %or.cond29 = and i1 %cmp2, %cmp4
+ br i1 %or.cond29, label %return, label %if.end6
+
+ if.end6: ; preds = %if.end
+ %cmp7 = icmp ne i32 %c, 0
+ %cmp9 = icmp ult i32 %c, %d
+ %or.cond30 = and i1 %cmp7, %cmp9
+ br i1 %or.cond30, label %return, label %if.end11
+
+ if.end11: ; preds = %if.end6
+ %cmp12 = icmp ne i32 %d, 0
+ %cmp14 = icmp ult i32 %d, %a
+ %or.cond31 = and i1 %cmp12, %cmp14
+ %b. = select i1 %or.cond31, i32 %b, i32 21
+ ret i32 %b.
+
+ return: ; preds = %if.end6, %if.end, %entry
+ %retval.0 = phi i32 [ %c, %entry ], [ %d, %if.end ], [ %a, %if.end6 ]
+ ret i32 %retval.0
+ }
+
+ define void @testBB() {
+ entry:
+ %0 = load i32, i32* @a, align 4, !tbaa !0
+ %1 = load i32, i32* @b, align 4, !tbaa !0
+ %sub.i = sub i32 %1, %0
+ %tobool = icmp sgt i32 %sub.i, -1
+ br i1 %tobool, label %if.end, label %if.then
+
+ if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
+ br label %while.body
+
+ while.body: ; preds = %while.body, %if.then
+ br label %while.body
+
+ if.end: ; preds = %entry
+ %cmp.i = icmp slt i32 %sub.i, 1
+ br i1 %cmp.i, label %if.then4, label %if.end7
+
+ if.then4: ; preds = %if.end
+ %call5 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
+ br label %while.body6
+
+ while.body6: ; preds = %while.body6, %if.then4
+ br label %while.body6
+
+ if.end7: ; preds = %if.end
+ ret void
+ }
+
+ declare i32 @g(...)
+
+ ; Function Attrs: nounwind
+ declare void @llvm.stackprotector(i8*, i8**) #0
+
+ attributes #0 = { nounwind }
+
+ !0 = !{!1, !1, i64 0}
+ !1 = !{!"int", !2, i64 0}
+ !2 = !{!"omnipotent char", !3, i64 0}
+ !3 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: test0a
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+liveins:
+ - { reg: '%r6', virtual-reg: '%0' }
+ - { reg: '%r7', virtual-reg: '%1' }
+ - { reg: '%r18', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %r6, %r7, %r18
+
+ %2 = COPY %r18
+ %1 = COPY %r7
+ %0 = COPY %r6
+ %4 = SUB_R %1, %0, 0
+ SFSUB_F_RI_LO %4, 0, implicit-def %sr
+ %5 = SELECT %2, %4, 7, implicit %sr
+ %rv = COPY %5
+ RET implicit %rca, implicit %rv
+
+...
+---
+name: test0b
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+liveins:
+ - { reg: '%r6', virtual-reg: '%0' }
+ - { reg: '%r7', virtual-reg: '%1' }
+ - { reg: '%r18', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %r6, %r7, %r18
+
+ %2 = COPY %r18
+ %1 = COPY %r7
+ %0 = COPY %r6
+ SFSUB_F_RR %1, %0, implicit-def %sr
+ %4 = SELECT %2, %1, 7, implicit %sr
+ %rv = COPY %4
+ RET implicit %rca, implicit %rv
+
+...
+---
+name: test1a
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+liveins:
+ - { reg: '%r6', virtual-reg: '%0' }
+ - { reg: '%r7', virtual-reg: '%1' }
+ - { reg: '%r18', virtual-reg: '%2' }
+ - { reg: '%r19', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %r6, %r7, %r18, %r19
+
+ %3 = COPY %r19
+ %2 = COPY %r18
+ %1 = COPY %r7
+ %0 = COPY %r6
+ %4 = SUB_R %1, %0, 0
+ SFSUB_F_RI_LO killed %4, 0, implicit-def %sr
+ %5 = SELECT %2, %3, 11, implicit %sr
+ %rv = COPY %5
+ RET implicit %rca, implicit %rv
+
+...
+---
+name: test1b
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+liveins:
+ - { reg: '%r6', virtual-reg: '%0' }
+ - { reg: '%r7', virtual-reg: '%1' }
+ - { reg: '%r18', virtual-reg: '%2' }
+ - { reg: '%r19', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %r6, %r7, %r18, %r19
+
+ %3 = COPY %r19
+ %2 = COPY %r18
+ %1 = COPY %r7
+ %0 = COPY %r6
+ %4 = SUB_R %1, %0, 0
+ SFSUB_F_RI_LO killed %4, 0, implicit-def %sr
+ %5 = SELECT %2, %3, 11, implicit %sr
+ %rv = COPY %5
+ RET implicit %rca, implicit %rv
+
+...
+---
+name: test2a
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+liveins:
+ - { reg: '%r6', virtual-reg: '%0' }
+ - { reg: '%r7', virtual-reg: '%1' }
+ - { reg: '%r18', virtual-reg: '%2' }
+ - { reg: '%r19', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %r6, %r7, %r18, %r19
+
+ %3 = COPY %r19
+ %2 = COPY %r18
+ %1 = COPY %r7
+ %0 = COPY %r6
+ %4 = SUB_R %1, %0, 0
+ SFSUB_F_RI_LO killed %4, 0, implicit-def %sr
+ %5 = SELECT %2, %3, 10, implicit %sr
+ %rv = COPY %5
+ RET implicit %rca, implicit %rv
+
+...
+---
+name: test2b
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+liveins:
+ - { reg: '%r6', virtual-reg: '%0' }
+ - { reg: '%r7', virtual-reg: '%1' }
+ - { reg: '%r18', virtual-reg: '%2' }
+ - { reg: '%r19', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %r6, %r7, %r18, %r19
+
+ %3 = COPY %r19
+ %2 = COPY %r18
+ %1 = COPY %r7
+ %0 = COPY %r6
+ %4 = SUB_R %1, %0, 0
+ SFSUB_F_RI_LO killed %4, 0, implicit-def %sr
+ %5 = SELECT %2, %3, 10, implicit %sr
+ %rv = COPY %5
+ RET implicit %rca, implicit %rv
+
+...
+---
+name: test3
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+liveins:
+ - { reg: '%r6', virtual-reg: '%0' }
+ - { reg: '%r7', virtual-reg: '%1' }
+ - { reg: '%r18', virtual-reg: '%2' }
+ - { reg: '%r19', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ liveins: %r6, %r7, %r18, %r19
+
+ %3 = COPY %r19
+ %2 = COPY %r18
+ %1 = COPY %r7
+ %0 = COPY %r6
+ %4 = SUB_R %1, %0, 0
+ SFSUB_F_RI_LO killed %4, 1, implicit-def %sr
+ %5 = SELECT %2, %3, 13, implicit %sr
+ %rv = COPY %5
+ RET implicit %rca, implicit %rv
+
+...
+---
+name: test4
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+ - { id: 6, class: gpr }
+ - { id: 7, class: gpr }
+ - { id: 8, class: gpr }
+ - { id: 9, class: gpr }
+ - { id: 10, class: gpr }
+ - { id: 11, class: gpr }
+ - { id: 12, class: gpr }
+ - { id: 13, class: gpr }
+ - { id: 14, class: gpr }
+ - { id: 15, class: gpr }
+ - { id: 16, class: gpr }
+ - { id: 17, class: gpr }
+ - { id: 18, class: gpr }
+ - { id: 19, class: gpr }
+ - { id: 20, class: gpr }
+ - { id: 21, class: gpr }
+ - { id: 22, class: gpr }
+liveins:
+ - { reg: '%r6', virtual-reg: '%1' }
+ - { reg: '%r7', virtual-reg: '%2' }
+ - { reg: '%r18', virtual-reg: '%3' }
+ - { reg: '%r19', virtual-reg: '%4' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ successors: %bb.4.return, %bb.1.if.end
+ liveins: %r6, %r7, %r18, %r19
+
+ %4 = COPY %r19
+ %3 = COPY %r18
+ %2 = COPY %r7
+ %1 = COPY %r6
+ SFSUB_F_RI_LO %1, 0, implicit-def %sr
+ %5 = SCC 6, implicit %sr
+ SFSUB_F_RR %1, %2, implicit-def %sr
+ %6 = SCC 4, implicit %sr
+ %7 = AND_R killed %5, killed %6, 0
+ %8 = SLI 1
+ %9 = AND_R killed %7, %8, 0
+ SFSUB_F_RI_LO killed %9, 0, implicit-def %sr
+ BRCC %bb.4.return, 6, implicit %sr
+ BT %bb.1.if.end
+
+ bb.1.if.end:
+ successors: %bb.4.return, %bb.2.if.end6
+
+ SFSUB_F_RI_LO %2, 0, implicit-def %sr
+ %10 = SCC 6, implicit %sr
+ SFSUB_F_RR %2, %3, implicit-def %sr
+ %11 = SCC 4, implicit %sr
+ %12 = AND_R killed %10, killed %11, 0
+ %14 = AND_R killed %12, %8, 0
+ SFSUB_F_RI_LO killed %14, 0, implicit-def %sr
+ BRCC %bb.4.return, 6, implicit %sr
+ BT %bb.2.if.end6
+
+ bb.2.if.end6:
+ successors: %bb.4.return, %bb.3.if.end11
+
+ SFSUB_F_RI_LO %3, 0, implicit-def %sr
+ %15 = SCC 6, implicit %sr
+ SFSUB_F_RR %3, %4, implicit-def %sr
+ %16 = SCC 4, implicit %sr
+ %17 = AND_R killed %15, killed %16, 0
+ %18 = SLI 1
+ %19 = AND_R killed %17, killed %18, 0
+ SFSUB_F_RI_LO killed %19, 0, implicit-def %sr
+ BRCC %bb.4.return, 6, implicit %sr
+ BT %bb.3.if.end11
+
+ bb.3.if.end11:
+ %20 = SLI 21
+ SFSUB_F_RR %4, %1, implicit-def %sr
+ %21 = SELECT %2, %20, 4, implicit %sr
+ SFSUB_F_RI_LO %4, 0, implicit-def %sr
+ %22 = SELECT killed %21, %20, 6, implicit %sr
+ %rv = COPY %22
+ RET implicit %rca, implicit %rv
+
+ bb.4.return:
+ %0 = PHI %3, %bb.0.entry, %4, %bb.1.if.end, %1, %bb.2.if.end6
+ %rv = COPY %0
+ RET implicit %rca, implicit %rv
+
+...
+---
+name: testBB
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+ - { id: 6, class: gpr }
+ - { id: 7, class: gpr }
+ - { id: 8, class: gpr }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: true
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ successors: %bb.3.if.end, %bb.1.if.then
+
+ %1 = MOVHI target-flags(lanai-hi) @a
+ %2 = OR_I_LO killed %1, target-flags(lanai-lo) @a
+ %3 = LDW_RI killed %2, 0, 0 :: (load 4 from @a, !tbaa !0)
+ %4 = MOVHI target-flags(lanai-hi) @b
+ %5 = OR_I_LO killed %4, target-flags(lanai-lo) @b
+ %6 = LDW_RI killed %5, 0, 0 :: (load 4 from @b, !tbaa !0)
+ %0 = SUB_R killed %6, killed %3, 0
+ SFSUB_F_RI_LO %0, 0, implicit-def %sr
+ BRCC %bb.3.if.end, 10, implicit %sr
+ BT %bb.1.if.then
+
+ bb.1.if.then:
+ successors: %bb.2.while.body
+
+ ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp
+ CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv
+ ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp
+
+ bb.2.while.body:
+ successors: %bb.2.while.body
+
+ BT %bb.2.while.body
+
+ bb.3.if.end:
+ successors: %bb.4.if.then4, %bb.6.if.end7
+ liveins: %sr
+
+ BRCC %bb.6.if.end7, 14, implicit %sr
+ BT %bb.4.if.then4
+
+ bb.4.if.then4:
+ successors: %bb.5.while.body6
+
+ ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp
+ CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv
+ ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp
+
+ bb.5.while.body6:
+ successors: %bb.5.while.body6
+
+ BT %bb.5.while.body6
+
+ bb.6.if.end7:
+ RET implicit %rca
+
+...
diff --git a/test/CodeGen/MIR/Mips/expected-global-value-or-symbol-after-call-entry.mir b/test/CodeGen/MIR/Mips/expected-global-value-or-symbol-after-call-entry.mir
index ea94c9906557..cc7a96ff50cc 100644
--- a/test/CodeGen/MIR/Mips/expected-global-value-or-symbol-after-call-entry.mir
+++ b/test/CodeGen/MIR/Mips/expected-global-value-or-symbol-after-call-entry.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=mipsel -mattr=mips16 -relocation-model=pic -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=mipsel -mattr=mips16 -relocation-model=pic -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @test(i32 %a) {
entry:
diff --git a/test/CodeGen/MIR/Mips/memory-operands.mir b/test/CodeGen/MIR/Mips/memory-operands.mir
index d4206b067f7e..69ecf985fcc0 100644
--- a/test/CodeGen/MIR/Mips/memory-operands.mir
+++ b/test/CodeGen/MIR/Mips/memory-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the call entry pseudo source
# values in memory operands correctly.
@@ -50,8 +50,8 @@ body: |
%v0 = SllX16 killed %v0, 16
%v0 = AdduRxRyRz16 killed %v1, killed %v0
; CHECK-LABEL: name: test
- ; CHECK: %v1 = LwRxRyOffMemX16 %v0, @foo, 0 :: (load 4 from call-entry @foo)
- %v1 = LwRxRyOffMemX16 %v0, @foo, 0 :: (load 4 from call-entry @foo)
+ ; CHECK: %v1 = LwRxRyOffMemX16 %v0, @foo :: (load 4 from call-entry @foo)
+ %v1 = LwRxRyOffMemX16 %v0, @foo :: (load 4 from call-entry @foo)
%t9 = COPY %v1
%gp = COPY killed %v0
JumpLinkReg16 killed %v1, csr_o32, implicit-def %ra, implicit killed %t9, implicit %a0, implicit killed %gp, implicit-def %sp, implicit-def dead %v0
@@ -87,13 +87,13 @@ body: |
%v0, %v1 = GotPrologue16 $_gp_disp, $_gp_disp
%v0 = SllX16 killed %v0, 16
%s0 = AdduRxRyRz16 killed %v1, killed %v0
- %v0 = LwRxRyOffMemX16 %s0, @g, 0 :: (load 4 from call-entry @g)
+ %v0 = LwRxRyOffMemX16 %s0, @g :: (load 4 from call-entry @g)
; CHECK-LABEL: test2
- ; CHECK: %v1 = LwRxRyOffMemX16 %s0, $__mips16_call_stub_sf_0, 0 :: (load 4 from call-entry $__mips16_call_stub_sf_0)
- %v1 = LwRxRyOffMemX16 %s0, $__mips16_call_stub_sf_0, 0 :: (load 4 from call-entry $__mips16_call_stub_sf_0)
+ ; CHECK: %v1 = LwRxRyOffMemX16 %s0, $__mips16_call_stub_sf_0 :: (load 4 from call-entry $__mips16_call_stub_sf_0)
+ %v1 = LwRxRyOffMemX16 %s0, $__mips16_call_stub_sf_0 :: (load 4 from call-entry $__mips16_call_stub_sf_0)
%gp = COPY %s0
JumpLinkReg16 killed %v1, csr_o32, implicit-def %ra, implicit %v0, implicit killed %gp, implicit-def %sp, implicit-def %v0
- %v1 = LwRxRyOffMemX16 %s0, @__mips16_ret_sf, 0 :: (load 4 from call-entry @__mips16_ret_sf)
+ %v1 = LwRxRyOffMemX16 %s0, @__mips16_ret_sf :: (load 4 from call-entry @__mips16_ret_sf)
%t9 = COPY %v1
%gp = COPY killed %s0
JumpLinkReg16 killed %v1, csr_mips16rethelper, implicit-def %ra, implicit killed %t9, implicit %v0, implicit killed %gp, implicit-def %sp
diff --git a/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir b/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
index 28fb2a2cf5c9..d35fd323bf5d 100644
--- a/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
+++ b/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=nvptx -mcpu=sm_20 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=nvptx -mcpu=sm_20 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
index 18866d58a946..312bf004a9ce 100644
--- a/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
+++ b/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=nvptx -mcpu=sm_20 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=nvptx -mcpu=sm_20 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses floating point constant operands
# correctly.
diff --git a/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir b/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
index e4080f80ee52..2ff7f1a9451d 100644
--- a/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
+++ b/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=nvptx -mcpu=sm_20 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=nvptx -mcpu=sm_20 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/PowerPC/unordered-implicit-registers.mir b/test/CodeGen/MIR/PowerPC/unordered-implicit-registers.mir
index 39d14e72ffee..3caab2c7a578 100644
--- a/test/CodeGen/MIR/PowerPC/unordered-implicit-registers.mir
+++ b/test/CodeGen/MIR/PowerPC/unordered-implicit-registers.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=powerpc64-unknown-linux-gnu -start-after machine-combiner -stop-after machine-combiner -o /dev/null %s | FileCheck %s
+# RUN: llc -mtriple=powerpc64-unknown-linux-gnu -run-pass none -o - %s | FileCheck %s
# PR24724
--- |
diff --git a/test/CodeGen/MIR/X86/basic-block-liveins.mir b/test/CodeGen/MIR/X86/basic-block-liveins.mir
index 00732975495d..35f5512936ba 100644
--- a/test/CodeGen/MIR/X86/basic-block-liveins.mir
+++ b/test/CodeGen/MIR/X86/basic-block-liveins.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses basic block liveins correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/basic-block-not-at-start-of-line-error.mir b/test/CodeGen/MIR/X86/basic-block-not-at-start-of-line-error.mir
index b4b7dddea56c..01c226a34537 100644
--- a/test/CodeGen/MIR/X86/basic-block-not-at-start-of-line-error.mir
+++ b/test/CodeGen/MIR/X86/basic-block-not-at-start-of-line-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/block-address-operands.mir b/test/CodeGen/MIR/X86/block-address-operands.mir
index 3c2d2aefff20..2207f9360965 100644
--- a/test/CodeGen/MIR/X86/block-address-operands.mir
+++ b/test/CodeGen/MIR/X86/block-address-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the block address operands
# correctly.
diff --git a/test/CodeGen/MIR/X86/callee-saved-info.mir b/test/CodeGen/MIR/X86/callee-saved-info.mir
index 17c7739951d9..883f6fdb0d22 100644
--- a/test/CodeGen/MIR/X86/callee-saved-info.mir
+++ b/test/CodeGen/MIR/X86/callee-saved-info.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after prologepilog -stop-after prologepilog -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses callee saved information in the
# stack objects correctly.
diff --git a/test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir b/test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir
index 47051a53e3f4..ed26df684b00 100644
--- a/test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir
+++ b/test/CodeGen/MIR/X86/cfi-def-cfa-offset.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the .cfi_def_cfa_offset operands
# correctly.
diff --git a/test/CodeGen/MIR/X86/cfi-def-cfa-register.mir b/test/CodeGen/MIR/X86/cfi-def-cfa-register.mir
index 74a33b5c3437..9a57eb047b87 100644
--- a/test/CodeGen/MIR/X86/cfi-def-cfa-register.mir
+++ b/test/CodeGen/MIR/X86/cfi-def-cfa-register.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the .cfi_def_cfa_register
# operands correctly.
diff --git a/test/CodeGen/MIR/X86/cfi-offset.mir b/test/CodeGen/MIR/X86/cfi-offset.mir
index fd9e605a036a..0a50fe1866f4 100644
--- a/test/CodeGen/MIR/X86/cfi-offset.mir
+++ b/test/CodeGen/MIR/X86/cfi-offset.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the .cfi_offset operands
# correctly.
diff --git a/test/CodeGen/MIR/X86/constant-pool-item-redefinition-error.mir b/test/CodeGen/MIR/X86/constant-pool-item-redefinition-error.mir
index 2ddf5736b977..2f016a7599e3 100644
--- a/test/CodeGen/MIR/X86/constant-pool-item-redefinition-error.mir
+++ b/test/CodeGen/MIR/X86/constant-pool-item-redefinition-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/constant-pool.mir b/test/CodeGen/MIR/X86/constant-pool.mir
index 213e4e283485..3312e6f67bde 100644
--- a/test/CodeGen/MIR/X86/constant-pool.mir
+++ b/test/CodeGen/MIR/X86/constant-pool.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses constant pool constants and
# constant pool operands correctly.
diff --git a/test/CodeGen/MIR/X86/constant-value-error.mir b/test/CodeGen/MIR/X86/constant-value-error.mir
index 1e14d2282c5a..baf933a87105 100644
--- a/test/CodeGen/MIR/X86/constant-value-error.mir
+++ b/test/CodeGen/MIR/X86/constant-value-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser reports an error when parsing an invalid
# constant value.
diff --git a/test/CodeGen/MIR/X86/dead-register-flag.mir b/test/CodeGen/MIR/X86/dead-register-flag.mir
index 309e776de46a..e6ab458e7389 100644
--- a/test/CodeGen/MIR/X86/dead-register-flag.mir
+++ b/test/CodeGen/MIR/X86/dead-register-flag.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the 'dead' register flags
# correctly.
diff --git a/test/CodeGen/MIR/X86/def-register-already-tied-error.mir b/test/CodeGen/MIR/X86/def-register-already-tied-error.mir
index 69c816f59b9b..bd9365b5f416 100644
--- a/test/CodeGen/MIR/X86/def-register-already-tied-error.mir
+++ b/test/CodeGen/MIR/X86/def-register-already-tied-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i64 @test(i64 %x) #0 {
entry:
diff --git a/test/CodeGen/MIR/X86/duplicate-memory-operand-flag.mir b/test/CodeGen/MIR/X86/duplicate-memory-operand-flag.mir
index 7d01810c792b..0c15e84f2268 100644
--- a/test/CodeGen/MIR/X86/duplicate-memory-operand-flag.mir
+++ b/test/CodeGen/MIR/X86/duplicate-memory-operand-flag.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/duplicate-register-flag-error.mir b/test/CodeGen/MIR/X86/duplicate-register-flag-error.mir
index d80c6ed061de..9d8f4f159304 100644
--- a/test/CodeGen/MIR/X86/duplicate-register-flag-error.mir
+++ b/test/CodeGen/MIR/X86/duplicate-register-flag-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/early-clobber-register-flag.mir b/test/CodeGen/MIR/X86/early-clobber-register-flag.mir
index 4dc442e4fb94..0870fa062be6 100644
--- a/test/CodeGen/MIR/X86/early-clobber-register-flag.mir
+++ b/test/CodeGen/MIR/X86/early-clobber-register-flag.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the 'early-clobber' register
# flags correctly.
diff --git a/test/CodeGen/MIR/X86/expected-align-in-memory-operand.mir b/test/CodeGen/MIR/X86/expected-align-in-memory-operand.mir
index f2e349454c5d..f65a5e6c9486 100644
--- a/test/CodeGen/MIR/X86/expected-align-in-memory-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-align-in-memory-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-alignment-after-align-in-memory-operand.mir b/test/CodeGen/MIR/X86/expected-alignment-after-align-in-memory-operand.mir
index 7ce377f8c5fb..9bde7bf279a9 100644
--- a/test/CodeGen/MIR/X86/expected-alignment-after-align-in-memory-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-alignment-after-align-in-memory-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-basic-block-at-start-of-body.mir b/test/CodeGen/MIR/X86/expected-basic-block-at-start-of-body.mir
index 861baec4bcbc..0de5b5bc6878 100644
--- a/test/CodeGen/MIR/X86/expected-basic-block-at-start-of-body.mir
+++ b/test/CodeGen/MIR/X86/expected-basic-block-at-start-of-body.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-block-reference-in-blockaddress.mir b/test/CodeGen/MIR/X86/expected-block-reference-in-blockaddress.mir
index ef7df4c8c20f..c74d42d4dcc7 100644
--- a/test/CodeGen/MIR/X86/expected-block-reference-in-blockaddress.mir
+++ b/test/CodeGen/MIR/X86/expected-block-reference-in-blockaddress.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-comma-after-cfi-register.mir b/test/CodeGen/MIR/X86/expected-comma-after-cfi-register.mir
index ba7b2ab64c3e..52ba166094f3 100644
--- a/test/CodeGen/MIR/X86/expected-comma-after-cfi-register.mir
+++ b/test/CodeGen/MIR/X86/expected-comma-after-cfi-register.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-comma-after-memory-operand.mir b/test/CodeGen/MIR/X86/expected-comma-after-memory-operand.mir
index dd5693952573..f617ddfa0eb0 100644
--- a/test/CodeGen/MIR/X86/expected-comma-after-memory-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-comma-after-memory-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir b/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir
index 601551a7720a..d96d263a3204 100644
--- a/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-different-implicit-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir b/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir
index 6494960d3264..bd6cf6bd576d 100644
--- a/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir
+++ b/test/CodeGen/MIR/X86/expected-different-implicit-register-flag.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-from-in-memory-operand.mir b/test/CodeGen/MIR/X86/expected-from-in-memory-operand.mir
deleted file mode 100644
index f9e9d0b22968..000000000000
--- a/test/CodeGen/MIR/X86/expected-from-in-memory-operand.mir
+++ /dev/null
@@ -1,24 +0,0 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
-
---- |
-
- define i32 @test(i32* %a) {
- entry:
- %b = load i32, i32* %a
- ret i32 %b
- }
-
-...
----
-name: test
-tracksRegLiveness: true
-liveins:
- - { reg: '%rdi' }
-body: |
- bb.0.entry:
- liveins: %rdi
- ; CHECK: [[@LINE+1]]:55: expected 'from'
- %eax = MOV32rm killed %rdi, 1, _, 0, _ :: (load 4 %ir.a)
- RETQ %eax
-...
-
diff --git a/test/CodeGen/MIR/X86/expected-function-reference-after-blockaddress.mir b/test/CodeGen/MIR/X86/expected-function-reference-after-blockaddress.mir
index de6a745fd702..2f53023ecdb2 100644
--- a/test/CodeGen/MIR/X86/expected-function-reference-after-blockaddress.mir
+++ b/test/CodeGen/MIR/X86/expected-function-reference-after-blockaddress.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-global-value-after-blockaddress.mir b/test/CodeGen/MIR/X86/expected-global-value-after-blockaddress.mir
index f737c06c3e1e..1cabcfc73c2a 100644
--- a/test/CodeGen/MIR/X86/expected-global-value-after-blockaddress.mir
+++ b/test/CodeGen/MIR/X86/expected-global-value-after-blockaddress.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-integer-after-offset-sign.mir b/test/CodeGen/MIR/X86/expected-integer-after-offset-sign.mir
index e337292f17a2..f2f354b5a7c9 100644
--- a/test/CodeGen/MIR/X86/expected-integer-after-offset-sign.mir
+++ b/test/CodeGen/MIR/X86/expected-integer-after-offset-sign.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir b/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
index 580d2bc0a419..c3f4fca11eaa 100644
--- a/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
+++ b/test/CodeGen/MIR/X86/expected-integer-after-tied-def.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i64 @test(i64 %x) #0 {
entry:
diff --git a/test/CodeGen/MIR/X86/expected-integer-in-successor-weight.mir b/test/CodeGen/MIR/X86/expected-integer-in-successor-weight.mir
index 83874eb67476..e8f06358505b 100644
--- a/test/CodeGen/MIR/X86/expected-integer-in-successor-weight.mir
+++ b/test/CodeGen/MIR/X86/expected-integer-in-successor-weight.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-load-or-store-in-memory-operand.mir b/test/CodeGen/MIR/X86/expected-load-or-store-in-memory-operand.mir
index 8fcd622a18e6..225f767c5558 100644
--- a/test/CodeGen/MIR/X86/expected-load-or-store-in-memory-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-load-or-store-in-memory-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-machine-operand.mir b/test/CodeGen/MIR/X86/expected-machine-operand.mir
index 3ba5126b9982..70fff3daa093 100644
--- a/test/CodeGen/MIR/X86/expected-machine-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-machine-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-metadata-node-after-debug-location.mir b/test/CodeGen/MIR/X86/expected-metadata-node-after-debug-location.mir
index 620bb5d961ee..c5d7d5eb2893 100644
--- a/test/CodeGen/MIR/X86/expected-metadata-node-after-debug-location.mir
+++ b/test/CodeGen/MIR/X86/expected-metadata-node-after-debug-location.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
@@ -20,11 +20,10 @@
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
- !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "test.ll", directory: "")
!2 = !{}
- !3 = !{!4}
- !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, variables: !2)
+ !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
!5 = !DIFile(filename: "test.c", directory: "")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
diff --git a/test/CodeGen/MIR/X86/expected-metadata-node-after-exclaim.mir b/test/CodeGen/MIR/X86/expected-metadata-node-after-exclaim.mir
index 6497f5db2026..c94fd9f5028e 100644
--- a/test/CodeGen/MIR/X86/expected-metadata-node-after-exclaim.mir
+++ b/test/CodeGen/MIR/X86/expected-metadata-node-after-exclaim.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
@@ -20,11 +20,10 @@
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
- !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "test.ll", directory: "")
!2 = !{}
- !3 = !{!4}
- !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, variables: !2)
+ !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
!5 = !DIFile(filename: "test.c", directory: "")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
diff --git a/test/CodeGen/MIR/X86/expected-metadata-node-in-stack-object.mir b/test/CodeGen/MIR/X86/expected-metadata-node-in-stack-object.mir
index 9a4696779fb5..dc8cb02338ea 100644
--- a/test/CodeGen/MIR/X86/expected-metadata-node-in-stack-object.mir
+++ b/test/CodeGen/MIR/X86/expected-metadata-node-in-stack-object.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @test(i32 %x) {
entry:
diff --git a/test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir b/test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir
index 04568f6dde57..bba7b1a6e4a0 100644
--- a/test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir
+++ b/test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-scheduler -stop-after machine-scheduler -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-named-register-in-callee-saved-register.mir b/test/CodeGen/MIR/X86/expected-named-register-in-callee-saved-register.mir
index be57734ecf33..424f7cb21c45 100644
--- a/test/CodeGen/MIR/X86/expected-named-register-in-callee-saved-register.mir
+++ b/test/CodeGen/MIR/X86/expected-named-register-in-callee-saved-register.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after prologepilog -stop-after prologepilog -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-named-register-in-functions-livein.mir b/test/CodeGen/MIR/X86/expected-named-register-in-functions-livein.mir
index ae9f776ad769..a6384bb07197 100644
--- a/test/CodeGen/MIR/X86/expected-named-register-in-functions-livein.mir
+++ b/test/CodeGen/MIR/X86/expected-named-register-in-functions-livein.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-named-register-livein.mir b/test/CodeGen/MIR/X86/expected-named-register-livein.mir
index 41e6a4a6cc88..fcd68d3f614c 100644
--- a/test/CodeGen/MIR/X86/expected-named-register-livein.mir
+++ b/test/CodeGen/MIR/X86/expected-named-register-livein.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-newline-at-end-of-list.mir b/test/CodeGen/MIR/X86/expected-newline-at-end-of-list.mir
index 1f0439d126f4..238d7aa6ffb8 100644
--- a/test/CodeGen/MIR/X86/expected-newline-at-end-of-list.mir
+++ b/test/CodeGen/MIR/X86/expected-newline-at-end-of-list.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-number-after-bb.mir b/test/CodeGen/MIR/X86/expected-number-after-bb.mir
index a239cf176f5f..6770031da807 100644
--- a/test/CodeGen/MIR/X86/expected-number-after-bb.mir
+++ b/test/CodeGen/MIR/X86/expected-number-after-bb.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-offset-after-cfi-operand.mir b/test/CodeGen/MIR/X86/expected-offset-after-cfi-operand.mir
index aefeed9ce05e..e3c5ee9b9c88 100644
--- a/test/CodeGen/MIR/X86/expected-offset-after-cfi-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-offset-after-cfi-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-pointer-value-in-memory-operand.mir b/test/CodeGen/MIR/X86/expected-pointer-value-in-memory-operand.mir
index fca078c3497c..3d127f855ced 100644
--- a/test/CodeGen/MIR/X86/expected-pointer-value-in-memory-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-pointer-value-in-memory-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-positive-alignment-after-align.mir b/test/CodeGen/MIR/X86/expected-positive-alignment-after-align.mir
index 31b4c5be1251..1119133fc113 100644
--- a/test/CodeGen/MIR/X86/expected-positive-alignment-after-align.mir
+++ b/test/CodeGen/MIR/X86/expected-positive-alignment-after-align.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-register-after-cfi-operand.mir b/test/CodeGen/MIR/X86/expected-register-after-cfi-operand.mir
index 3280fca6d551..eea795821465 100644
--- a/test/CodeGen/MIR/X86/expected-register-after-cfi-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-register-after-cfi-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-register-after-flags.mir b/test/CodeGen/MIR/X86/expected-register-after-flags.mir
index 68f1060ad873..dc679ea7fc22 100644
--- a/test/CodeGen/MIR/X86/expected-register-after-flags.mir
+++ b/test/CodeGen/MIR/X86/expected-register-after-flags.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when a register operand doesn't
# follow register flags.
diff --git a/test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir b/test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir
index 71ff15bd9c52..cfa03247e31f 100644
--- a/test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir
+++ b/test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-stack-object.mir b/test/CodeGen/MIR/X86/expected-stack-object.mir
index ff0c10d59e33..c536295c42f4 100644
--- a/test/CodeGen/MIR/X86/expected-stack-object.mir
+++ b/test/CodeGen/MIR/X86/expected-stack-object.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir b/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir
index 6283427c10b3..d1d62461d371 100644
--- a/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir
+++ b/test/CodeGen/MIR/X86/expected-subregister-after-colon.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-target-flag-name.mir b/test/CodeGen/MIR/X86/expected-target-flag-name.mir
index 3d094a11e9f3..c3ee45d96606 100644
--- a/test/CodeGen/MIR/X86/expected-target-flag-name.mir
+++ b/test/CodeGen/MIR/X86/expected-target-flag-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir b/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir
index e8d6afd5333e..9e307f8833d9 100644
--- a/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir
+++ b/test/CodeGen/MIR/X86/expected-tied-def-after-lparen.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i64 @test(i64 %x) #0 {
entry:
diff --git a/test/CodeGen/MIR/X86/expected-value-in-memory-operand.mir b/test/CodeGen/MIR/X86/expected-value-in-memory-operand.mir
index f99443f1726d..a76202eb55b4 100644
--- a/test/CodeGen/MIR/X86/expected-value-in-memory-operand.mir
+++ b/test/CodeGen/MIR/X86/expected-value-in-memory-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/expected-virtual-register-in-functions-livein.mir b/test/CodeGen/MIR/X86/expected-virtual-register-in-functions-livein.mir
index da0d1e166a1c..cdfcabbbf827 100644
--- a/test/CodeGen/MIR/X86/expected-virtual-register-in-functions-livein.mir
+++ b/test/CodeGen/MIR/X86/expected-virtual-register-in-functions-livein.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/external-symbol-operands.mir b/test/CodeGen/MIR/X86/external-symbol-operands.mir
index 7e85d946b75a..599f957f66d5 100644
--- a/test/CodeGen/MIR/X86/external-symbol-operands.mir
+++ b/test/CodeGen/MIR/X86/external-symbol-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the external symbol machine
# operands correctly.
diff --git a/test/CodeGen/MIR/X86/fixed-stack-memory-operands.mir b/test/CodeGen/MIR/X86/fixed-stack-memory-operands.mir
index 75d0f8a39c1c..f7a48d2eec06 100644
--- a/test/CodeGen/MIR/X86/fixed-stack-memory-operands.mir
+++ b/test/CodeGen/MIR/X86/fixed-stack-memory-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses fixed stack memory operands
# correctly.
diff --git a/test/CodeGen/MIR/X86/fixed-stack-object-redefinition-error.mir b/test/CodeGen/MIR/X86/fixed-stack-object-redefinition-error.mir
index c4c57a1d2443..d1b7c1633c29 100644
--- a/test/CodeGen/MIR/X86/fixed-stack-object-redefinition-error.mir
+++ b/test/CodeGen/MIR/X86/fixed-stack-object-redefinition-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/fixed-stack-objects.mir b/test/CodeGen/MIR/X86/fixed-stack-objects.mir
index 70e5a7428359..a7ecac841a64 100644
--- a/test/CodeGen/MIR/X86/fixed-stack-objects.mir
+++ b/test/CodeGen/MIR/X86/fixed-stack-objects.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses fixed stack objects correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir
index 54fa8ad0b616..2d5347e5d30d 100644
--- a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir
+++ b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -enable-shrink-wrap=true -start-after shrink-wrap -stop-after shrink-wrap -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the save and restore points in
# the machine frame info correctly.
diff --git a/test/CodeGen/MIR/X86/frame-info-stack-references.mir b/test/CodeGen/MIR/X86/frame-info-stack-references.mir
index c8fa3bbe226f..e64b44c65f81 100644
--- a/test/CodeGen/MIR/X86/frame-info-stack-references.mir
+++ b/test/CodeGen/MIR/X86/frame-info-stack-references.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the stack protector stack
# object reference in the machine frame info correctly.
diff --git a/test/CodeGen/MIR/X86/frame-setup-instruction-flag.mir b/test/CodeGen/MIR/X86/frame-setup-instruction-flag.mir
index 87c1fc68046e..5ae4df459437 100644
--- a/test/CodeGen/MIR/X86/frame-setup-instruction-flag.mir
+++ b/test/CodeGen/MIR/X86/frame-setup-instruction-flag.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the frame setup instruction flag.
--- |
diff --git a/test/CodeGen/MIR/X86/function-liveins.mir b/test/CodeGen/MIR/X86/function-liveins.mir
index 95f8786b47a8..cbdc36281b71 100644
--- a/test/CodeGen/MIR/X86/function-liveins.mir
+++ b/test/CodeGen/MIR/X86/function-liveins.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses machine function's liveins
# correctly.
diff --git a/test/CodeGen/MIR/X86/generic-instr-type-error.mir b/test/CodeGen/MIR/X86/generic-instr-type-error.mir
new file mode 100644
index 000000000000..1f196919afa0
--- /dev/null
+++ b/test/CodeGen/MIR/X86/generic-instr-type-error.mir
@@ -0,0 +1,15 @@
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the MIR parser reports an error for
+# opaque types used in generic instructions.
+
+---
+name: bar
+isSSA: true
+registers:
+ - { id: 0, class: gr32 }
+body: |
+ bb.0.entry:
+ liveins: %edi
+ ; CHECK: [[@LINE+1]]:20: expected a sized type
+ %0(32) = G_ADD %opaque %edi, %edi
+...
diff --git a/test/CodeGen/MIR/X86/generic-virtual-registers.mir b/test/CodeGen/MIR/X86/generic-virtual-registers.mir
new file mode 100644
index 000000000000..225ddfbc452a
--- /dev/null
+++ b/test/CodeGen/MIR/X86/generic-virtual-registers.mir
@@ -0,0 +1,48 @@
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
+# REQUIRES: global-isel
+# This test ensures that the MIR parser parses generic virtual
+# register definitions correctly.
+
+--- |
+ ; ModuleID = 'generic-virtual-registers-type-error.mir'
+ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+ %type_alias = type <2 x i32>
+ %structure_alias = type { i32, i16 }
+ define void @bar() {
+ entry:
+ ret void
+ }
+
+...
+
+---
+name: bar
+isSSA: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: _ }
+# CHECK-NEXT: - { id: 1, class: _ }
+# CHECK-NEXT: - { id: 2, class: _ }
+# CHECK-NEXT: - { id: 3, class: _ }
+# CHECK-NEXT: - { id: 4, class: _ }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+body: |
+ bb.0.entry:
+ liveins: %edi
+ ; CHECK: %0(32) = G_ADD i32 %edi
+ %0(32) = G_ADD i32 %edi, %edi
+ ; CHECK: %1(64) = G_ADD <2 x i32> %edi
+ %1(64) = G_ADD <2 x i32> %edi, %edi
+ ; CHECK: %2(64) = G_ADD <2 x i32> %edi
+ %2(64) = G_ADD %type_alias %edi, %edi
+  ; G_ADD is actually not a valid operation for a structure type,
+ ; but that is the only one we have for now for testing.
+ ; CHECK: %3(64) = G_ADD { i32, i32 } %edi
+ %3(64) = G_ADD {i32, i32} %edi, %edi
+ ; CHECK: %4(48) = G_ADD %structure_alias %edi
+ %4(48) = G_ADD %structure_alias %edi, %edi
+...
diff --git a/test/CodeGen/MIR/X86/global-value-operands.mir b/test/CodeGen/MIR/X86/global-value-operands.mir
index 394aa397aef4..9b9554da7bd6 100644
--- a/test/CodeGen/MIR/X86/global-value-operands.mir
+++ b/test/CodeGen/MIR/X86/global-value-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses global value operands correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/immediate-operands.mir b/test/CodeGen/MIR/X86/immediate-operands.mir
index 34bd0fa14904..4d47219bf3b1 100644
--- a/test/CodeGen/MIR/X86/immediate-operands.mir
+++ b/test/CodeGen/MIR/X86/immediate-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses immediate machine operands.
--- |
diff --git a/test/CodeGen/MIR/X86/implicit-register-flag.mir b/test/CodeGen/MIR/X86/implicit-register-flag.mir
index b0a15ed93a8f..70b1cc500944 100644
--- a/test/CodeGen/MIR/X86/implicit-register-flag.mir
+++ b/test/CodeGen/MIR/X86/implicit-register-flag.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the 'implicit' and 'implicit-def'
# register flags correctly.
diff --git a/test/CodeGen/MIR/X86/inline-asm-registers.mir b/test/CodeGen/MIR/X86/inline-asm-registers.mir
index 3fd565891091..f0e8d1fcd8ff 100644
--- a/test/CodeGen/MIR/X86/inline-asm-registers.mir
+++ b/test/CodeGen/MIR/X86/inline-asm-registers.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after block-placement -stop-after block-placement -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
--- |
define i64 @test(i64 %x, i64 %y) #0 {
diff --git a/test/CodeGen/MIR/X86/instructions-debug-location.mir b/test/CodeGen/MIR/X86/instructions-debug-location.mir
index ea2cdbf7cb2f..12ee5d873d94 100644
--- a/test/CodeGen/MIR/X86/instructions-debug-location.mir
+++ b/test/CodeGen/MIR/X86/instructions-debug-location.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the machine instruction's
# debug location metadata correctly.
@@ -31,11 +31,10 @@
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
- !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "test.ll", directory: "")
!2 = !{}
- !3 = !{!4}
- !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, variables: !2)
+ !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
!5 = !DIFile(filename: "test.c", directory: "")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
@@ -62,9 +61,9 @@ stack:
body: |
bb.0.entry:
liveins: %edi
- ; CHECK: DBG_VALUE debug-use _, 0, !12, !13, debug-location !14
- ; CHECK: %eax = COPY %0, debug-location !15
- ; CHECK: RETQ %eax, debug-location !15
+ ; CHECK: DBG_VALUE debug-use _, 0, !11, !12, debug-location !13
+ ; CHECK: %eax = COPY %0, debug-location !14
+ ; CHECK: RETQ %eax, debug-location !14
%0 = COPY %edi
DBG_VALUE debug-use _, 0, !12, !13, debug-location !14
MOV32mr %stack.0.x.addr, 1, _, 0, _, %0
@@ -86,9 +85,9 @@ body: |
liveins: %edi
%0 = COPY %edi
- ; CHECK: DBG_VALUE _, i32 0, !12, !13
- ; CHECK-NEXT: DBG_VALUE _, i64 -22, !12, !13
- ; CHECK-NEXT: DBG_VALUE _, i128 123492148938512984928424384934328985928, !12, !13
+ ; CHECK: DBG_VALUE _, i32 0, !11, !12
+ ; CHECK-NEXT: DBG_VALUE _, i64 -22, !11, !12
+ ; CHECK-NEXT: DBG_VALUE _, i128 123492148938512984928424384934328985928, !11, !12
DBG_VALUE _, i32 0, !12, !13
DBG_VALUE _, i64 -22, !12, !13
DBG_VALUE _, i128 123492148938512984928424384934328985928, !12, !13
diff --git a/test/CodeGen/MIR/X86/invalid-constant-pool-item.mir b/test/CodeGen/MIR/X86/invalid-constant-pool-item.mir
index afd6c78546ce..0b1eb2f5275b 100644
--- a/test/CodeGen/MIR/X86/invalid-constant-pool-item.mir
+++ b/test/CodeGen/MIR/X86/invalid-constant-pool-item.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser reports an error when parsing an invalid
# constant pool item operand.
diff --git a/test/CodeGen/MIR/X86/invalid-metadata-node-type.mir b/test/CodeGen/MIR/X86/invalid-metadata-node-type.mir
index a6c2e509da0c..42d05274e7cd 100644
--- a/test/CodeGen/MIR/X86/invalid-metadata-node-type.mir
+++ b/test/CodeGen/MIR/X86/invalid-metadata-node-type.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
@@ -6,7 +6,7 @@
entry:
%x.i = alloca i8, align 1
%y.i = alloca [256 x i8], align 16
- %0 = bitcast [256 x i8]* %y.i to i8*
+ %0 = bitcast i8* %x.i to i8*
br label %for.body
for.body:
@@ -22,12 +22,12 @@
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3}
- !0 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: 0, enums: !2, retainedTypes: !2)
+ !0 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
!1 = !DIFile(filename: "t.c", directory: "")
!2 = !{}
!3 = !{i32 1, !"Debug Info Version", i32 3}
!4 = !DILocalVariable(name: "x", scope: !5, file: !1, line: 16, type: !6)
- !5 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false)
+ !5 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
!6 = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
!7 = !DIExpression()
!8 = !DILocation(line: 0, scope: !5)
diff --git a/test/CodeGen/MIR/X86/invalid-target-flag-name.mir b/test/CodeGen/MIR/X86/invalid-target-flag-name.mir
index 313c5bdafed8..1cc9bed2349f 100644
--- a/test/CodeGen/MIR/X86/invalid-target-flag-name.mir
+++ b/test/CodeGen/MIR/X86/invalid-target-flag-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/invalid-tied-def-index-error.mir b/test/CodeGen/MIR/X86/invalid-tied-def-index-error.mir
index 00436adca484..2ba3288335fb 100644
--- a/test/CodeGen/MIR/X86/invalid-tied-def-index-error.mir
+++ b/test/CodeGen/MIR/X86/invalid-tied-def-index-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i64 @test(i64 %x) #0 {
entry:
diff --git a/test/CodeGen/MIR/X86/jump-table-info.mir b/test/CodeGen/MIR/X86/jump-table-info.mir
index a4e6f6a1728c..e44f4b237df4 100644
--- a/test/CodeGen/MIR/X86/jump-table-info.mir
+++ b/test/CodeGen/MIR/X86/jump-table-info.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the jump table info and jump
# table operands correctly.
@@ -62,7 +62,7 @@ name: test_jumptable
# CHECK-NEXT: entries:
# CHECK-NEXT: - id: 0
# CHECK-NEXT: blocks: [ '%bb.3.lbl1', '%bb.4.lbl2', '%bb.5.lbl3', '%bb.6.lbl4' ]
-# CHECK_NEXT: body:
+# CHECK-NEXT: body:
jumpTable:
kind: label-difference32
entries:
diff --git a/test/CodeGen/MIR/X86/jump-table-redefinition-error.mir b/test/CodeGen/MIR/X86/jump-table-redefinition-error.mir
index d4ab11f40787..1eeabfba8124 100644
--- a/test/CodeGen/MIR/X86/jump-table-redefinition-error.mir
+++ b/test/CodeGen/MIR/X86/jump-table-redefinition-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/killed-register-flag.mir b/test/CodeGen/MIR/X86/killed-register-flag.mir
index 9e8f3ba3b368..159553ba4829 100644
--- a/test/CodeGen/MIR/X86/killed-register-flag.mir
+++ b/test/CodeGen/MIR/X86/killed-register-flag.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the 'killed' register flags
# correctly.
diff --git a/test/CodeGen/MIR/X86/large-cfi-offset-number-error.mir b/test/CodeGen/MIR/X86/large-cfi-offset-number-error.mir
index 93ce30abec7c..3339115c8bdf 100644
--- a/test/CodeGen/MIR/X86/large-cfi-offset-number-error.mir
+++ b/test/CodeGen/MIR/X86/large-cfi-offset-number-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/large-immediate-operand-error.mir b/test/CodeGen/MIR/X86/large-immediate-operand-error.mir
index f815c63e18e9..0d72690401d4 100644
--- a/test/CodeGen/MIR/X86/large-immediate-operand-error.mir
+++ b/test/CodeGen/MIR/X86/large-immediate-operand-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/large-index-number-error.mir b/test/CodeGen/MIR/X86/large-index-number-error.mir
index 272cd685b381..f8423fd43e14 100644
--- a/test/CodeGen/MIR/X86/large-index-number-error.mir
+++ b/test/CodeGen/MIR/X86/large-index-number-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/large-offset-number-error.mir b/test/CodeGen/MIR/X86/large-offset-number-error.mir
index 5463cdbce444..0b2225f15414 100644
--- a/test/CodeGen/MIR/X86/large-offset-number-error.mir
+++ b/test/CodeGen/MIR/X86/large-offset-number-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/large-size-in-memory-operand-error.mir b/test/CodeGen/MIR/X86/large-size-in-memory-operand-error.mir
index c570f0992a3f..616adfad1eda 100644
--- a/test/CodeGen/MIR/X86/large-size-in-memory-operand-error.mir
+++ b/test/CodeGen/MIR/X86/large-size-in-memory-operand-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/liveout-register-mask.mir b/test/CodeGen/MIR/X86/liveout-register-mask.mir
index 7ded7287060e..c2a5a34a85ca 100644
--- a/test/CodeGen/MIR/X86/liveout-register-mask.mir
+++ b/test/CodeGen/MIR/X86/liveout-register-mask.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after stackmap-liveness -stop-after stackmap-liveness -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the liveout register mask
# machine operands correctly.
diff --git a/test/CodeGen/MIR/X86/machine-basic-block-operands.mir b/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
index 0d7a9f8ef34c..f59157386796 100644
--- a/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
+++ b/test/CodeGen/MIR/X86/machine-basic-block-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses machine basic block operands.
--- |
diff --git a/test/CodeGen/MIR/X86/machine-instructions.mir b/test/CodeGen/MIR/X86/machine-instructions.mir
index 0e46d01e0bd1..28d4d47e3e7f 100644
--- a/test/CodeGen/MIR/X86/machine-instructions.mir
+++ b/test/CodeGen/MIR/X86/machine-instructions.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses X86 machine instructions
# correctly.
diff --git a/test/CodeGen/MIR/X86/machine-verifier.mir b/test/CodeGen/MIR/X86/machine-verifier.mir
index a7413d4d03bc..c56bab8c998c 100644
--- a/test/CodeGen/MIR/X86/machine-verifier.mir
+++ b/test/CodeGen/MIR/X86/machine-verifier.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser runs the machine verifier after parsing.
--- |
diff --git a/test/CodeGen/MIR/X86/memory-operands.mir b/test/CodeGen/MIR/X86/memory-operands.mir
index 3c9463d2f313..a25538d9b1fc 100644
--- a/test/CodeGen/MIR/X86/memory-operands.mir
+++ b/test/CodeGen/MIR/X86/memory-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the machine memory operands
# correctly.
@@ -186,6 +186,9 @@
%0 = load i8*, i8** undef, align 8
ret i8* %0
}
+
+ define void @dummy0() { ret void }
+ define void @dummy1() { ret void }
...
---
name: test
@@ -506,3 +509,28 @@ body: |
%rax = MOV64rm undef %rax, 1, _, 0, _ :: (load 8 from `i8** undef`)
RETQ %rax
...
+---
+# Test memory operand without associated value.
+# CHECK-LABEL: name: dummy0
+# CHECK: %rax = MOV64rm undef %rax, 1, _, 0, _ :: (load 8)
+name: dummy0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %rax = MOV64rm undef %rax, 1, _, 0, _ :: (load 8)
+ RETQ %rax
+...
+---
+# Test parsing of stack references in machine memory operands.
+# CHECK-LABEL: name: dummy1
+# CHECK: %rax = MOV64rm %rsp, 1, _, 0, _ :: (load 8 from %stack.0)
+name: dummy1
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 4, alignment: 4 }
+body: |
+ bb.0:
+ %rax = MOV64rm %rsp, 1, _, 0, _ :: (load 8 from %stack.0)
+ RETQ %rax
+
+...
diff --git a/test/CodeGen/MIR/X86/metadata-operands.mir b/test/CodeGen/MIR/X86/metadata-operands.mir
index 89a1e6fcb815..42f3fe1c86c7 100644
--- a/test/CodeGen/MIR/X86/metadata-operands.mir
+++ b/test/CodeGen/MIR/X86/metadata-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the metadata machine operands
# correctly.
@@ -22,11 +22,10 @@
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
- !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "test.ll", directory: "")
!2 = !{}
- !3 = !{!4}
- !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, variables: !2)
+ !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
!5 = !DIFile(filename: "test.c", directory: "")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
@@ -54,7 +53,7 @@ body: |
bb.0.entry:
liveins: %edi
; CHECK: %0 = COPY %edi
- ; CHECK-NEXT: DBG_VALUE _, 0, !12, !13
+ ; CHECK-NEXT: DBG_VALUE _, 0, !11, !12
%0 = COPY %edi
DBG_VALUE _, 0, !12, ! 13
MOV32mr %stack.0.x.addr, 1, _, 0, _, %0
diff --git a/test/CodeGen/MIR/X86/missing-closing-quote.mir b/test/CodeGen/MIR/X86/missing-closing-quote.mir
index 9f4b369a3df4..0e912f5ea788 100644
--- a/test/CodeGen/MIR/X86/missing-closing-quote.mir
+++ b/test/CodeGen/MIR/X86/missing-closing-quote.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/missing-comma.mir b/test/CodeGen/MIR/X86/missing-comma.mir
index 092995e59c70..0aaba6ddaa3a 100644
--- a/test/CodeGen/MIR/X86/missing-comma.mir
+++ b/test/CodeGen/MIR/X86/missing-comma.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/missing-implicit-operand.mir b/test/CodeGen/MIR/X86/missing-implicit-operand.mir
index 0135c756e138..fd26f19d1847 100644
--- a/test/CodeGen/MIR/X86/missing-implicit-operand.mir
+++ b/test/CodeGen/MIR/X86/missing-implicit-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser reports an error when an instruction
# is missing one of its implicit register operands.
diff --git a/test/CodeGen/MIR/X86/named-registers.mir b/test/CodeGen/MIR/X86/named-registers.mir
index e547c326563e..eedc2dbe853f 100644
--- a/test/CodeGen/MIR/X86/named-registers.mir
+++ b/test/CodeGen/MIR/X86/named-registers.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses X86 registers correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir
index bce06d540114..ce43a83ecae5 100644
--- a/test/CodeGen/MIR/X86/newline-handling.mir
+++ b/test/CodeGen/MIR/X86/newline-handling.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/null-register-operands.mir b/test/CodeGen/MIR/X86/null-register-operands.mir
index 5563ef8e8f75..9cba00bc9e5e 100644
--- a/test/CodeGen/MIR/X86/null-register-operands.mir
+++ b/test/CodeGen/MIR/X86/null-register-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses null register operands correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/register-mask-operands.mir b/test/CodeGen/MIR/X86/register-mask-operands.mir
index 9fa4e6e3994e..c683a635f147 100644
--- a/test/CodeGen/MIR/X86/register-mask-operands.mir
+++ b/test/CodeGen/MIR/X86/register-mask-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses register mask operands correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/register-operands-target-flag-error.mir b/test/CodeGen/MIR/X86/register-operands-target-flag-error.mir
index 64d46d20db74..d4d3f5692e90 100644
--- a/test/CodeGen/MIR/X86/register-operands-target-flag-error.mir
+++ b/test/CodeGen/MIR/X86/register-operands-target-flag-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/simple-register-allocation-hints.mir b/test/CodeGen/MIR/X86/simple-register-allocation-hints.mir
index d7e76329be73..27ca266f7794 100644
--- a/test/CodeGen/MIR/X86/simple-register-allocation-hints.mir
+++ b/test/CodeGen/MIR/X86/simple-register-allocation-hints.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after machine-scheduler -stop-after machine-scheduler -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses simple register allocation hints
# correctly.
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir
index b62cd755fec1..5e191ba11942 100644
--- a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-aliased.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir
index c89216bea67a..91288aa40b39 100644
--- a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-object-immutable.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir
index 7e13a26f0b68..1771d6fafcae 100644
--- a/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir
+++ b/test/CodeGen/MIR/X86/spill-slot-fixed-stack-objects.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses fixed stack objects correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/stack-object-debug-info.mir b/test/CodeGen/MIR/X86/stack-object-debug-info.mir
index 509b196416fd..d80b7d0bfcb1 100644
--- a/test/CodeGen/MIR/X86/stack-object-debug-info.mir
+++ b/test/CodeGen/MIR/X86/stack-object-debug-info.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the stack object's debug info
# correctly.
--- |
@@ -31,15 +31,18 @@
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3}
- !0 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: 0, enums: !2, retainedTypes: !2)
+ !0 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
!1 = !DIFile(filename: "t.c", directory: "")
!2 = !{}
!3 = !{i32 1, !"Debug Info Version", i32 3}
- !4 = !DILocalVariable(name: "x", scope: !5, file: !1, line: 16, type: !6)
- !5 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false)
+ !4 = !DILocalVariable(name: "x", scope: !5, file: !1, line: 16, type: !9)
+ !5 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
!6 = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
!7 = !DIExpression()
!8 = !DILocation(line: 0, scope: !5)
+ !9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 2048, align: 8, elements: !10)
+ !10 = !{!11}
+ !11 = !DISubrange(count: 256)
...
---
name: foo
@@ -50,7 +53,7 @@ frameInfo:
# CHECK-LABEL: foo
# CHECK: stack:
# CHECK: - { id: 0, name: y.i, offset: 0, size: 256, alignment: 16, di-variable: '!4',
-# CHECK-NEXT: di-expression: '!7', di-location: '!8' }
+# CHECK-NEXT: di-expression: '!10', di-location: '!11' }
stack:
- { id: 0, name: y.i, offset: 0, size: 256, alignment: 16, di-variable: '!4',
di-expression: '!7', di-location: '!8' }
diff --git a/test/CodeGen/MIR/X86/stack-object-invalid-name.mir b/test/CodeGen/MIR/X86/stack-object-invalid-name.mir
index e42e1e59f1e7..4572f106256d 100644
--- a/test/CodeGen/MIR/X86/stack-object-invalid-name.mir
+++ b/test/CodeGen/MIR/X86/stack-object-invalid-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser reports an error when it encounters a
# stack object with a name that can't be associated with an alloca instruction.
diff --git a/test/CodeGen/MIR/X86/stack-object-operand-name-mismatch-error.mir b/test/CodeGen/MIR/X86/stack-object-operand-name-mismatch-error.mir
index 46661d95e727..2115a11ae693 100644
--- a/test/CodeGen/MIR/X86/stack-object-operand-name-mismatch-error.mir
+++ b/test/CodeGen/MIR/X86/stack-object-operand-name-mismatch-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when an stack object reference
# uses a different name then the stack object definition.
diff --git a/test/CodeGen/MIR/X86/stack-object-operands.mir b/test/CodeGen/MIR/X86/stack-object-operands.mir
index fce5bf717d1a..6ff15aef4d7f 100644
--- a/test/CodeGen/MIR/X86/stack-object-operands.mir
+++ b/test/CodeGen/MIR/X86/stack-object-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses stack object machine operands
# correctly.
diff --git a/test/CodeGen/MIR/X86/stack-object-redefinition-error.mir b/test/CodeGen/MIR/X86/stack-object-redefinition-error.mir
index b84863ebca67..0fccff0425ee 100644
--- a/test/CodeGen/MIR/X86/stack-object-redefinition-error.mir
+++ b/test/CodeGen/MIR/X86/stack-object-redefinition-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/stack-objects.mir b/test/CodeGen/MIR/X86/stack-objects.mir
index bdd911075da0..08b9ec0b4347 100644
--- a/test/CodeGen/MIR/X86/stack-objects.mir
+++ b/test/CodeGen/MIR/X86/stack-objects.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses stack objects correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/standalone-register-error.mir b/test/CodeGen/MIR/X86/standalone-register-error.mir
index f17451bfc89c..b50393390289 100644
--- a/test/CodeGen/MIR/X86/standalone-register-error.mir
+++ b/test/CodeGen/MIR/X86/standalone-register-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @test(i32 %a) {
body:
diff --git a/test/CodeGen/MIR/X86/subreg-on-physreg.mir b/test/CodeGen/MIR/X86/subreg-on-physreg.mir
new file mode 100644
index 000000000000..f20195e7ddf5
--- /dev/null
+++ b/test/CodeGen/MIR/X86/subreg-on-physreg.mir
@@ -0,0 +1,12 @@
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that an error is reported for subreg index on a physreg.
+--- |
+ define void @t() { ret void }
+...
+---
+name: t
+body: |
+ bb.0:
+ ; CHECK: [[@LINE+1]]:19: subregister index expects a virtual register
+ %eax:sub_8bit = COPY %bl
+...
diff --git a/test/CodeGen/MIR/X86/subregister-index-operands.mir b/test/CodeGen/MIR/X86/subregister-index-operands.mir
new file mode 100644
index 000000000000..a9a45adadf6e
--- /dev/null
+++ b/test/CodeGen/MIR/X86/subregister-index-operands.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
+# This test ensures that the MIR parser parses and prints subregisters index
+# operands correctly.
+
+--- |
+
+ define zeroext i1 @t(i1 %c) {
+ entry:
+ ret i1 %c
+ }
+
+...
+---
+# CHECK-LABEL: name: t
+# CHECK: %0 = INSERT_SUBREG %edi, %al, {{[0-9]+}}
+# CHECK: %1 = EXTRACT_SUBREG %eax, {{[0-9]+}}
+# CHECK: %ax = REG_SEQUENCE %1, {{[0-9]+}}, %1, {{[0-9]+}}
+name: t
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr8 }
+body: |
+ bb.0.entry:
+ liveins: %edi, %eax
+ %0 = INSERT_SUBREG %edi, %al, %subreg.sub_8bit
+ %1 = EXTRACT_SUBREG %eax, %subreg.sub_8bit_hi
+ %ax = REG_SEQUENCE %1, %subreg.sub_8bit, %1, %subreg.sub_8bit_hi
+ RETQ %ax
+...
+
diff --git a/test/CodeGen/MIR/X86/subregister-operands.mir b/test/CodeGen/MIR/X86/subregister-operands.mir
index 8a3fcf69aca6..a02bfe8359dc 100644
--- a/test/CodeGen/MIR/X86/subregister-operands.mir
+++ b/test/CodeGen/MIR/X86/subregister-operands.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses subregisters in register operands
# correctly.
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
index 64af6121189a..8de31f3274df 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses basic block successors and
# probabilities correctly.
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
index a6c14f70bc7c..6f15f522bd5f 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses basic block successors correctly.
--- |
diff --git a/test/CodeGen/MIR/X86/tied-def-operand-invalid.mir b/test/CodeGen/MIR/X86/tied-def-operand-invalid.mir
index fe5263df355f..05502fff4062 100644
--- a/test/CodeGen/MIR/X86/tied-def-operand-invalid.mir
+++ b/test/CodeGen/MIR/X86/tied-def-operand-invalid.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i64 @test(i64 %x) #0 {
entry:
diff --git a/test/CodeGen/MIR/X86/undef-register-flag.mir b/test/CodeGen/MIR/X86/undef-register-flag.mir
index 0b26c528aee1..2c332d848bbc 100644
--- a/test/CodeGen/MIR/X86/undef-register-flag.mir
+++ b/test/CodeGen/MIR/X86/undef-register-flag.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the 'undef' register flags
# correctly.
diff --git a/test/CodeGen/MIR/X86/undefined-fixed-stack-object.mir b/test/CodeGen/MIR/X86/undefined-fixed-stack-object.mir
index 8d8f8614f32b..18cb758408ff 100644
--- a/test/CodeGen/MIR/X86/undefined-fixed-stack-object.mir
+++ b/test/CodeGen/MIR/X86/undefined-fixed-stack-object.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @test(i32 %a) {
diff --git a/test/CodeGen/MIR/X86/undefined-global-value.mir b/test/CodeGen/MIR/X86/undefined-global-value.mir
index f82c626397a9..e717c1ee5976 100644
--- a/test/CodeGen/MIR/X86/undefined-global-value.mir
+++ b/test/CodeGen/MIR/X86/undefined-global-value.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when an invalid global value index
# is used.
diff --git a/test/CodeGen/MIR/X86/undefined-ir-block-in-blockaddress.mir b/test/CodeGen/MIR/X86/undefined-ir-block-in-blockaddress.mir
index f6b10e3123ca..5c2a45eec2a1 100644
--- a/test/CodeGen/MIR/X86/undefined-ir-block-in-blockaddress.mir
+++ b/test/CodeGen/MIR/X86/undefined-ir-block-in-blockaddress.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/undefined-ir-block-slot-in-blockaddress.mir b/test/CodeGen/MIR/X86/undefined-ir-block-slot-in-blockaddress.mir
index 0b3c0093dc62..ef7dd4802aca 100644
--- a/test/CodeGen/MIR/X86/undefined-ir-block-slot-in-blockaddress.mir
+++ b/test/CodeGen/MIR/X86/undefined-ir-block-slot-in-blockaddress.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/undefined-jump-table-id.mir b/test/CodeGen/MIR/X86/undefined-jump-table-id.mir
index b463dc4bd9f4..765bb9d97d19 100644
--- a/test/CodeGen/MIR/X86/undefined-jump-table-id.mir
+++ b/test/CodeGen/MIR/X86/undefined-jump-table-id.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/undefined-named-global-value.mir b/test/CodeGen/MIR/X86/undefined-named-global-value.mir
index a1ada4b42e46..435257b8fac1 100644
--- a/test/CodeGen/MIR/X86/undefined-named-global-value.mir
+++ b/test/CodeGen/MIR/X86/undefined-named-global-value.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when an undefined global value is
# used.
diff --git a/test/CodeGen/MIR/X86/undefined-register-class.mir b/test/CodeGen/MIR/X86/undefined-register-class.mir
index 348f6af5c44f..70b413b5ad37 100644
--- a/test/CodeGen/MIR/X86/undefined-register-class.mir
+++ b/test/CodeGen/MIR/X86/undefined-register-class.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser reports an error when it encounters an
# unknown register class.
@@ -15,7 +15,7 @@ name: test
isSSA: true
tracksRegLiveness: true
registers:
- # CHECK: [[@LINE+1]]:20: use of undefined register class 'gr3200'
+ # CHECK: [[@LINE+1]]:20: use of undefined register class or register bank 'gr3200'
- {id: 0, class: 'gr3200'}
body: |
bb.0.entry:
diff --git a/test/CodeGen/MIR/X86/undefined-stack-object.mir b/test/CodeGen/MIR/X86/undefined-stack-object.mir
index 416e6789ba0f..5d40791b4c31 100644
--- a/test/CodeGen/MIR/X86/undefined-stack-object.mir
+++ b/test/CodeGen/MIR/X86/undefined-stack-object.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @test(i32 %a) {
diff --git a/test/CodeGen/MIR/X86/undefined-value-in-memory-operand.mir b/test/CodeGen/MIR/X86/undefined-value-in-memory-operand.mir
index a3907d7a3a4a..42e94c1ce5a3 100644
--- a/test/CodeGen/MIR/X86/undefined-value-in-memory-operand.mir
+++ b/test/CodeGen/MIR/X86/undefined-value-in-memory-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/undefined-virtual-register.mir b/test/CodeGen/MIR/X86/undefined-virtual-register.mir
index 2f9a304ffe5c..fe41e0a4d2fc 100644
--- a/test/CodeGen/MIR/X86/undefined-virtual-register.mir
+++ b/test/CodeGen/MIR/X86/undefined-virtual-register.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that the MIR parser reports an error when parsing a
# reference to an undefined virtual register.
diff --git a/test/CodeGen/MIR/X86/unknown-instruction.mir b/test/CodeGen/MIR/X86/unknown-instruction.mir
index cec354948832..4377347f0a9f 100644
--- a/test/CodeGen/MIR/X86/unknown-instruction.mir
+++ b/test/CodeGen/MIR/X86/unknown-instruction.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when an unknown instruction is
# encountered.
diff --git a/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir b/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
index a512d9aa08e6..0634fb2e0ed8 100644
--- a/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
+++ b/test/CodeGen/MIR/X86/unknown-machine-basic-block.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when an invalid machine basic
# block index is used.
diff --git a/test/CodeGen/MIR/X86/unknown-metadata-keyword.mir b/test/CodeGen/MIR/X86/unknown-metadata-keyword.mir
index c58c38ab1322..ddd5686b0cf3 100644
--- a/test/CodeGen/MIR/X86/unknown-metadata-keyword.mir
+++ b/test/CodeGen/MIR/X86/unknown-metadata-keyword.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @inc(i32* %x) {
diff --git a/test/CodeGen/MIR/X86/unknown-metadata-node.mir b/test/CodeGen/MIR/X86/unknown-metadata-node.mir
index 958a30678be1..793f9123776a 100644
--- a/test/CodeGen/MIR/X86/unknown-metadata-node.mir
+++ b/test/CodeGen/MIR/X86/unknown-metadata-node.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
@@ -20,11 +20,10 @@
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
- !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "test.ll", directory: "")
!2 = !{}
- !3 = !{!4}
- !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, variables: !2)
+ !4 = distinct !DISubprogram(name: "test", scope: !5, file: !5, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
!5 = !DIFile(filename: "test.c", directory: "")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
diff --git a/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir b/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
index 6627273d4470..5ba6402353ef 100644
--- a/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
+++ b/test/CodeGen/MIR/X86/unknown-named-machine-basic-block.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when an unknown named machine
# basic block is encountered.
diff --git a/test/CodeGen/MIR/X86/unknown-register.mir b/test/CodeGen/MIR/X86/unknown-register.mir
index da0798ca1b52..74e9bfa72157 100644
--- a/test/CodeGen/MIR/X86/unknown-register.mir
+++ b/test/CodeGen/MIR/X86/unknown-register.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when an unknown register is
# encountered.
diff --git a/test/CodeGen/MIR/X86/unknown-subregister-index-op.mir b/test/CodeGen/MIR/X86/unknown-subregister-index-op.mir
new file mode 100644
index 000000000000..2d997b07dbd5
--- /dev/null
+++ b/test/CodeGen/MIR/X86/unknown-subregister-index-op.mir
@@ -0,0 +1,26 @@
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that an error is reported when an unknown subregister index
+# is encountered.
+
+--- |
+
+ define zeroext i1 @t(i1 %c) {
+ entry:
+ ret i1 %c
+ }
+
+...
+---
+name: t
+isSSA: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: gr8 }
+ - { id: 2, class: gr8 }
+body: |
+ bb.0.entry:
+ ; CHECK: [[@LINE+1]]:35: unknown subregister index 'bit8'
+ %0 = INSERT_SUBREG %edi, %al, %subreg.bit8
+ RETQ %0
+...
diff --git a/test/CodeGen/MIR/X86/unknown-subregister-index.mir b/test/CodeGen/MIR/X86/unknown-subregister-index.mir
index 5dde34561236..6ad6242f79a3 100644
--- a/test/CodeGen/MIR/X86/unknown-subregister-index.mir
+++ b/test/CodeGen/MIR/X86/unknown-subregister-index.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
# This test ensures that an error is reported when an unknown subregister index
# is encountered.
diff --git a/test/CodeGen/MIR/X86/unrecognized-character.mir b/test/CodeGen/MIR/X86/unrecognized-character.mir
index cf99028677fa..4b6631099716 100644
--- a/test/CodeGen/MIR/X86/unrecognized-character.mir
+++ b/test/CodeGen/MIR/X86/unrecognized-character.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/used-physical-register-info.mir b/test/CodeGen/MIR/X86/used-physical-register-info.mir
index 9a81578703e0..9edc4113b279 100644
--- a/test/CodeGen/MIR/X86/used-physical-register-info.mir
+++ b/test/CodeGen/MIR/X86/used-physical-register-info.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses the callee saved register mask
# correctly and that the MIR parser can infer it as well.
diff --git a/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir b/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir
index e6a9ef8d4c88..2633ea59bf55 100644
--- a/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir
+++ b/test/CodeGen/MIR/X86/variable-sized-stack-object-size-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
define i32 @test(i32 %a) {
diff --git a/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir b/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir
index a58be69ae046..5e7d99352e57 100644
--- a/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir
+++ b/test/CodeGen/MIR/X86/variable-sized-stack-objects.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses variable sized stack objects
# correctly.
diff --git a/test/CodeGen/MIR/X86/virtual-register-redefinition-error.mir b/test/CodeGen/MIR/X86/virtual-register-redefinition-error.mir
index 5dae6e666c83..4d2350a01b8d 100644
--- a/test/CodeGen/MIR/X86/virtual-register-redefinition-error.mir
+++ b/test/CodeGen/MIR/X86/virtual-register-redefinition-error.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
--- |
diff --git a/test/CodeGen/MIR/X86/virtual-registers.mir b/test/CodeGen/MIR/X86/virtual-registers.mir
index 93c2fea6fd95..3f7b0fdcc0e3 100644
--- a/test/CodeGen/MIR/X86/virtual-registers.mir
+++ b/test/CodeGen/MIR/X86/virtual-registers.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -start-after machine-sink -stop-after machine-sink -o /dev/null %s | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
# This test ensures that the MIR parser parses virtual register definitions and
# references correctly.
diff --git a/test/CodeGen/MIR/lit.local.cfg b/test/CodeGen/MIR/lit.local.cfg
deleted file mode 100644
index e69aa5765356..000000000000
--- a/test/CodeGen/MIR/lit.local.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-config.suffixes = ['.mir']
-
diff --git a/test/CodeGen/MSP430/spill-to-stack.ll b/test/CodeGen/MSP430/spill-to-stack.ll
new file mode 100644
index 000000000000..d925bc91b72d
--- /dev/null
+++ b/test/CodeGen/MSP430/spill-to-stack.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=msp430 < %s
+%VeryLarge = type { i8, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+
+; intentionally cause a spill
+define void @inc(%VeryLarge* byval align 1 %s) {
+entry:
+ %p0 = getelementptr inbounds %VeryLarge, %VeryLarge* %s, i32 0, i32 0
+ %0 = load i8, i8* %p0
+ %p1 = getelementptr inbounds %VeryLarge, %VeryLarge* %s, i32 0, i32 1
+ %1 = load i32, i32* %p1
+ %p2 = getelementptr inbounds %VeryLarge, %VeryLarge* %s, i32 0, i32 2
+ %2 = load i32, i32* %p2
+ %p3 = getelementptr inbounds %VeryLarge, %VeryLarge* %s, i32 0, i32 3
+ %3 = load i32, i32* %p3
+ %p4 = getelementptr inbounds %VeryLarge, %VeryLarge* %s, i32 0, i32 4
+ %4 = load i32, i32* %p4
+ %p5 = getelementptr inbounds %VeryLarge, %VeryLarge* %s, i32 0, i32 5
+ %5 = load i32, i32* %p5
+ %p6 = getelementptr inbounds %VeryLarge, %VeryLarge* %s, i32 0, i32 6
+ %6 = load i32, i32* %p6
+ %p7 = getelementptr inbounds %VeryLarge, %VeryLarge* %s, i32 0, i32 7
+ %7 = load i32, i32* %p7
+ %add = add i8 %0, 1
+ store i8 %add, i8* %p0
+ %add2 = add i32 %1, 2
+ store i32 %add2, i32* %p1
+ %add3 = add i32 %2, 3
+ store i32 %add3, i32* %p2
+ %add4 = add i32 %3, 4
+ store i32 %add4, i32* %p3
+ %add5 = add i32 %4, 5
+ store i32 %add5, i32* %p4
+ %add6 = add i32 %5, 6
+ store i32 %add6, i32* %p5
+ %add7 = add i32 %6, 7
+ store i32 %add7, i32* %p6
+ %add8 = add i32 %7, 8
+ store i32 %add8, i32* %p7
+ ret void
+}
diff --git a/test/CodeGen/Mips/2010-07-20-Switch.ll b/test/CodeGen/Mips/2010-07-20-Switch.ll
index fd0254e9f5ec..7d66d1a1a204 100644
--- a/test/CodeGen/Mips/2010-07-20-Switch.ll
+++ b/test/CodeGen/Mips/2010-07-20-Switch.ll
@@ -55,19 +55,19 @@ bb5: ; preds = %entry
ret i32 1
}
-; STATIC-O32: .align 2
+; STATIC-O32: .p2align 2
; STATIC-O32: $JTI0_0:
; STATIC-O32: .4byte
; STATIC-O32: .4byte
; STATIC-O32: .4byte
; STATIC-O32: .4byte
-; PIC-O32: .align 2
+; PIC-O32: .p2align 2
; PIC-O32: $JTI0_0:
; PIC-O32: .gpword
; PIC-O32: .gpword
; PIC-O32: .gpword
; PIC-O32: .gpword
-; N64: .align 3
+; N64: .p2align 3
; N64: $JTI0_0:
; N64: .gpdword
; N64: .gpdword
diff --git a/test/CodeGen/Mips/Fast-ISel/callabi.ll b/test/CodeGen/Mips/Fast-ISel/callabi.ll
index 34616a50b1a0..9988622db54f 100644
--- a/test/CodeGen/Mips/Fast-ISel/callabi.ll
+++ b/test/CodeGen/Mips/Fast-ISel/callabi.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=mipsel -mcpu=mips32 -O0 \
-; RUN: -relocation-model=pic -fast-isel-abort=1 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1
-; RUN: llc -march=mipsel -mcpu=mips32r2 -O0 \
-; RUN: -relocation-model=pic -fast-isel-abort=1 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2
+; RUN: llc -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \
+; RUN: -fast-isel-abort=1 -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R1
+; RUN: llc -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \
+; RUN: -fast-isel-abort=1 -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R2
declare void @xb(i8)
diff --git a/test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll b/test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll
index 203e5a7e1595..290e4ecb7405 100644
--- a/test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll
+++ b/test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll
@@ -1,20 +1,22 @@
; RUN: llc -march=mips -mcpu=mips2 -O0 -relocation-model=pic \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
-; RUN: llc -march=mips -mcpu=mips3 -O0 -relocation-model=pic \
+; RUN: llc -march=mips -mcpu=mips3 -O0 -relocation-model=pic -target-abi n64 \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
-; RUN: llc -march=mips -mcpu=mips4 -O0 -relocation-model=pic \
+; RUN: llc -march=mips -mcpu=mips4 -O0 -relocation-model=pic -target-abi n64 \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
; RUN: llc -march=mips -mcpu=mips32r6 -O0 -relocation-model=pic \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+micromips -O0 -relocation-model=pic \
+; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
-; RUN: llc -march=mips -mcpu=mips64 -O0 -relocation-model=pic \
+; RUN: llc -march=mips -mcpu=mips64 -O0 -relocation-model=pic -target-abi n64 \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
-; RUN: llc -march=mips -mcpu=mips64r2 -O0 -relocation-model=pic \
+; RUN: llc -march=mips -mcpu=mips64r2 -O0 -relocation-model=pic -target-abi n64 \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
-; RUN: llc -march=mips -mcpu=mips64r3 -O0 -relocation-model=pic \
+; RUN: llc -march=mips -mcpu=mips64r3 -O0 -relocation-model=pic -target-abi n64 \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
-; RUN: llc -march=mips -mcpu=mips64r5 -O0 -relocation-model=pic \
+; RUN: llc -march=mips -mcpu=mips64r5 -O0 -relocation-model=pic -target-abi n64 \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
; RUN: llc -march=mips -mcpu=mips32r6 -O0 -relocation-model=pic \
; RUN: -fast-isel-verbose <%s 2>&1 | FileCheck %s
diff --git a/test/CodeGen/Mips/Fast-ISel/div1.ll b/test/CodeGen/Mips/Fast-ISel/div1.ll
index 89055aa12805..b0865e649d96 100644
--- a/test/CodeGen/Mips/Fast-ISel/div1.ll
+++ b/test/CodeGen/Mips/Fast-ISel/div1.ll
@@ -22,7 +22,7 @@ define void @divs() {
; CHECK-DAG: lw $[[J:[0-9]+]], 0($[[J_ADDR]])
; CHECK-DAG: lw $[[K:[0-9]+]], 0($[[K_ADDR]])
; CHECK-DAG: div $zero, $[[J]], $[[K]]
- ; CHECK_DAG: teq $[[K]], $zero, 7
+ ; CHECK-DAG: teq $[[K]], $zero, 7
; CHECK-DAG: mflo $[[RESULT:[0-9]+]]
; CHECK: sw $[[RESULT]], 0($[[I_ADDR]])
%1 = load i32, i32* @sj, align 4
@@ -44,7 +44,7 @@ define void @divu() {
; CHECK-DAG: lw $[[J:[0-9]+]], 0($[[J_ADDR]])
; CHECK-DAG: lw $[[K:[0-9]+]], 0($[[K_ADDR]])
; CHECK-DAG: divu $zero, $[[J]], $[[K]]
- ; CHECK_DAG: teq $[[K]], $zero, 7
+ ; CHECK-DAG: teq $[[K]], $zero, 7
; CHECK-DAG: mflo $[[RESULT:[0-9]+]]
; CHECK: sw $[[RESULT]], 0($[[I_ADDR]])
%1 = load i32, i32* @uj, align 4
diff --git a/test/CodeGen/Mips/Fast-ISel/fastalloca.ll b/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
index 00bc7f485e08..9c91567eabf0 100644
--- a/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
+++ b/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \
-; RUN: < %s | FileCheck %s
+; RUN: < %s -verify-machineinstrs | FileCheck %s
%struct.x = type { i32 }
diff --git a/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll b/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll
index e346acfeff13..d661a281ea1d 100644
--- a/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll
+++ b/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32r2 \
-; RUN: < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=mipsel -relocation-model=pic -O0 -fast-isel-abort=1 -mcpu=mips32 \
-; RUN: < %s | FileCheck %s
+; RUN: -verify-machineinstrs < %s | FileCheck %s
@f1 = common global float 0.000000e+00, align 4
@f2 = common global float 0.000000e+00, align 4
diff --git a/test/CodeGen/Mips/Fast-ISel/memtest1.ll b/test/CodeGen/Mips/Fast-ISel/memtest1.ll
index b98200d7456d..7deb5c08ec69 100644
--- a/test/CodeGen/Mips/Fast-ISel/memtest1.ll
+++ b/test/CodeGen/Mips/Fast-ISel/memtest1.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \
-; RUN: -fast-isel-abort=1 | FileCheck %s \
+; RUN: -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s \
; RUN: -check-prefix=ALL -check-prefix=32R1
; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \
-; RUN: -fast-isel-abort=1 | FileCheck %s \
+; RUN: -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s \
; RUN: -check-prefix=ALL -check-prefix=32R2
@str = private unnamed_addr constant [12 x i8] c"hello there\00", align 1
diff --git a/test/CodeGen/Mips/Fast-ISel/rem1.ll b/test/CodeGen/Mips/Fast-ISel/rem1.ll
index cf709e7e4954..a5cc24361e6a 100644
--- a/test/CodeGen/Mips/Fast-ISel/rem1.ll
+++ b/test/CodeGen/Mips/Fast-ISel/rem1.ll
@@ -22,7 +22,7 @@ define void @rems() {
; CHECK-DAG: lw $[[J:[0-9]+]], 0($[[J_ADDR]])
; CHECK-DAG: lw $[[K:[0-9]+]], 0($[[K_ADDR]])
; CHECK-DAG: div $zero, $[[J]], $[[K]]
- ; CHECK_DAG: teq $[[K]], $zero, 7
+ ; CHECK-DAG: teq $[[K]], $zero, 7
; CHECK-DAG: mfhi $[[RESULT:[0-9]+]]
; CHECK: sw $[[RESULT]], 0($[[I_ADDR]])
%1 = load i32, i32* @sj, align 4
@@ -45,7 +45,7 @@ define void @remu() {
; CHECK-DAG: lw $[[J:[0-9]+]], 0($[[J_ADDR]])
; CHECK-DAG: lw $[[K:[0-9]+]], 0($[[K_ADDR]])
; CHECK-DAG: divu $zero, $[[J]], $[[K]]
- ; CHECK_DAG: teq $[[K]], $zero, 7
+ ; CHECK-DAG: teq $[[K]], $zero, 7
; CHECK-DAG: mfhi $[[RESULT:[0-9]+]]
; CHECK: sw $[[RESULT]], 0($[[I_ADDR]])
%1 = load i32, i32* @uj, align 4
diff --git a/test/CodeGen/Mips/Fast-ISel/shift.ll b/test/CodeGen/Mips/Fast-ISel/shift.ll
index 9fe694bb5827..651fb6ad1f5a 100644
--- a/test/CodeGen/Mips/Fast-ISel/shift.ll
+++ b/test/CodeGen/Mips/Fast-ISel/shift.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=mipsel -mcpu=mips32r2 -O0 -fast-isel=true -filetype=obj %s -o - \
-; RUN: | llvm-objdump -arch mipsel -mcpu=mips32r2 -d - | FileCheck %s
+; RUN: | llvm-objdump -d - | FileCheck %s
; This test checks that encoding for srl is correct when fast-isel for mips32r2 is used.
diff --git a/test/CodeGen/Mips/abicalls.ll b/test/CodeGen/Mips/abicalls.ll
index 7edc3e25c352..26bbab40b3b3 100644
--- a/test/CodeGen/Mips/abicalls.ll
+++ b/test/CodeGen/Mips/abicalls.ll
@@ -1,7 +1,7 @@
-; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=STATIC %s
-; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
-; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
-; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefixes=ABICALLS,STATIC %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=pic %s -o - | FileCheck -check-prefixes=ABICALLS,PIC %s
+; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefixes=ABICALLS,PIC %s
+; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefixes=ABICALLS,PIC %s
; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr noabicalls -relocation-model=static %s -o - | FileCheck -implicit-check-not='.abicalls' -implicit-check-not='pic0' %s
diff --git a/test/CodeGen/Mips/adjust-callstack-sp.ll b/test/CodeGen/Mips/adjust-callstack-sp.ll
index e4afcd835005..32d77ac19ae6 100644
--- a/test/CodeGen/Mips/adjust-callstack-sp.ll
+++ b/test/CodeGen/Mips/adjust-callstack-sp.ll
@@ -2,18 +2,18 @@
; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s -check-prefix=GP32
; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefix=GP32
; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips3 | FileCheck %s -check-prefix=GP64
-; RUN: llc < %s -march=mips -mcpu=mips64 | FileCheck %s -check-prefix=GP64
-; RUN: llc < %s -march=mips -mcpu=mips64r6 | FileCheck %s -check-prefix=GP64
+; RUN: llc < %s -march=mips -mcpu=mips3 -target-abi n64 | FileCheck %s -check-prefix=GP64
+; RUN: llc < %s -march=mips -mcpu=mips64 -target-abi n64 | FileCheck %s -check-prefix=GP64
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 | FileCheck %s -check-prefix=GP64
declare void @bar(i32*)
define void @foo(i32 %sz) {
; ALL-LABEL: foo:
- ; M16-NOT: addiu $sp, 0 # 16 bit inst
- ; GP32-NOT: addiu $sp, $sp, 0
- ; GP64-NOT: daddiu $sp, $sp, 0
+ ; M16-NOT: addiu $sp, 0 # 16 bit inst
+ ; GP32-NOT: addiu $sp, $sp, 0
+ ; GP64-NOT: daddiu $sp, $sp, 0
%a = alloca i32, i32 %sz
call void @bar(i32* %a)
ret void
diff --git a/test/CodeGen/Mips/alloca.ll b/test/CodeGen/Mips/alloca.ll
index 747a1362161d..b708ddb134ce 100644
--- a/test/CodeGen/Mips/alloca.ll
+++ b/test/CodeGen/Mips/alloca.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
define i32 @twoalloca(i32 %size) nounwind {
entry:
diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll
index d5ecaaeddc33..377fe9327e0e 100644
--- a/test/CodeGen/Mips/analyzebranch.ll
+++ b/test/CodeGen/Mips/analyzebranch.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
-; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
-; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=32-GPR
-; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
-; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
-; RUN: llc -march=mips64 -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
-; RUN: llc -march=mips64 -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=64-GPR
+; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefixes=ALL,FCC
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefixes=ALL,FCC
+; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefixes=ALL,GPR,32-GPR
+; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefixes=ALL,FCC
+; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefixes=ALL,FCC
+; RUN: llc -march=mips64 -mcpu=mips64r2 < %s | FileCheck %s -check-prefixes=ALL,FCC
+; RUN: llc -march=mips64 -mcpu=mips64r6 < %s | FileCheck %s -check-prefixes=ALL,GPR,64-GPR
define double @foo(double %a, double %b) nounwind readnone {
entry:
@@ -19,7 +19,7 @@ entry:
; GPR: cmp.lt.d $[[FGRCC:f[0-9]+]], $[[Z]], $f12
; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]]
; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
-; GPR: bnez $[[GPRCC]], $BB
+; GPR: bnezc $[[GPRCC]], $BB
%cmp = fcmp ogt double %a, 0.000000e+00
br i1 %cmp, label %if.end6, label %if.else
@@ -50,7 +50,8 @@ entry:
; GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $[[Z]]
; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]]
; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
-; GPR: beqz $[[GPRCC]], $BB
+; 64-GPR beqzc $[[GPRCC]], $BB
+; 32-GPR beqz $[[GPRCC]], $BB
%cmp = fcmp une float %f, 0.000000e+00
br i1 %cmp, label %if.then, label %if.end
diff --git a/test/CodeGen/Mips/assertzext-trunc.ll b/test/CodeGen/Mips/assertzext-trunc.ll
new file mode 100644
index 000000000000..2295727834eb
--- /dev/null
+++ b/test/CodeGen/Mips/assertzext-trunc.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | \
+; RUN: FileCheck %s -check-prefixes=ALL,PRE-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | \
+; RUN: FileCheck %s -check-prefixes=ALL,PRE-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | \
+; RUN: FileCheck %s -check-prefixes=ALL,PRE-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | \
+; RUN: FileCheck %s -check-prefixes=ALL,PRE-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | \
+; RUN: FileCheck %s -check-prefixes=ALL,PRE-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | \
+; RUN: FileCheck %s -check-prefixes=ALL,PRE-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | \
+; RUN: FileCheck %s -check-prefixes=ALL,R6
+
+; Check that we don't emit redundant SLLs for sequences of
+; (AssertZext:i32 (trunc:i32 (AssertZext:i64 X, i32)), i8)
+define zeroext i8 @udiv_i8(i8 zeroext %a, i8 zeroext %b) {
+entry:
+; ALL-LABEL: udiv_i8:
+
+ ; PRE-R6-NOT: sll {{.*}}
+ ; PRE-R6: divu $zero, $4, $5
+ ; PRE-R6: teq $5, $zero, 7
+ ; PRE-R6: mflo $2
+
+ ; R6-NOT: sll {{.*}}
+ ; R6: divu $2, $4, $5
+ ; R6: teq $5, $zero, 7
+
+ %r = udiv i8 %a, %b
+ ret i8 %r
+}
+
+; Check that we do sign-extend when we have a (trunc:i32 (AssertZext:i64 X, i32))
+define i64 @foo1(i64 zeroext %var) {
+entry:
+; ALL-LABEL: foo1:
+
+ %shr = lshr i64 %var, 32
+ %cmp = icmp eq i64 %shr, 0
+ br i1 %cmp, label %if.end6, label %if.then
+
+ ; ALL: dsrl $[[T0:[0-9]+]], $4, 32
+ ; ALL: sll $[[T1:[0-9]+]], $[[T0]], 0
+ if.then: ; preds = %entry
+ %conv = trunc i64 %shr to i32
+ %cmp2 = icmp slt i32 %conv, 0
+ br i1 %cmp2, label %if.then4, label %if.else
+
+ if.then4: ; preds = %if.then
+ %add = add i64 %var, 16
+ br label %if.end6
+
+ if.else: ; preds = %if.then
+ %add5 = add i64 %var, 32
+ br label %if.end6
+
+ if.end6: ; preds = %entry, %if.then4, %if.else
+ %var.addr.0 = phi i64 [ %add, %if.then4 ], [ %add5, %if.else ], [ %var, %entry ]
+ ret i64 %var.addr.0
+}
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 031cce0b6074..8f4ccb19958a 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -1,15 +1,26 @@
-; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
-; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
-; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
-; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
-; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
-; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
-; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=NOT-MICROMIPS
-; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 -mattr=micromips < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL -check-prefix=MICROMIPS
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS32-ANY,NO-SEB-SEH,CHECK-EL,NOT-MICROMIPS
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 -relocation-model=pic -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS32-ANY,HAS-SEB-SEH,CHECK-EL,NOT-MICROMIPS
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r6 -relocation-model=pic -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS32-ANY,HAS-SEB-SEH,CHECK-EL,MIPSR6
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips4 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS64-ANY,NO-SEB-SEH,CHECK-EL,NOT-MICROMIPS
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS64-ANY,NO-SEB-SEH,CHECK-EL,NOT-MICROMIPS
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r2 -relocation-model=pic -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS64-ANY,HAS-SEB-SEH,CHECK-EL,NOT-MICROMIPS
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r6 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS64-ANY,HAS-SEB-SEH,CHECK-EL,MIPSR6
+; RUN: llc -march=mips64 -O0 -mcpu=mips64r6 -relocation-model=pic -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL-LABEL,MIPS64-ANY,O0
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 -mattr=micromips -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS32-ANY,HAS-SEB-SEH,CHECK-EL,MICROMIPS
; Keep one big-endian check so that we don't reduce testing, but don't add more
; since endianness doesn't affect the body of the atomic operations.
-; RUN: llc -march=mips --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EB -check-prefix=NOT-MICROMIPS
+; RUN: llc -march=mips --disable-machine-licm -mcpu=mips32 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MIPS32-ANY,NO-SEB-SEH,CHECK-EB,NOT-MICROMIPS
@x = common global i32 0, align 4
@@ -23,12 +34,17 @@ entry:
; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x)
; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)(
+; O0: $[[BB0:[A-Z_0-9]+]]:
+; O0: ld $[[R1:[0-9]+]]
+; O0-NEXT: ll $[[R2:[0-9]+]], 0($[[R1]])
+
; ALL: $[[BB0:[A-Z_0-9]+]]:
-; ALL: ll $[[R1:[0-9]+]], 0($[[R0]])
-; ALL: addu $[[R2:[0-9]+]], $[[R1]], $4
-; ALL: sc $[[R2]], 0($[[R0]])
-; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]]
-; MICROMIPS: beqzc $[[R2]], $[[BB0]]
+; ALL: ll $[[R3:[0-9]+]], 0($[[R0]])
+; ALL: addu $[[R4:[0-9]+]], $[[R3]], $4
+; ALL: sc $[[R4]], 0($[[R0]])
+; NOT-MICROMIPS: beqz $[[R4]], $[[BB0]]
+; MICROMIPS: beqzc $[[R4]], $[[BB0]]
+; MIPSR6: beqzc $[[R4]], $[[BB0]]
}
define i32 @AtomicLoadNand32(i32 signext %incr) nounwind {
@@ -41,6 +57,8 @@ entry:
; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x)
; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)(
+
+
; ALL: $[[BB0:[A-Z_0-9]+]]:
; ALL: ll $[[R1:[0-9]+]], 0($[[R0]])
; ALL: and $[[R3:[0-9]+]], $[[R1]], $4
@@ -48,6 +66,7 @@ entry:
; ALL: sc $[[R2]], 0($[[R0]])
; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]]
; MICROMIPS: beqzc $[[R2]], $[[BB0]]
+; MIPSR6: beqzc $[[R2]], $[[BB0]]
}
define i32 @AtomicSwap32(i32 signext %newval) nounwind {
@@ -68,6 +87,7 @@ entry:
; ALL: sc $[[R2:[0-9]+]], 0($[[R0]])
; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]]
; MICROMIPS: beqzc $[[R2]], $[[BB0]]
+; MIPSR6: beqzc $[[R2]], $[[BB0]]
}
define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind {
@@ -86,10 +106,13 @@ entry:
; ALL: $[[BB0:[A-Z_0-9]+]]:
; ALL: ll $2, 0($[[R0]])
-; ALL: bne $2, $4, $[[BB1:[A-Z_0-9]+]]
+; NOT-MICROMIPS: bne $2, $4, $[[BB1:[A-Z_0-9]+]]
+; MICROMIPS: bne $2, $4, $[[BB1:[A-Z_0-9]+]]
+; MIPSR6: bnec $2, $4, $[[BB1:[A-Z_0-9]+]]
; ALL: sc $[[R2:[0-9]+]], 0($[[R0]])
; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]]
; MICROMIPS: beqzc $[[R2]], $[[BB0]]
+; MIPSR6: beqzc $[[R2]], $[[BB0]]
; ALL: $[[BB1]]:
}
@@ -118,23 +141,28 @@ entry:
; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
+; O0: $[[BB0:[A-Z_0-9]+]]:
+; O0: ld $[[R10:[0-9]+]]
+; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]])
+
; ALL: $[[BB0:[A-Z_0-9]+]]:
-; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
-; ALL: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
-; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
-; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
-; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; ALL: sc $[[R14]], 0($[[R2]])
-; NOT-MICROMIPS: beqz $[[R14]], $[[BB0]]
-; MICROMIPS: beqzc $[[R14]], $[[BB0]]
+; ALL: ll $[[R12:[0-9]+]], 0($[[R2]])
+; ALL: addu $[[R13:[0-9]+]], $[[R12]], $[[R9]]
+; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]]
+; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]]
+; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]]
+; ALL: sc $[[R16]], 0($[[R2]])
+; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]]
+; MICROMIPS: beqzc $[[R16]], $[[BB0]]
+; MIPSR6: beqzc $[[R16]], $[[BB0]]
-; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
-; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
+; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]]
+; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]]
-; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24
-; NO-SEB-SEH: sra $2, $[[R17]], 24
+; NO-SEB-SEH: sll $[[R19:[0-9]+]], $[[R18]], 24
+; NO-SEB-SEH: sra $2, $[[R19]], 24
-; HAS-SEB-SEH: seb $2, $[[R16]]
+; HAS-SEB-SEH: seb $2, $[[R18]]
}
define signext i8 @AtomicLoadSub8(i8 signext %incr) nounwind {
@@ -158,23 +186,28 @@ entry:
; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
+; O0: $[[BB0:[A-Z_0-9]+]]:
+; O0: ld $[[R10:[0-9]+]]
+; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]])
+
; ALL: $[[BB0:[A-Z_0-9]+]]:
-; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
-; ALL: subu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
-; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
-; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
-; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; ALL: sc $[[R14]], 0($[[R2]])
-; NOT-MICROMIPS: beqz $[[R14]], $[[BB0]]
-; MICROMIPS: beqzc $[[R14]], $[[BB0]]
+; ALL: ll $[[R12:[0-9]+]], 0($[[R2]])
+; ALL: subu $[[R13:[0-9]+]], $[[R12]], $[[R9]]
+; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]]
+; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]]
+; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]]
+; ALL: sc $[[R16]], 0($[[R2]])
+; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]]
+; MICROMIPS: beqzc $[[R16]], $[[BB0]]
+; MIPSR6: beqzc $[[R16]], $[[BB0]]
-; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
-; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
+; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]]
+; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]]
-; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24
-; NO-SEB-SEH: sra $2, $[[R17]], 24
+; NO-SEB-SEH: sll $[[R19:[0-9]+]], $[[R18]], 24
+; NO-SEB-SEH: sra $2, $[[R19]], 24
-; HAS-SEB-SEH:seb $2, $[[R16]]
+; HAS-SEB-SEH:seb $2, $[[R18]]
}
define signext i8 @AtomicLoadNand8(i8 signext %incr) nounwind {
@@ -198,24 +231,29 @@ entry:
; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
-; ALL: $[[BB0:[A-Z_0-9]+]]:
-; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
-; ALL: and $[[R18:[0-9]+]], $[[R10]], $[[R9]]
-; ALL: nor $[[R11:[0-9]+]], $zero, $[[R18]]
-; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
-; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
-; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; ALL: sc $[[R14]], 0($[[R2]])
-; NOT-MICROMIPS: beqz $[[R14]], $[[BB0]]
-; MICROMIPS: beqzc $[[R14]], $[[BB0]]
+; O0: $[[BB0:[A-Z_0-9]+]]:
+; O0: ld $[[R10:[0-9]+]]
+; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]])
-; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
-; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
-
-; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24
-; NO-SEB-SEH: sra $2, $[[R17]], 24
-
-; HAS-SEB-SEH: seb $2, $[[R16]]
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R12:[0-9]+]], 0($[[R2]])
+; ALL: and $[[R13:[0-9]+]], $[[R12]], $[[R9]]
+; ALL: nor $[[R14:[0-9]+]], $zero, $[[R13]]
+; ALL: and $[[R15:[0-9]+]], $[[R14]], $[[R7]]
+; ALL: and $[[R16:[0-9]+]], $[[R12]], $[[R8]]
+; ALL: or $[[R17:[0-9]+]], $[[R16]], $[[R15]]
+; ALL: sc $[[R17]], 0($[[R2]])
+; NOT-MICROMIPS: beqz $[[R17]], $[[BB0]]
+; MICROMIPS: beqzc $[[R17]], $[[BB0]]
+; MIPSR6: beqzc $[[R17]], $[[BB0]]
+
+; ALL: and $[[R18:[0-9]+]], $[[R12]], $[[R7]]
+; ALL: srlv $[[R19:[0-9]+]], $[[R18]], $[[R5]]
+
+; NO-SEB-SEH: sll $[[R20:[0-9]+]], $[[R19]], 24
+; NO-SEB-SEH: sra $2, $[[R20]], 24
+
+; HAS-SEB-SEH: seb $2, $[[R19]]
}
define signext i8 @AtomicSwap8(i8 signext %newval) nounwind {
@@ -247,6 +285,7 @@ entry:
; ALL: sc $[[R14]], 0($[[R2]])
; NOT-MICROMIPS: beqz $[[R14]], $[[BB0]]
; MICROMIPS: beqzc $[[R14]], $[[BB0]]
+; MIPSR6: beqzc $[[R14]], $[[BB0]]
; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
@@ -286,13 +325,16 @@ entry:
; ALL: $[[BB0:[A-Z_0-9]+]]:
; ALL: ll $[[R13:[0-9]+]], 0($[[R2]])
; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]]
-; ALL: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+; NOT-MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+; MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+; MIPSR6: bnec $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]]
; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]]
; ALL: sc $[[R16]], 0($[[R2]])
; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]]
; MICROMIPS: beqzc $[[R16]], $[[BB0]]
+; MIPSR6: beqzc $[[R16]], $[[BB0]]
; ALL: $[[BB1]]:
; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]]
@@ -327,13 +369,16 @@ entry:
; ALL: $[[BB0:[A-Z_0-9]+]]:
; ALL: ll $[[R13:[0-9]+]], 0($[[R2]])
; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]]
-; ALL: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+; NOT-MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+; MICROMIPS: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+; MIPSR6: bnec $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]]
; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]]
; ALL: sc $[[R16]], 0($[[R2]])
; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]]
; MICROMIPS: beqzc $[[R16]], $[[BB0]]
+; MIPSR6: beqzc $[[R16]], $[[BB0]]
; ALL: $[[BB1]]:
; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]]
@@ -341,10 +386,17 @@ entry:
; NO-SEB-SEH: sll $[[R18:[0-9]+]], $[[R17]], 24
; NO-SEB-SEH: sra $[[R19:[0-9]+]], $[[R18]], 24
+; FIXME: -march=mips produces a redundant sign extension here...
+; NO-SEB-SEH: sll $[[R20:[0-9]+]], $5, 24
+; NO-SEB-SEH: sra $[[R20]], $[[R20]], 24
+
; HAS-SEB-SEH: seb $[[R19:[0-9]+]], $[[R17]]
-; ALL: xor $[[R20:[0-9]+]], $[[R19]], $5
-; ALL: sltiu $2, $[[R20]], 1
+; FIXME: ...Leading to this split check.
+; NO-SEB-SEH: xor $[[R21:[0-9]+]], $[[R19]], $[[R20]]
+; HAS-SEB-SEH: xor $[[R21:[0-9]+]], $[[R19]], $5
+
+; ALL: sltiu $2, $[[R21]], 1
}
; Check one i16 so that we cover the seh sign extend
@@ -371,25 +423,73 @@ entry:
; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
+; O0: $[[BB0:[A-Z_0-9]+]]:
+; O0: ld $[[R10:[0-9]+]]
+; O0-NEXT: ll $[[R11:[0-9]+]], 0($[[R10]])
+
; ALL: $[[BB0:[A-Z_0-9]+]]:
-; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
-; ALL: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
-; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
-; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
-; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; ALL: sc $[[R14]], 0($[[R2]])
-; NOT-MICROMIPS: beqz $[[R14]], $[[BB0]]
-; MICROMIPS: beqzc $[[R14]], $[[BB0]]
+; ALL: ll $[[R12:[0-9]+]], 0($[[R2]])
+; ALL: addu $[[R13:[0-9]+]], $[[R12]], $[[R9]]
+; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]]
+; ALL: and $[[R15:[0-9]+]], $[[R12]], $[[R8]]
+; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R14]]
+; ALL: sc $[[R16]], 0($[[R2]])
+; NOT-MICROMIPS: beqz $[[R16]], $[[BB0]]
+; MICROMIPS: beqzc $[[R16]], $[[BB0]]
+; MIPSR6: beqzc $[[R16]], $[[BB0]]
-; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
-; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
+; ALL: and $[[R17:[0-9]+]], $[[R12]], $[[R7]]
+; ALL: srlv $[[R18:[0-9]+]], $[[R17]], $[[R5]]
-; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 16
-; NO-SEB-SEH: sra $2, $[[R17]], 16
+; NO-SEB-SEH: sll $[[R19:[0-9]+]], $[[R18]], 16
+; NO-SEB-SEH: sra $2, $[[R19]], 16
-; MIPS32R2: seh $2, $[[R16]]
+; MIPS32R2: seh $2, $[[R18]]
}
+; Test that the i16 return value from cmpxchg is recognised as signed,
+; so that setCC doesn't end up comparing an unsigned value to a signed
+; value.
+; The rest of the functions here are testing the atomic expansion, so
+; we just match the end of the function.
+define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) {
+ %desired = add i16 %l, %r
+ %res = cmpxchg i16* %addr, i16 %desired, i16 %new seq_cst seq_cst
+ ret {i16, i1} %res
+
+; ALL-LABEL: foo
+; MIPSR6: addu $[[R2:[0-9]+]], $[[R1:[0-9]+]], $[[R0:[0-9]+]]
+; NOT-MICROMIPS: addu $[[R2:[0-9]+]], $[[R1:[0-9]+]], $[[R0:[0-9]+]]
+; MICROMIPS: addu16 $[[R2:[0-9]+]], $[[R1:[0-9]+]], $[[R0:[0-9]+]]
+
+; ALL: sync
+
+; ALL: andi $[[R3:[0-9]+]], $[[R2]], 65535
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R4:[0-9]+]], 0($[[R5:[0-9]+]])
+; ALL: and $[[R6:[0-9]+]], $[[R4]], $
+; ALL: and $[[R7:[0-9]+]], $[[R4]], $
+; ALL: or $[[R8:[0-9]+]], $[[R7]], $
+; ALL: sc $[[R8]], 0($[[R5]])
+; NOT-MICROMIPS: beqz $[[R8]], $[[BB0]]
+; MICROMIPS: beqzc $[[R8]], $[[BB0]]
+; MIPSR6: beqzc $[[R8]], $[[BB0]]
+
+; ALL: srlv $[[R9:[0-9]+]], $[[R6]], $
+
+; NO-SEB-SEH: sll $[[R10:[0-9]+]], $[[R9]], 16
+; NO-SEB-SEH: sra $[[R11:[0-9]+]], $[[R10]], 16
+
+; NO-SEB-SEH: sll $[[R12:[0-9]+]], $[[R2]], 16
+; NO-SEB-SEH: sra $[[R13:[0-9]+]], $[[R12]], 16
+
+; HAS-SEB-SEH: seh $[[R11:[0-9]+]], $[[R9]]
+; HAS-SEB-SEH: seh $[[R13:[0-9]+]], $[[R2]]
+
+; ALL: xor $[[R12:[0-9]+]], $[[R11]], $[[R13]]
+; ALL: sltiu $3, $[[R12]], 1
+; ALL: sync
+}
@countsint = common global i32 0, align 4
@@ -444,4 +544,5 @@ entry:
; ALL: sc $[[R2]], 0($[[PTR]])
; NOT-MICROMIPS: beqz $[[R2]], $[[BB0]]
; MICROMIPS: beqzc $[[R2]], $[[BB0]]
+; MIPSR6: beqzc $[[R2]], $[[BB0]]
}
diff --git a/test/CodeGen/Mips/atomicCmpSwapPW.ll b/test/CodeGen/Mips/atomicCmpSwapPW.ll
new file mode 100644
index 000000000000..981f0983fa4c
--- /dev/null
+++ b/test/CodeGen/Mips/atomicCmpSwapPW.ll
@@ -0,0 +1,17 @@
+; RUN: llc -O0 -march=mipsel -mcpu=mips32r2 -target-abi=o32 < %s -filetype=asm -o - \
+; RUN: | FileCheck -check-prefixes=PTR32,ALL %s
+; RUN: llc -O0 -march=mips64el -mcpu=mips64r2 -target-abi=n32 < %s -filetype=asm -o - \
+; RUN: | FileCheck -check-prefixes=PTR32,ALL %s
+; RUN: llc -O0 -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s -filetype=asm -o - \
+; RUN: | FileCheck -check-prefixes=PTR64,ALL %s
+
+; PTR32: lw $[[R0:[0-9]+]]
+; PTR64: ld $[[R0:[0-9]+]]
+
+; ALL: ll ${{[0-9]+}}, 0($[[R0]])
+
+define {i16, i1} @foo(i16* %addr, i16 signext %r, i16 zeroext %new) {
+ %res = cmpxchg i16* %addr, i16 %r, i16 %new seq_cst seq_cst
+ ret {i16, i1} %res
+}
+
diff --git a/test/CodeGen/Mips/biggot.ll b/test/CodeGen/Mips/biggot.ll
index b56ce6ba87b1..aafaf55ae184 100644
--- a/test/CodeGen/Mips/biggot.ll
+++ b/test/CodeGen/Mips/biggot.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=mipsel -mxgot < %s | FileCheck %s -check-prefix=O32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -mxgot < %s | \
+; RUN: llc -march=mipsel -mxgot -relocation-model=pic < %s | FileCheck %s -check-prefix=O32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -mxgot -relocation-model=pic < %s | \
; RUN: FileCheck %s -check-prefix=N64
@v0 = external global i32
diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll
index 0f46619b8272..3a5d4dcded26 100644
--- a/test/CodeGen/Mips/brdelayslot.ll
+++ b/test/CodeGen/Mips/brdelayslot.ll
@@ -1,23 +1,24 @@
; RUN: llc -march=mipsel -O0 < %s | FileCheck %s -check-prefix=None
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=Default
+; RUN: llc -march=mipsel -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefix=Default
; RUN: llc -march=mipsel -O1 -relocation-model=static < %s | \
; RUN: FileCheck %s -check-prefix=STATICO1
; RUN: llc -march=mipsel -disable-mips-df-forward-search=false \
; RUN: -relocation-model=static < %s | FileCheck %s -check-prefix=FORWARD
-; RUN: llc -march=mipsel -disable-mips-df-backward-search \
-; RUN: -disable-mips-df-succbb-search=false < %s | \
+; RUN: llc -march=mipsel -disable-mips-df-backward-search -relocation-model=pic \
+; RUN: -disable-mips-df-succbb-search=false -disable-preheader-prot=true < %s | \
; RUN: FileCheck %s -check-prefix=SUCCBB
define void @foo1() nounwind {
entry:
-; Default: jalr
-; Default-NOT: nop
-; Default: jr
+; Default: jalr
+; Default-NOT: nop
+; Default: jr
; Default-NOT: nop
; Default: .end
-; None: jalr
-; None: nop
-; None: jr
+; None: jalr
+; None: nop
+; None: jr
; None: nop
; None: .end
diff --git a/test/CodeGen/Mips/brsize3.ll b/test/CodeGen/Mips/brsize3.ll
index 1e76879409c6..ce8b8f6e35b1 100644
--- a/test/CodeGen/Mips/brsize3.ll
+++ b/test/CodeGen/Mips/brsize3.ll
@@ -1,6 +1,12 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 -mattr=+soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=b-no-short
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 \
+; RUN: -mattr=+soft-float -mips16-hard-float -relocation-model=pic \
+; RUN: -mips16-constant-islands -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefix=b-no-short
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 -mattr=+soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=b-long
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 \
+; RUN: -mattr=+soft-float -mips16-hard-float -relocation-model=pic \
+; RUN: -mips16-constant-islands -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefix=b-long
; ModuleID = 'brsize3.c'
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll
index 7682a98ace99..19ef04f040d8 100644
--- a/test/CodeGen/Mips/buildpairextractelementf64.ll
+++ b/test/CodeGen/Mips/buildpairextractelementf64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=NO-MFHC1 -check-prefix=ALL
-; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=NO-MFHC1 -check-prefix=ALL
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
-; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
-; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
-; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefixes=NO-MFHC1,ALL
+; RUN: llc -march=mips < %s | FileCheck %s -check-prefixes=NO-MFHC1,ALL
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefixes=HAS-MFHC1,ALL
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefixes=HAS-MFHC1,ALL
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefixes=HAS-MFHC1,ALL
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefixes=HAS-MFHC1,ALL
@a = external global i32
diff --git a/test/CodeGen/Mips/call-optimization.ll b/test/CodeGen/Mips/call-optimization.ll
index bfa09eaae3cb..762b00effc97 100644
--- a/test/CodeGen/Mips/call-optimization.ll
+++ b/test/CodeGen/Mips/call-optimization.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \
+; RUN: llc -march=mipsel -disable-mips-delay-filler -relocation-model=pic < %s | \
; RUN: FileCheck %s -check-prefix=O32
; RUN: llc -march=mipsel -mips-load-target-from-got=false \
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=O32-LOADTGT
+; RUN: -disable-mips-delay-filler -relocation-model=pic < %s | FileCheck %s -check-prefix=O32-LOADTGT
@gd1 = common global double 0.000000e+00, align 8
@gd2 = common global double 0.000000e+00, align 8
diff --git a/test/CodeGen/Mips/cannot-copy-registers.ll b/test/CodeGen/Mips/cannot-copy-registers.ll
new file mode 100644
index 000000000000..75cceb2011eb
--- /dev/null
+++ b/test/CodeGen/Mips/cannot-copy-registers.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=mips64 -mcpu=mips64r6 -mattr=+micromips \
+; RUN: -relocation-model=pic -O3 < %s
+
+; Check that the "Cannot copy registers" assertion is not triggered for microMIPS64r6.
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32, i32* @x, align 4
+ %and1 = and i32 %0, 4
+ %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds
+ ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %and1)
+
+ %1 = load i32, i32* @y, align 4
+ %and2 = and i32 %1, 5
+ %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds
+ ([7 x i8], [7 x i8]* @.str, i32 0, i32 0), i32 %and2)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/Mips/cconv/arguments-float.ll b/test/CodeGen/Mips/cconv/arguments-float.ll
index c81c7215e164..a76cf6226dc0 100644
--- a/test/CodeGen/Mips/cconv/arguments-float.ll
+++ b/test/CodeGen/Mips/cconv/arguments-float.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=mips -relocation-model=static -mattr=+soft-float < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
-; RUN: llc -march=mipsel -relocation-model=static -mattr=+soft-float < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+; RUN: llc -march=mips -relocation-model=static -mattr=+soft-float < %s | FileCheck --check-prefixes=ALL,SYM32,O32,O32BE %s
+; RUN: llc -march=mipsel -relocation-model=static -mattr=+soft-float < %s | FileCheck --check-prefixes=ALL,SYM32,O32,O32LE %s
-; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi o32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi o32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
-; RUN: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,NEW %s
-; RUN: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,NEW %s
; Test the floating point arguments for all ABI's and byte orders as specified
; by section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/arguments-fp128.ll b/test/CodeGen/Mips/cconv/arguments-fp128.ll
index 6c62609396c5..70df97608aa9 100644
--- a/test/CodeGen/Mips/cconv/arguments-fp128.ll
+++ b/test/CodeGen/Mips/cconv/arguments-fp128.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+; RUN: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32 %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32 %s
-; RUN: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+; RUN: llc -march=mips64 -relocation-model=static -mattr=+soft-float -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64 %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=+soft-float -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64 %s
; Test the fp128 arguments for all ABI's and byte orders as specified
; by section 2 of the MIPSpro N32 Handbook.
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
index 9f1fe91ec172..5f7a86534bdf 100644
--- a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
+++ b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
-; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefixes=ALL,SYM32,O32,O32BE %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefixes=ALL,SYM32,O32,O32LE %s
-; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWBE %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWLE %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,N32,NEW,NEWBE %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,N32,NEW,NEWLE %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWBE %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWLE %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,N64,NEW,NEWBE %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,N64,NEW,NEWLE %s
; Test the effect of varargs on floating point types in the non-variable part
; of the argument list as specified by section 2 of the MIPSpro N32 Handbook.
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-float.ll b/test/CodeGen/Mips/cconv/arguments-hard-float.ll
index 24148ed176db..2e753d0f07cb 100644
--- a/test/CodeGen/Mips/cconv/arguments-hard-float.ll
+++ b/test/CodeGen/Mips/cconv/arguments-hard-float.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
-; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefixes=ALL,SYM32,O32,O32BE %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefixes=ALL,SYM32,O32,O32LE %s
-; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,NEW %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,NEW %s
; Test the floating point arguments for all ABI's and byte orders as specified
; by section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll b/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll
index 26eb569f865d..1a3b664d9159 100644
--- a/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll
+++ b/test/CodeGen/Mips/cconv/arguments-hard-fp128.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32 %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32 %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64 %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64 %s
; Test the fp128 arguments for all ABI's and byte orders as specified
; by section 2 of the MIPSpro N32 Handbook.
diff --git a/test/CodeGen/Mips/cconv/arguments-small-structures-bigger-than-32bits.ll b/test/CodeGen/Mips/cconv/arguments-small-structures-bigger-than-32bits.ll
index 087a0515f379..56f9a64908bc 100644
--- a/test/CodeGen/Mips/cconv/arguments-small-structures-bigger-than-32bits.ll
+++ b/test/CodeGen/Mips/cconv/arguments-small-structures-bigger-than-32bits.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=mips64 -target-abi n64 -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=MIPSEB
-; RUN: llc < %s -march=mips64el -target-abi n64 -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=MIPSEL
-; RUN: llc < %s -march=mips64 -target-abi n32 -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=MIPSEB
-; RUN: llc < %s -march=mips64el -target-abi n32 -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=MIPSEL
+; RUN: llc < %s -march=mips64 -target-abi n64 -mcpu=mips64r2 | FileCheck %s -check-prefixes=ALL,MIPSEB
+; RUN: llc < %s -march=mips64el -target-abi n64 -mcpu=mips64r2 | FileCheck %s -check-prefixes=ALL,MIPSEL
+; RUN: llc < %s -march=mips64 -target-abi n32 -mcpu=mips64r2 | FileCheck %s -check-prefixes=ALL,MIPSEB
+; RUN: llc < %s -march=mips64el -target-abi n32 -mcpu=mips64r2 | FileCheck %s -check-prefixes=ALL,MIPSEL
; #include <stdio.h>
;
diff --git a/test/CodeGen/Mips/cconv/arguments-struct.ll b/test/CodeGen/Mips/cconv/arguments-struct.ll
index ee6bfaeb9537..44ea7c0f8337 100644
--- a/test/CodeGen/Mips/cconv/arguments-struct.ll
+++ b/test/CodeGen/Mips/cconv/arguments-struct.ll
@@ -1,14 +1,14 @@
-; RUN: llc -mtriple=mips-unknown-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32-BE %s
-; RUN: llc -mtriple=mipsel-unknown-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32-LE %s
+; RUN: llc -mtriple=mips-unknown-linux-gnu -relocation-model=static < %s | FileCheck --check-prefixes=ALL,SYM32,O32-BE %s
+; RUN: llc -mtriple=mipsel-unknown-linux-gnu -relocation-model=static < %s | FileCheck --check-prefixes=ALL,SYM32,O32-LE %s
-; RUN-TODO: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32-BE %s
-; RUN-TODO: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32-LE %s
+; RUN-TODO: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32-BE %s
+; RUN-TODO: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32-LE %s
-; RUN: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW-BE %s
-; RUN: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW-LE %s
+; RUN: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,NEW-BE %s
+; RUN: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,NEW-LE %s
-; RUN: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW-BE %s
-; RUN: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW-LE %s
+; RUN: llc -mtriple=mips64-unknown-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,NEW-BE %s
+; RUN: llc -mtriple=mips64el-unknown-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,NEW-LE %s
; Test small structures for all ABI's and byte orders.
;
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs.ll b/test/CodeGen/Mips/cconv/arguments-varargs.ll
index d1a196738aee..9c20b882dcb6 100644
--- a/test/CodeGen/Mips/cconv/arguments-varargs.ll
+++ b/test/CodeGen/Mips/cconv/arguments-varargs.ll
@@ -1,14 +1,14 @@
-; RUN: llc -mtriple=mips-linux -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-BE %s
-; RUN: llc -mtriple=mipsel-linux -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-LE %s
+; RUN: llc -mtriple=mips-linux -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32,O32-BE %s
+; RUN: llc -mtriple=mipsel-linux -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32,O32-LE %s
-; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN: llc -mtriple=mips64-linux -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-BE %s
-; RUN: llc -mtriple=mips64el-linux -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-LE %s
+; RUN: llc -mtriple=mips64-linux -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,NEW,N32,NEW-BE %s
+; RUN: llc -mtriple=mips64el-linux -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,NEW,N32,NEW-LE %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-BE %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-LE %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,NEW,N64,NEW-BE %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,NEW,N64,NEW-LE %s
@hwords = global [3 x i16] zeroinitializer, align 1
@words = global [3 x i32] zeroinitializer, align 1
diff --git a/test/CodeGen/Mips/cconv/arguments.ll b/test/CodeGen/Mips/cconv/arguments.ll
index 430705f8d418..7af4e5517d51 100644
--- a/test/CodeGen/Mips/cconv/arguments.ll
+++ b/test/CodeGen/Mips/cconv/arguments.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
-; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,O32 %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,SYM32,NEW %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,NEW %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,SYM64,NEW %s
; Test the integer arguments for all ABI's and byte orders as specified by
; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/callee-saved-float.ll b/test/CodeGen/Mips/cconv/callee-saved-float.ll
index c84f0f439c26..30a5727f344d 100644
--- a/test/CodeGen/Mips/cconv/callee-saved-float.ll
+++ b/test/CodeGen/Mips/cconv/callee-saved-float.ll
@@ -1,22 +1,24 @@
-; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
-; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefixes=ALL,O32-INV %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefixes=ALL,O32-INV %s
-; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=O32-INV %s
-; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=O32-INV %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefixes=ALL,ALL-INV,O32-INV %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefixes=ALL,ALL-INV,O32-INV %s
-; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N32-INV %s
-; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N32-INV %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefixes=ALL,ALL-INV,N32-INV %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefixes=ALL,ALL-INV,N32-INV %s
-; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N64-INV %s
-; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=ALL-INV --check-prefix=N64-INV %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefixes=ALL,ALL-INV,N64-INV %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefixes=ALL,ALL-INV,N64-INV %s
+
+; RUN: llc -march=mips -mcpu=mips32r6 -mattr=micromips -filetype=obj < %s -o - | llvm-objdump -no-show-raw-insn -arch mips -mcpu=mips32r6 -mattr=micromips -d - | FileCheck --check-prefix=MM32R6 %s
; Test that the callee-saved registers are callee-saved as specified by section
; 2 of the MIPSpro N32 Handbook and section 3 of the SYSV ABI spec.
@@ -109,3 +111,6 @@ entry:
; N64-DAG: ldc1 [[F30]], [[OFF30]]($sp)
; N64-DAG: ldc1 [[F31]], [[OFF31]]($sp)
; N64: addiu $sp, $sp, 64
+
+; Check the mapping between LDC164 and LDC1_64_MMR6.
+; MM32R6: ldc1
diff --git a/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll b/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll
index 4b28b9962075..bd33e0c51697 100644
--- a/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll
+++ b/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s
-; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s
-; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV %s
-; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV %s
+; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefixes=ALL,O32-FPXX %s
+; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefixes=ALL,O32-FPXX %s
+; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefixes=ALL,O32-FPXX-INV %s
+; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefixes=ALL,O32-FPXX-INV %s
-; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s
-; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s
-; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV --check-prefix=O32-FPXX-INV %s
-; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV --check-prefix=O32-FPXX-INV %s
+; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefixes=ALL,O32-FPXX %s
+; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefixes=ALL,O32-FPXX %s
+; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefixes=ALL,O32-FPXX-INV,O32-FPXX-INV %s
+; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefixes=ALL,O32-FPXX-INV,O32-FPXX-INV %s
define void @fpu_clobber() nounwind {
entry:
diff --git a/test/CodeGen/Mips/cconv/callee-saved.ll b/test/CodeGen/Mips/cconv/callee-saved.ll
index 0570ab35fd00..b7e94f882282 100644
--- a/test/CodeGen/Mips/cconv/callee-saved.ll
+++ b/test/CodeGen/Mips/cconv/callee-saved.ll
@@ -1,22 +1,22 @@
-; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
-; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefixes=ALL,O32-INV %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefixes=ALL,O32-INV %s
-; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
-; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32-INV %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32-INV %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32-INV %s
-; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32-INV %s
-; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32-INV %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32-INV %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32-INV %s
-; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64-INV %s
-; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64-INV %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64-INV %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64-INV %s
; Test the callee-saved registers are callee-saved as specified by section
; 2 of the MIPSpro N32 Handbook and section 3 of the SYSV ABI spec.
diff --git a/test/CodeGen/Mips/cconv/memory-layout.ll b/test/CodeGen/Mips/cconv/memory-layout.ll
index 33a68da157f6..e992169f6b64 100644
--- a/test/CodeGen/Mips/cconv/memory-layout.ll
+++ b/test/CodeGen/Mips/cconv/memory-layout.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
-; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
; Test the memory layout for all ABI's and byte orders as specified by section
; 4 of MD00305 (MIPS ABIs Described).
@@ -27,39 +27,39 @@
@double = global double 1.0, align 1
@pointer = global i8* @byte
-; ALL-NOT: .align
+; ALL-NOT: .p2align
; ALL-LABEL: byte:
; ALL: .byte 1
; ALL: .size byte, 1
-; ALL: .align 1
+; ALL: .p2align 1
; ALL-LABEL: halfword:
; ALL: .2byte 258
; ALL: .size halfword, 2
-; ALL: .align 2
+; ALL: .p2align 2
; ALL-LABEL: word:
; ALL: .4byte 16909060
; ALL: .size word, 4
-; ALL: .align 2
+; ALL: .p2align 2
; ALL-LABEL: float:
; ALL: .4byte 1065353216
; ALL: .size float, 4
-; ALL: .align 3
+; ALL: .p2align 3
; ALL-LABEL: dword:
; ALL: .8byte 283686952306183
; ALL: .size dword, 8
-; ALL: .align 3
+; ALL: .p2align 3
; ALL-LABEL: double:
; ALL: .8byte 4607182418800017408
; ALL: .size double, 8
-; O32: .align 2
-; N32: .align 2
-; N64: .align 3
+; O32: .p2align 2
+; N32: .p2align 2
+; N64: .p2align 3
; ALL-LABEL: pointer:
; O32: .4byte byte
; O32: .size pointer, 4
@@ -76,44 +76,44 @@
@double_array = global [2 x double] [double 1.0, double 2.0], align 1
@pointer_array = global [2 x i8*] [i8* @byte, i8* @byte]
-; ALL-NOT: .align
+; ALL-NOT: .p2align
; ALL-LABEL: byte_array:
; ALL: .ascii "\001\002"
; ALL: .size byte_array, 2
-; ALL: .align 1
+; ALL: .p2align 1
; ALL-LABEL: halfword_array:
; ALL: .2byte 1
; ALL: .2byte 2
; ALL: .size halfword_array, 4
-; ALL: .align 2
+; ALL: .p2align 2
; ALL-LABEL: word_array:
; ALL: .4byte 1
; ALL: .4byte 2
; ALL: .size word_array, 8
-; ALL: .align 2
+; ALL: .p2align 2
; ALL-LABEL: float_array:
; ALL: .4byte 1065353216
; ALL: .4byte 1073741824
; ALL: .size float_array, 8
-; ALL: .align 3
+; ALL: .p2align 3
; ALL-LABEL: dword_array:
; ALL: .8byte 1
; ALL: .8byte 2
; ALL: .size dword_array, 16
-; ALL: .align 3
+; ALL: .p2align 3
; ALL-LABEL: double_array:
; ALL: .8byte 4607182418800017408
; ALL: .8byte 4611686018427387904
; ALL: .size double_array, 16
-; O32: .align 2
-; N32: .align 2
-; N64: .align 3
+; O32: .p2align 2
+; N32: .p2align 2
+; N64: .p2align 3
; ALL-LABEL: pointer_array:
; O32: .4byte byte
; O32: .4byte byte
@@ -128,7 +128,7 @@
%mixed = type { i8, double, i16 }
@mixed = global %mixed { i8 1, double 1.0, i16 515 }, align 1
-; ALL: .align 3
+; ALL: .p2align 3
; ALL-LABEL: mixed:
; ALL: .byte 1
; ALL: .space 7
diff --git a/test/CodeGen/Mips/cconv/reserved-space.ll b/test/CodeGen/Mips/cconv/reserved-space.ll
index 23190c2790cc..a17377ba0e56 100644
--- a/test/CodeGen/Mips/cconv/reserved-space.ll
+++ b/test/CodeGen/Mips/cconv/reserved-space.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
-; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
; Test that O32 correctly reserved space for the four arguments, even when
; there aren't any as per section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/return-float.ll b/test/CodeGen/Mips/cconv/return-float.ll
index 8eb8c411e081..b9a6d6c5bc0d 100644
--- a/test/CodeGen/Mips/cconv/return-float.ll
+++ b/test/CodeGen/Mips/cconv/return-float.ll
@@ -1,14 +1,14 @@
-; RUN: llc -mtriple=mips-linux-gnu -mattr=+soft-float -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -mtriple=mipsel-linux-gnu -mattr=+soft-float -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -mtriple=mips-linux-gnu -mattr=+soft-float -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -mattr=+soft-float -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN-TODO: llc -mtriple=mips64-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -mtriple=mips64el-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN: llc -mtriple=mips64-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
-; RUN: llc -mtriple=mips64-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -mattr=+soft-float -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
; Test the float returns for all ABI's and byte orders as specified by
; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/return-hard-float.ll b/test/CodeGen/Mips/cconv/return-hard-float.ll
index 14853c8ca6f7..768cb6a9f2c6 100644
--- a/test/CodeGen/Mips/cconv/return-hard-float.ll
+++ b/test/CodeGen/Mips/cconv/return-hard-float.ll
@@ -1,17 +1,17 @@
-; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
-; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=032FP64 %s
-; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=032FP64 %s
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefixes=ALL,032FP64 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefixes=ALL,032FP64 %s
; Test the float returns for all ABI's and byte orders as specified by
; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/return-hard-fp128.ll b/test/CodeGen/Mips/cconv/return-hard-fp128.ll
index 34e9647acddd..bdbfb80bd4aa 100644
--- a/test/CodeGen/Mips/cconv/return-hard-fp128.ll
+++ b/test/CodeGen/Mips/cconv/return-hard-fp128.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
-; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64el -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
; Test the fp128 returns for N32/N64 and all byte orders as specified by
; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll b/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
index c4c8f10ca3b4..9b178e4380d1 100644
--- a/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
+++ b/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
; Test return of {fp128} agrees with de-facto N32/N64 ABI.
diff --git a/test/CodeGen/Mips/cconv/return-struct.ll b/test/CodeGen/Mips/cconv/return-struct.ll
index 8decd04f089b..da20919ffd42 100644
--- a/test/CodeGen/Mips/cconv/return-struct.ll
+++ b/test/CodeGen/Mips/cconv/return-struct.ll
@@ -1,14 +1,14 @@
-; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-BE %s
-; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-LE %s
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32,O32-BE %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32,O32-LE %s
-; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-BE %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-LE %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32,N32-BE %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32,N32-LE %s
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-BE %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-LE %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64,N64-BE %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64,N64-LE %s
; Test struct returns for all ABI's and byte orders.
@@ -158,9 +158,6 @@ entry:
; sret pointer is already in $4
; N32-DAG: lui [[PTR_HI:\$[0-9]+]], %hi(struct_128xi16)
; N32-DAG: addiu [[PTR:\$[0-9]+]], [[PTR_HI]], %lo(struct_128xi16)
-; FIXME: This signext isn't necessary. Like integers, pointers are sign-extended,
-; but unlike integers, pointers cannot have the signext attribute.
-; N32-DAG: sll $5, [[PTR]], 0
; N32: jal memcpy
; sret pointer is already in $4
diff --git a/test/CodeGen/Mips/cconv/return.ll b/test/CodeGen/Mips/cconv/return.ll
index a53767275434..561c94cb5783 100644
--- a/test/CodeGen/Mips/cconv/return.ll
+++ b/test/CodeGen/Mips/cconv/return.ll
@@ -1,14 +1,14 @@
-; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
-; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
; Test the integer returns for all ABI's and byte orders as specified by
; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cconv/roundl-call.ll b/test/CodeGen/Mips/cconv/roundl-call.ll
new file mode 100644
index 000000000000..8e4d6597784c
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/roundl-call.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=mips64 -mcpu=mips64 -target-abi=n32 -relocation-model=pic < \
+; RUN: %s | FileCheck %s -check-prefixes=ALL,N32,HARD-FLOAT
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n32 -relocation-model=pic < \
+; RUN: %s | FileCheck %s -check-prefixes=ALL,N32,HARD-FLOAT
+
+; RUN: llc -march=mips64 -mcpu=mips64 -target-abi=n64 -relocation-model=pic < \
+; RUN: %s | FileCheck %s -check-prefixes=ALL,N64,HARD-FLOAT
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 -relocation-model=pic < \
+; RUN: %s | FileCheck %s -check-prefixes=ALL,N64,HARD-FLOAT
+
+; RUN: llc -march=mips64 -mcpu=mips64 -mattr=+soft-float -target-abi=n32 \
+; RUN: -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,N32,SOFT-FLOAT
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=+soft-float -target-abi=n32 \
+; RUN: -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,N32,SOFT-FLOAT
+
+; RUN: llc -march=mips64 -mcpu=mips64 -mattr=+soft-float -target-abi=n64 < %s \
+; RUN: | FileCheck %s -check-prefixes=ALL,N64,SOFT-FLOAT
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=+soft-float -target-abi=n64 < \
+; RUN: %s | FileCheck %s -check-prefixes=ALL,N64,SOFT-FLOAT
+
+@fp128 = global fp128 zeroinitializer
+
+define void @roundl_call(fp128 %value) {
+entry:
+; ALL-LABEL: roundl_call:
+; N32: lw $25, %call16(roundl)($gp)
+; N64: ld $25, %call16(roundl)($gp)
+
+; SOFT-FLOAT: sd $4, 8(${{[0-9]+}})
+; SOFT-FLOAT: sd $2, 0(${{[0-9]+}})
+
+; HARD-FLOAT: sdc1 $f2, 8(${{[0-9]+}})
+; HARD-FLOAT: sdc1 $f0, 0(${{[0-9]+}})
+
+ %call = call fp128 @roundl(fp128 %value)
+ store fp128 %call, fp128* @fp128
+ ret void
+}
+
+declare fp128 @roundl(fp128) nounwind readnone
diff --git a/test/CodeGen/Mips/cconv/stack-alignment.ll b/test/CodeGen/Mips/cconv/stack-alignment.ll
index f21bc3066f72..c957e311b1b3 100644
--- a/test/CodeGen/Mips/cconv/stack-alignment.ll
+++ b/test/CodeGen/Mips/cconv/stack-alignment.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=mips < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN: llc -march=mipsel < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN: llc -march=mips < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN: llc -march=mipsel < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
-; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64 -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
+; RUN-TODO: llc -march=mips64el -target-abi o32 < %s | FileCheck --check-prefixes=ALL,O32 %s
-; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
-; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -march=mips64 -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64el -target-abi n32 < %s | FileCheck --check-prefixes=ALL,N32 %s
-; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
-; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -march=mips64 -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64el -target-abi n64 < %s | FileCheck --check-prefixes=ALL,N64 %s
; Test the stack alignment for all ABI's and byte orders as specified by
; section 5 of MD00305 (MIPS ABIs Described).
diff --git a/test/CodeGen/Mips/cfi_offset.ll b/test/CodeGen/Mips/cfi_offset.ll
index 97233328fd55..0e85ea0a42be 100644
--- a/test/CodeGen/Mips/cfi_offset.ll
+++ b/test/CodeGen/Mips/cfi_offset.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=mips -mattr=+o32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
-; RUN: llc -march=mipsel -mattr=+o32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
-; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
-; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
-; RUN: llc -march=mips -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
-; RUN: llc -march=mipsel -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
+; RUN: llc -march=mips -mattr=+o32 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-EL
+; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefixes=CHECK,CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefixes=CHECK,CHECK-EL
+; RUN: llc -march=mips -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-EL
@var = global double 0.0
diff --git a/test/CodeGen/Mips/check-adde-redundant-moves.ll b/test/CodeGen/Mips/check-adde-redundant-moves.ll
index 7bc63a494ac7..cf0fda66ad62 100644
--- a/test/CodeGen/Mips/check-adde-redundant-moves.ll
+++ b/test/CodeGen/Mips/check-adde-redundant-moves.ll
@@ -1,15 +1,9 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s -check-prefix=ALL
; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s -check-prefix=ALL
; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefix=ALL
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index a8008a2cb29f..b0ef2b973d6a 100755
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
-; RUN: llc -march=mips -mcpu=mips32 -regalloc=basic < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
-; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
-; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMP
-; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
-; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMP
+; RUN: llc -march=mips -mcpu=mips32 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,32-CMOV
+; RUN: llc -march=mips -mcpu=mips32 -regalloc=basic -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,32-CMOV
+; RUN: llc -march=mips -mcpu=mips32r2 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,32-CMOV
+; RUN: llc -march=mips -mcpu=mips32r6 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,32-CMP
+; RUN: llc -march=mips64el -mcpu=mips4 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,64-CMOV
+; RUN: llc -march=mips64el -mcpu=mips64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,64-CMOV
+; RUN: llc -march=mips64el -mcpu=mips64r6 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,64-CMP
@i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4
@i3 = common global i32* null, align 4
@@ -521,14 +521,14 @@ entry:
; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
; 64-CMOV-DAG: movn $[[I4]], $[[I5]], $[[R0]]
-; 64-CMP-DAG: daddiu $[[I5:[0-9]+]], $zero, 5
-; 64-CMP-DAG: daddiu $[[I4:2]], $zero, 4
+; 64-CMP-DAG: daddiu $[[I4:[0-9]+]], $zero, 4
+; 64-CMP-DAG: daddiu $[[I5:2]], $zero, 5
; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, 32766
; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
; FIXME: We can do better than this by using selccz to choose between -0 and -2
-; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I5]], $[[R0]]
-; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]]
-; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I4]], $[[R0]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T1]], $[[T0]]
define i64 @slti64_3(i64 %a) {
entry:
diff --git a/test/CodeGen/Mips/compactbranches/beqc-bnec-register-constraint.ll b/test/CodeGen/Mips/compactbranches/beqc-bnec-register-constraint.ll
new file mode 100644
index 000000000000..3cec194a3786
--- /dev/null
+++ b/test/CodeGen/Mips/compactbranches/beqc-bnec-register-constraint.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=mips -mcpu=mips32r6 -O1 -start-after=dwarfehprepare < %s | FileCheck %s
+
+; beqc/bnec have the constraint that $rs < $rt && $rs != 0 && $rt != 0
+; Cases where $rs == 0 and $rt != 0 should be transformed into beqzc/bnezc.
+; Cases where $rs > $rt can have the operands swapped as ==,!= are commutative.
+
+; Cases of beq & bne where $rs == $rt have to be inhibited from being turned
+; into compact branches, but arguably should not occur. This test covers the
+; $rs == $rt case.
+
+; Starting the pipeline after DWARF exception handling preparation skips the
+; optimizations that could simplify out the crucial bnec $4, $4 instruction.
+
+define internal void @_ZL14TestRemoveLastv(i32* %alist.sroa.0.4) {
+entry:
+ %ascevgep = getelementptr i32, i32* %alist.sroa.0.4, i64 99
+ br label %do.body121
+
+for.cond117:
+ %alsr.iv.next = add nsw i32 %alsr.iv, -1
+ %ascevgep340 = getelementptr i32, i32* %alsr.iv339, i64 -1
+ %acmp118 = icmp sgt i32 %alsr.iv.next, 0
+ br i1 %acmp118, label %do.body121, label %if.then143
+
+do.body121:
+ %alsr.iv339 = phi i32* [ %ascevgep, %entry ], [ %ascevgep340, %for.cond117 ]
+ %alsr.iv = phi i32 [ 100, %entry ], [ %alsr.iv.next, %for.cond117 ]
+ %a9 = add i32 %alsr.iv, -1
+ %alnot124 = icmp eq i32 %alsr.iv, %alsr.iv
+ br i1 %alnot124, label %do.body134, label %if.then143, !prof !11
+
+do.body134:
+ %a10 = add i32 %alsr.iv, -1
+ %a11 = load i32, i32* %alsr.iv339, align 4, !tbaa !5
+; CHECK-NOT: bnec $[[R0:[0-9]+]], $[[R0]]
+; CHECK-NOT: beqc $[[R1:[0-9]+]], $[[R1]]
+ %alnot137 = icmp eq i32 %a9, %a11
+ br i1 %alnot137, label %do.end146, label %if.then143, !prof !11
+
+if.then143:
+ ret void
+ unreachable
+
+do.end146:
+ %alnot151 = icmp eq i32 %a9, %a10
+ br i1 %alnot151, label %for.cond117, label %if.then143, !prof !11
+
+}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !3, i64 0}
+!11 = !{!"branch_weights", i32 2000, i32 1}
+!12 = !{!"branch_weights", i32 -388717296, i32 7818360}
+
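For readers unfamiliar with the constraint the test above exercises, here is a minimal illustrative IR sketch (not part of this commit; the function and label names are invented) that puts the same value on both sides of the compared branch. Note that at normal optimization levels such a comparison simply folds to true, which is why the committed test instead builds the same-register situation out of loop phis and loads and starts after dwarfehprepare, so it survives to register allocation.

; Sketch only: both operands of the compare are the same value, so the MIPSR6
; backend must not emit "beqc $4, $4" or "bnec $4, $4"; it has to fall back to
; a legal (non-compact or zero-compare) branch form instead.
define i32 @same_operand(i32 signext %a) {
entry:
  %cmp = icmp eq i32 %a, %a
  br i1 %cmp, label %yes, label %no

yes:
  ret i32 1

no:
  ret i32 0
}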
diff --git a/test/CodeGen/Mips/compactbranches/compact-branch-policy.ll b/test/CodeGen/Mips/compactbranches/compact-branch-policy.ll
new file mode 100644
index 000000000000..c819bf59ace6
--- /dev/null
+++ b/test/CodeGen/Mips/compactbranches/compact-branch-policy.ll
@@ -0,0 +1,28 @@
+; Check that -mips-compact-branches={never,optimal,always} is accepted and honoured.
+; RUN: llc -march=mips -mcpu=mips32r6 -mips-compact-branches=never < %s | FileCheck %s -check-prefix=NEVER
+; RUN: llc -march=mips -mcpu=mips32r6 -mips-compact-branches=optimal < %s | FileCheck %s -check-prefix=OPTIMAL
+; RUN: llc -march=mips -mcpu=mips32r6 -mips-compact-branches=always < %s | FileCheck %s -check-prefix=ALWAYS
+
+define i32 @l(i32 signext %a, i32 signext %b) {
+entry:
+ %add = add nsw i32 %b, %a
+ %cmp = icmp slt i32 %add, 100
+; NEVER: beq
+; OPTIMAL: beq
+; ALWAYS: beqzc
+; This nop is required for correctness, as having (j|b)al as the instruction
+; immediately following beqzc would cause a forbidden slot hazard.
+; ALWAYS: nop
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %call = tail call i32 @k()
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %call.pn = phi i32 [ %call, %if.then ], [ -1, %entry ]
+ %c.0 = add nsw i32 %call.pn, %add
+ ret i32 %c.0
+}
+
+declare i32 @k() #1
diff --git a/test/CodeGen/Mips/compactbranches/compact-branches.ll b/test/CodeGen/Mips/compactbranches/compact-branches.ll
new file mode 100644
index 000000000000..75ff8a0bbcbb
--- /dev/null
+++ b/test/CodeGen/Mips/compactbranches/compact-branches.ll
@@ -0,0 +1,206 @@
+; RUN: llc -march=mipsel -mcpu=mips32r6 -relocation-model=static -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=STATIC32
+; RUN: llc -march=mipsel -mcpu=mips64r6 -target-abi n64 -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=PIC
+
+; Function Attrs: nounwind
+define void @l() {
+entry:
+; PIC: jalrc $25
+ %call = tail call i32 @k()
+; PIC: jalrc $25
+ %call1 = tail call i32 @j()
+ %cmp = icmp eq i32 %call, %call1
+; CHECK: bnec
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+; STATIC: nop
+; STATIC: jal
+; PIC: jalrc $25
+ tail call void @f(i32 signext -2)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+; CHECK: jrc $ra
+ ret void
+}
+
+declare i32 @k()
+
+declare i32 @j()
+
+declare void @f(i32 signext)
+
+; Function Attrs: nounwind
+define void @l2() {
+entry:
+; PIC: jalrc $25
+ %call = tail call i32 @k()
+; PIC: jalrc $25
+ %call1 = tail call i32 @i()
+ %cmp = icmp eq i32 %call, %call1
+; CHECK: beqc
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; STATIC: nop
+; STATIC: jal
+; PIC: jalrc $25
+ tail call void @f(i32 signext -1)
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+; CHECK: jrc $ra
+ ret void
+}
+
+declare i32 @i()
+
+; Function Attrs: nounwind
+define void @l3() {
+entry:
+; PIC: jalrc $25
+ %call = tail call i32 @k()
+ %cmp = icmp slt i32 %call, 0
+; CHECK: bgez
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+; STATIC: nop
+; STATIC: jal
+; PIC: jalrc $25
+ tail call void @f(i32 signext 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+; CHECK: jrc $ra
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @l4() {
+entry:
+ %call = tail call i32 @k()
+ %cmp = icmp slt i32 %call, 1
+; CHECK: bgtzc
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+; STATIC: nop
+; STATIC: jal
+ tail call void @f(i32 signext 1)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+; CHECK: jrc $ra
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @l5() {
+entry:
+; PIC: jalrc $25
+ %call = tail call i32 @k()
+; PIC: jalrc $25
+ %cmp = icmp sgt i32 %call, 0
+; CHECK: blezc
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+; STATIC: nop
+; STATIC: jal
+; PIC: jalrc $25
+ tail call void @f(i32 signext 2)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+; CHECK: jrc $ra
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @l6() {
+entry:
+; PIC: jalrc $25
+ %call = tail call i32 @k()
+; PIC: jalrc $25
+ %cmp = icmp sgt i32 %call, -1
+; CHECK: bltzc
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+; STATIC: nop
+; STATIC: jal
+; PIC: jalrc $25
+ tail call void @f(i32 signext 3)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+; CHECK: jrc $ra
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @l7() {
+entry:
+; PIC: jalrc $25
+ %call = tail call i32 @k()
+ %cmp = icmp eq i32 %call, 0
+; CHECK: bnezc
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+; STATIC: nop
+; STATIC: jal
+; PIC: jalrc $25
+ tail call void @f(i32 signext 4)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+; CHECK: jrc $ra
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @l8() {
+entry:
+; PIC: jalrc $25
+ %call = tail call i32 @k()
+ %cmp = icmp eq i32 %call, 0
+; CHECK: beqzc
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+; STATIC: nop
+; STATIC: jal
+; PIC: jalrc $25
+ tail call void @f(i32 signext 5)
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+; CHECK: jrc $ra
+ ret void
+}
+
+define i32 @l9(i8* ()* %i) #0 {
+entry:
+ %i.addr = alloca i8* ()*, align 4
+ store i8* ()* %i, i8* ()** %i.addr, align 4
+; STATIC32: jal
+; STATIC32: nop
+; PIC: jalrc $25
+ %call = call i32 @k()
+; PIC: jalrc $25
+ %cmp = icmp ne i32 %call, 0
+; CHECK: beqzc
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %0 = load i8* ()*, i8* ()** %i.addr, align 4
+; CHECK: jalrc $25
+ %call1 = call i8* %0()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+; CHECK: jrc $ra
+ ret i32 -1
+}
diff --git a/test/CodeGen/Mips/compactbranches/no-beqzc-bnezc.ll b/test/CodeGen/Mips/compactbranches/no-beqzc-bnezc.ll
new file mode 100644
index 000000000000..f6fef90d01b9
--- /dev/null
+++ b/test/CodeGen/Mips/compactbranches/no-beqzc-bnezc.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=mipsel -mcpu=mips32r6 -disable-mips-delay-filler < %s | FileCheck %s
+; RUN: llc -march=mips -mcpu=mips32r6 -disable-mips-delay-filler < %s \
+; RUN: -filetype=obj -o - | llvm-objdump -d - | FileCheck %s -check-prefix=ENCODING
+
+; bnezc and beqzc have restriction that $rt != 0
+
+define i32 @f() {
+; CHECK-LABEL: f
+; CHECK-NOT: bnezc $0
+
+ %cmp = icmp eq i32 1, 1
+ br i1 %cmp, label %if.then, label %if.end
+
+ if.then:
+ ret i32 1
+
+ if.end:
+ ret i32 0
+}
+
+define i32 @f1() {
+; CHECK-LABEL: f1
+; CHECK-NOT: beqzc $0
+
+ %cmp = icmp eq i32 0, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+ if.then:
+ ret i32 1
+
+ if.end:
+ ret i32 0
+}
+
+; We silently fixup cases where the register allocator or user has given us
+; an instruction with incorrect operands that is trivially acceptable.
+; beqc and bnec have the restriction that $rs < $rt.
+
+define i32 @f2(i32 %a, i32 %b) {
+; ENCODING-LABEL: f2
+; ENCODING-NOT: beqc $5, $4
+; ENCODING-NOT: bnec $5, $4
+
+ %cmp = icmp eq i32 %b, %a
+ br i1 %cmp, label %if.then, label %if.end
+
+ if.then:
+ ret i32 1
+
+ if.end:
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/const-mult.ll b/test/CodeGen/Mips/const-mult.ll
index 60b2a88196bd..7f4e896572a6 100644
--- a/test/CodeGen/Mips/const-mult.ll
+++ b/test/CodeGen/Mips/const-mult.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=CHECK
-; RUN: llc -march=mips64el < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK64
+; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mips64el < %s | FileCheck %s -check-prefixes=CHECK,CHECK64
; CHECK-LABEL: mul5_32:
; CHECK: sll $[[R0:[0-9]+]], $4, 2
diff --git a/test/CodeGen/Mips/const4a.ll b/test/CodeGen/Mips/const4a.ll
index d1182d7fc6ec..df4a1d9ff9ae 100644
--- a/test/CodeGen/Mips/const4a.ll
+++ b/test/CodeGen/Mips/const4a.ll
@@ -21,7 +21,7 @@ entry:
; no-load-relax: beqz ${{[0-9]+}}, $BB0_3
; no-load-relax: lw ${{[0-9]+}}, %call16(foo)(${{[0-9]+}})
; no-load-relax: b $BB0_4
-; no-load-relax: .align 2
+; no-load-relax: .p2align 2
; no-load-relax: $CPI0_1:
; no-load-relax: .4byte 3735943886
; no-load-relax: $BB0_3:
diff --git a/test/CodeGen/Mips/const6.ll b/test/CodeGen/Mips/const6.ll
index c576f573a43b..9085e38b3f60 100644
--- a/test/CodeGen/Mips/const6.ll
+++ b/test/CodeGen/Mips/const6.ll
@@ -18,7 +18,7 @@ entry:
store i32 -559023410, i32* @i, align 4
; load-relax: lw ${{[0-9]+}}, $CPI0_0
; load-relax: jrc $ra
-; load-relax: .align 2
+; load-relax: .p2align 2
; load-relax: $CPI0_0:
; load-relax: .4byte 3735943886
; load-relax: .end t
@@ -26,7 +26,7 @@ entry:
; no-load-relax: lw ${{[0-9]+}}, $CPI0_1 # 16 bit inst
; no-load-relax: jalrc ${{[0-9]+}}
; no-load-relax: b $BB0_2
-; no-load-relax: .align 2
+; no-load-relax: .p2align 2
; no-load-relax: $CPI0_1:
; no-load-relax: .4byte 3735943886
; no-load-relax: $BB0_2:
diff --git a/test/CodeGen/Mips/const6a.ll b/test/CodeGen/Mips/const6a.ll
index 653cdeb920f3..80eedb4c897d 100644
--- a/test/CodeGen/Mips/const6a.ll
+++ b/test/CodeGen/Mips/const6a.ll
@@ -15,7 +15,7 @@ entry:
; load-relax-NOT: lw ${{[0-9]+}}, $CPI0_0 # 16 bit inst
; load-relax1: lw ${{[0-9]+}}, $CPI0_0
; load-relax: jrc $ra
-; load-relax: .align 2
+; load-relax: .p2align 2
; load-relax: $CPI0_0:
; load-relax: .4byte 3735943886
; load-relax: .end t
diff --git a/test/CodeGen/Mips/countleading.ll b/test/CodeGen/Mips/countleading.ll
index b7aad049e8ab..1b61be5ed2ac 100644
--- a/test/CodeGen/Mips/countleading.ll
+++ b/test/CodeGen/Mips/countleading.ll
@@ -1,10 +1,11 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R1-R2 -check-prefix=MIPS32-GT-R1 %s
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R1-R2 -check-prefix=MIPS32-GT-R1 %s
-; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R6 -check-prefix=MIPS32-GT-R1 %s
-; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS4 %s
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s
-; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s
-; R!N: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefixes=ALL,MIPS32-R1-R2,MIPS32-GT-R1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefixes=ALL,MIPS32-R1-R2,MIPS32-GT-R1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck -check-prefixes=ALL,MIPS32-R6,MIPS32-GT-R1 %s
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck -check-prefixes=ALL,MIPS4 %s
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck -check-prefixes=ALL,MIPS64-GT-R1 %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck -check-prefixes=ALL,MIPS64-GT-R1 %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck -check-prefixes=ALL,MIPS64-GT-R1 %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 -mattr=micromips < %s | FileCheck -check-prefixes=ALL,MICROMIPS64 %s
; Prefixes:
; ALL - All
@@ -21,6 +22,8 @@ entry:
; MIPS64-GT-R1: clz $2, $4
+; MICROMIPS64: clz $2, $4
+
%tmp1 = tail call i32 @llvm.ctlz.i32(i32 %X, i1 true)
ret i32 %tmp1
}
@@ -37,6 +40,8 @@ entry:
; MIPS64-GT-R1: clo $2, $4
+; MICROMIPS64: clo $2, $4
+
%neg = xor i32 %X, -1
%tmp1 = tail call i32 @llvm.ctlz.i32(i32 %neg, i1 true)
ret i32 %tmp1
@@ -58,6 +63,7 @@ entry:
; MIPS32-GT-R1-DAG: addiu $3, $zero, 0
; MIPS64-GT-R1: dclz $2, $4
+; MICROMIPS64: dclz $2, $4
%tmp1 = tail call i64 @llvm.ctlz.i64(i64 %X, i1 true)
ret i64 %tmp1
@@ -83,6 +89,7 @@ entry:
; MIPS32-GT-R1-DAG: addiu $3, $zero, 0
; MIPS64-GT-R1: dclo $2, $4
+; MICROMIPS64: dclo $2, $4
%neg = xor i64 %X, -1
%tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true)
diff --git a/test/CodeGen/Mips/cstmaterialization/stack.ll b/test/CodeGen/Mips/cstmaterialization/stack.ll
new file mode 100644
index 000000000000..7266d00069cc
--- /dev/null
+++ b/test/CodeGen/Mips/cstmaterialization/stack.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=CHECK-MIPS32
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-MIPS64
+; RUN: llc -march=mipsel -mcpu=mips64 -target-abi n32 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-MIPSN32
+
+; Test that the expansion of ADJCALLSTACKDOWN and ADJCALLSTACKUP generate
+; (d)subu and (d)addu rather than just (d)addu. The (d)subu sequences are
+; generally shorter as the constant that has to be materialized is smaller.
+
+define i32 @main() {
+entry:
+ %z = alloca [1048576 x i8], align 1
+ %arraydecay = getelementptr inbounds [1048576 x i8], [1048576 x i8]* %z, i32 0, i32 0
+ %call = call i32 @foo(i8* %arraydecay)
+ ret i32 0
+; CHECK-LABEL: main
+
+; CHECK-MIPS32: lui $[[R0:[0-9]+]], 16
+; CHECK-MIPS32: addiu $[[R0]], $[[R0]], 24
+; CHECK-MIPS32: subu $sp, $sp, $[[R0]]
+
+; CHECK-MIPS32: lui $[[R1:[0-9]+]], 16
+; CHECK-MIPS32: addiu $[[R1]], $[[R1]], 24
+; CHECK-MIPS32: addu $sp, $sp, $[[R1]]
+
+; CHECK-MIPS64: lui $[[R0:[0-9]+]], 1
+; CHECK-MIPS64: daddiu $[[R0]], $[[R0]], 32
+; CHECK-MIPS64: dsubu $sp, $sp, $[[R0]]
+
+; FIXME:
+; These are here to match other lui's used in address computations. We need to
+; investigate why address computations are not CSE'd. Or implement it.
+
+; CHECK-MIPS64: lui
+; CHECK-MIPS64: lui
+; CHECK-MIPS64: lui
+; CHECK-MIPS64: lui
+
+; CHECK-MIPS64: lui $[[R1:[0-9]+]], 16
+; CHECK-MIPS64: daddiu $[[R1]], $[[R1]], 32
+; CHECK-MIPS64: daddu $sp, $sp, $[[R1]]
+
+; CHECK-MIPSN32: lui $[[R0:[0-9]+]], 16
+; CHECK-MIPSN32: addiu $[[R0]], $[[R0]], 16
+; CHECK-MIPSN32: subu $sp, $sp, $[[R0]]
+
+; CHECK-MIPSN32: lui $[[R1:[0-9]+]], 16
+; CHECK-MIPSN32: addiu $[[R1]], $[[R1]], 16
+; CHECK-MIPSN32: addu $sp, $sp, $[[R1]]
+
+}
+
+declare i32 @foo(i8*)
diff --git a/test/CodeGen/Mips/divrem.ll b/test/CodeGen/Mips/divrem.ll
index 918db053f5b6..7703a8cecf5c 100644
--- a/test/CodeGen/Mips/divrem.ll
+++ b/test/CodeGen/Mips/divrem.ll
@@ -1,16 +1,16 @@
-; RUN: llc -march=mips -mcpu=mips32 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=ACC32-TRAP
-; RUN: llc -march=mips -mcpu=mips32r2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=ACC32-TRAP
-; RUN: llc -march=mips -mcpu=mips32r6 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=GPR32-TRAP
-; RUN: llc -march=mips64 -mcpu=mips64 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=ACC64-TRAP
-; RUN: llc -march=mips64 -mcpu=mips64r2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=ACC64-TRAP
-; RUN: llc -march=mips64 -mcpu=mips64r6 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=GPR64-TRAP
-
-; RUN: llc -march=mips -mcpu=mips32 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=NOCHECK
-; RUN: llc -march=mips -mcpu=mips32r2 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=NOCHECK
-; RUN: llc -march=mips -mcpu=mips32r6 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=NOCHECK
-; RUN: llc -march=mips64 -mcpu=mips64 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=NOCHECK
-; RUN: llc -march=mips64 -mcpu=mips64r2 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=NOCHECK
-; RUN: llc -march=mips64 -mcpu=mips64r6 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=NOCHECK
+; RUN: llc -march=mips -mcpu=mips32 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,ACC32,ACC32-TRAP
+; RUN: llc -march=mips -mcpu=mips32r2 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,ACC32,ACC32-TRAP
+; RUN: llc -march=mips -mcpu=mips32r6 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,GPR32,GPR32-TRAP
+; RUN: llc -march=mips64 -mcpu=mips64 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,ACC64,ACC64-TRAP
+; RUN: llc -march=mips64 -mcpu=mips64r2 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,ACC64,ACC64-TRAP
+; RUN: llc -march=mips64 -mcpu=mips64r6 -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,GPR64,GPR64-TRAP
+
+; RUN: llc -march=mips -mcpu=mips32 -mno-check-zero-division -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,ACC32,NOCHECK
+; RUN: llc -march=mips -mcpu=mips32r2 -mno-check-zero-division -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,ACC32,NOCHECK
+; RUN: llc -march=mips -mcpu=mips32r6 -mno-check-zero-division -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,GPR32,NOCHECK
+; RUN: llc -march=mips64 -mcpu=mips64 -mno-check-zero-division -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,ACC64,NOCHECK
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mno-check-zero-division -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,ACC64,NOCHECK
+; RUN: llc -march=mips64 -mcpu=mips64r6 -mno-check-zero-division -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,GPR64,NOCHECK
; FileCheck Prefixes:
; ALL - All targets
@@ -81,7 +81,7 @@ entry:
ret i32 %rem
}
-define i32 @udiv1(i32 zeroext %a0, i32 zeroext %a1) nounwind readnone {
+define i32 @udiv1(i32 signext %a0, i32 signext %a1) nounwind readnone {
entry:
; ALL-LABEL: udiv1:
@@ -107,7 +107,7 @@ entry:
ret i32 %div
}
-define i32 @urem1(i32 zeroext %a0, i32 zeroext %a1) nounwind readnone {
+define i32 @urem1(i32 signext %a0, i32 signext %a1) nounwind readnone {
entry:
; ALL-LABEL: urem1:
@@ -152,20 +152,20 @@ entry:
; ACC64: mfhi $[[R0:[0-9]+]]
; ACC64: sw $[[R0]], 0(${{[0-9]+}})
-; GPR32: mod $[[R0:[0-9]+]], $4, $5
+; GPR32-DAG: div $2, $4, $5
; GPR32-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
-; GPR32: sw $[[R0]], 0(${{[0-9]+}})
-; GPR32-DAG: div $2, $4, $5
+; GPR32-DAG: mod $[[R0:[0-9]+]], $4, $5
; GPR32-TRAP: teq $5, $zero, 7
+; GPR32: sw $[[R0]], 0(${{[0-9]+}})
-; GPR64: mod $[[R0:[0-9]+]], $4, $5
+; GPR64-DAG: div $2, $4, $5
; GPR64-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
-; GPR64: sw $[[R0]], 0(${{[0-9]+}})
-; GPR64-DAG: div $2, $4, $5
+; GPR64-DAG: mod $[[R0:[0-9]+]], $4, $5
; GPR64-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
+; GPR64: sw $[[R0]], 0(${{[0-9]+}})
; ALL: .end sdivrem1
@@ -175,7 +175,7 @@ entry:
ret i32 %div
}
-define i32 @udivrem1(i32 zeroext %a0, i32 zeroext %a1, i32* nocapture %r) nounwind {
+define i32 @udivrem1(i32 signext %a0, i32 signext %a1, i32* nocapture %r) nounwind {
entry:
; ALL-LABEL: udivrem1:
@@ -193,21 +193,21 @@ entry:
; ACC64: mfhi $[[R0:[0-9]+]]
; ACC64: sw $[[R0]], 0(${{[0-9]+}})
-; GPR32: modu $[[R0:[0-9]+]], $4, $5
+; GPR32-DAG: divu $2, $4, $5
; GPR32-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
-; GPR32: sw $[[R0]], 0(${{[0-9]+}})
-; GPR32-DAG: divu $2, $4, $5
+; GPR32-DAG: modu $[[R0:[0-9]+]], $4, $5
; GPR32-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
+; GPR32: sw $[[R0]], 0(${{[0-9]+}})
-; GPR64: modu $[[R0:[0-9]+]], $4, $5
+; GPR64-DAG: divu $2, $4, $5
; GPR64-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
-; GPR64: sw $[[R0]], 0(${{[0-9]+}})
-; GPR64-DAG: divu $2, $4, $5
+; GPR64-DAG: modu $[[R0:[0-9]+]], $4, $5
; GPR64-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
+; GPR64: sw $[[R0]], 0(${{[0-9]+}})
; ALL: .end udivrem1
@@ -335,14 +335,14 @@ entry:
; ACC64: mfhi $[[R0:[0-9]+]]
; ACC64: sd $[[R0]], 0(${{[0-9]+}})
-; GPR64: dmod $[[R0:[0-9]+]], $4, $5
+; GPR64-DAG: ddiv $2, $4, $5
; GPR64-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
-; GPR64: sd $[[R0]], 0(${{[0-9]+}})
-; GPR64-DAG: ddiv $2, $4, $5
+; GPR64-DAG: dmod $[[R0:[0-9]+]], $4, $5
; GPR64-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
+; GPR64: sd $[[R0]], 0(${{[0-9]+}})
; ALL: .end sdivrem2
@@ -370,14 +370,14 @@ entry:
; ACC64: mfhi $[[R0:[0-9]+]]
; ACC64: sd $[[R0]], 0(${{[0-9]+}})
-; GPR64: dmodu $[[R0:[0-9]+]], $4, $5
+; GPR64-DAG: ddivu $2, $4, $5
; GPR64-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
-; GPR64: sd $[[R0]], 0(${{[0-9]+}})
-; GPR64-DAG: ddivu $2, $4, $5
+; GPR64: dmodu $[[R0:[0-9]+]], $4, $5
; GPR64-TRAP: teq $5, $zero, 7
; NOCHECK-NOT: teq
+; GPR64: sd $[[R0]], 0(${{[0-9]+}})
; ALL: .end udivrem2
diff --git a/test/CodeGen/Mips/dsp-r1.ll b/test/CodeGen/Mips/dsp-r1.ll
index fbd970399640..edd6258270a0 100644
--- a/test/CodeGen/Mips/dsp-r1.ll
+++ b/test/CodeGen/Mips/dsp-r1.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+dsp < %s | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+dsp -verify-machineinstrs < %s | \
+; RUN: FileCheck %s
define i32 @test__builtin_mips_extr_w1(i32 %i0, i32, i64 %a0) nounwind {
entry:
diff --git a/test/CodeGen/Mips/dynamic-stack-realignment.ll b/test/CodeGen/Mips/dynamic-stack-realignment.ll
index 777930a37ad5..2ee1ca2e0dfc 100644
--- a/test/CodeGen/Mips/dynamic-stack-realignment.ll
+++ b/test/CodeGen/Mips/dynamic-stack-realignment.ll
@@ -1,21 +1,21 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP32
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N64
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N64
-; RUN: llc < %s -march=mips64 -mcpu=mips3 -target-abi n32 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N32
-; RUN: llc < %s -march=mips64 -mcpu=mips64 -target-abi n32 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N32
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -target-abi n32 | FileCheck %s \
-; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N32
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP64,N64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP64,N64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP64,N64
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -target-abi n32 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP64,N32
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -target-abi n32 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP64,N32
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -target-abi n32 -relocation-model=pic | FileCheck %s \
+; RUN: --check-prefixes=ALL,GP64,N32
; Check dynamic stack realignment in functions without variable-sized objects.
diff --git a/test/CodeGen/Mips/eh-dwarf-cfa.ll b/test/CodeGen/Mips/eh-dwarf-cfa.ll
index 6554974bf849..c4019c78d69e 100644
--- a/test/CodeGen/Mips/eh-dwarf-cfa.ll
+++ b/test/CodeGen/Mips/eh-dwarf-cfa.ll
@@ -13,6 +13,8 @@ entry:
%0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
ret i8* %0
+; CHECK-LABEL: f1:
+
; CHECK: addiu $sp, $sp, -32
; CHECK: addiu $2, $sp, 32
}
@@ -24,10 +26,12 @@ entry:
%0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
ret i8* %0
+; CHECK-LABEL: f2:
+
; check stack size (65536 + 8)
-; CHECK: lui $[[R0:[a-z0-9]+]], 65535
-; CHECK: addiu $[[R0]], $[[R0]], -8
-; CHECK: addu $sp, $sp, $[[R0]]
+; CHECK: lui $[[R0:[a-z0-9]+]], 1
+; CHECK: addiu $[[R0]], $[[R0]], 8
+; CHECK: subu $sp, $sp, $[[R0]]
; check return value ($sp + stack size)
; CHECK: lui $[[R1:[a-z0-9]+]], 1
@@ -46,6 +50,8 @@ entry:
%add = add i32 %1, %3
ret i32 %add
+; CHECK-LABEL: f3:
+
; CHECK: addiu $sp, $sp, -40
; check return value ($fp + stack size + $fp)
@@ -60,6 +66,8 @@ entry:
%0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
ret i8* %0
+; CHECK-LABEL: f4:
+
; CHECK-MIPS64: daddiu $sp, $sp, -32
; CHECK-MIPS64: daddiu $2, $sp, 32
}
diff --git a/test/CodeGen/Mips/eh-return32.ll b/test/CodeGen/Mips/eh-return32.ll
index 542c5bf4462e..a11a43cb406e 100644
--- a/test/CodeGen/Mips/eh-return32.ll
+++ b/test/CodeGen/Mips/eh-return32.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=mipsel -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
-; RUN: llc -march=mipsel -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
-; RUN: llc -march=mipsel -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=R6
+; RUN: llc -march=mipsel -mcpu=mips32 -asm-show-inst -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,NOT-R6
+; RUN: llc -march=mipsel -mcpu=mips32r2 -asm-show-inst -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,NOT-R6
+; RUN: llc -march=mipsel -mcpu=mips32r6 -asm-show-inst -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,R6
declare void @llvm.eh.return.i32(i32, i8*)
declare void @foo(...)
diff --git a/test/CodeGen/Mips/eh-return64.ll b/test/CodeGen/Mips/eh-return64.ll
index 2f8203d77c84..496e3abcf9c5 100644
--- a/test/CodeGen/Mips/eh-return64.ll
+++ b/test/CodeGen/Mips/eh-return64.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=mips64el -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
-; RUN: llc -march=mips64el -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
-; RUN: llc -march=mips64el -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
-; RUN: llc -march=mips64el -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=R6
+; RUN: llc -march=mips64el -mcpu=mips4 -asm-show-inst -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,NOT-R6
+; RUN: llc -march=mips64el -mcpu=mips64 -asm-show-inst -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,NOT-R6
+; RUN: llc -march=mips64el -mcpu=mips64r2 -asm-show-inst -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,NOT-R6
+; RUN: llc -march=mips64el -mcpu=mips64r6 -asm-show-inst -relocation-model=pic < %s | FileCheck %s -check-prefixes=CHECK,R6
declare void @llvm.eh.return.i64(i64, i8*)
declare void @foo(...)
diff --git a/test/CodeGen/Mips/eh.ll b/test/CodeGen/Mips/eh.ll
index 19f3d4d23d64..2f843d9da9a6 100644
--- a/test/CodeGen/Mips/eh.ll
+++ b/test/CodeGen/Mips/eh.ll
@@ -24,7 +24,7 @@ entry:
lpad: ; preds = %entry
; CHECK-EL: # %lpad
-; CHECK-EL: beq $5
+; CHECK-EL: bne $5
%exn.val = landingpad { i8*, i32 }
cleanup
diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll
index a51cfb7e0fcd..d6d47678590a 100644
--- a/test/CodeGen/Mips/ehframe-indirect.ll
+++ b/test/CodeGen/Mips/ehframe-indirect.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=mipsel-linux-gnu < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=O32 %s
-; RUN: llc -mtriple=mipsel-linux-android < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=O32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu -target-abi=n32 < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-android -target-abi=n32 < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=N32 %s
-; RUN: llc -mtriple=mips64el-linux-gnu < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=N64 %s
-; RUN: llc -mtriple=mips64el-linux-android < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=N64 %s
+; RUN: llc -mtriple=mipsel-linux-gnu < %s -asm-verbose -relocation-model=pic | FileCheck -check-prefixes=ALL,O32 %s
+; RUN: llc -mtriple=mipsel-linux-android < %s -asm-verbose -relocation-model=pic | FileCheck -check-prefixes=ALL,O32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -target-abi=n32 < %s -asm-verbose -relocation-model=pic | FileCheck -check-prefixes=ALL,N32 %s
+; RUN: llc -mtriple=mips64el-linux-android -target-abi=n32 < %s -asm-verbose -relocation-model=pic | FileCheck -check-prefixes=ALL,N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu < %s -asm-verbose -relocation-model=pic | FileCheck -check-prefixes=ALL,N64 %s
+; RUN: llc -mtriple=mips64el-linux-android < %s -asm-verbose -relocation-model=pic | FileCheck -check-prefixes=ALL,N64 %s
@_ZTISt9exception = external constant i8*
@@ -42,9 +42,9 @@ declare void @foo()
; ALL: .hidden DW.ref.__gxx_personality_v0
; ALL: .weak DW.ref.__gxx_personality_v0
; ALL: .section .data.DW.ref.__gxx_personality_v0,"aGw",@progbits,DW.ref.__gxx_personality_v0,comdat
-; O32: .align 2
-; N32: .align 2
-; N64: .align 3
+; O32: .p2align 2
+; N32: .p2align 2
+; N64: .p2align 3
; ALL: .type DW.ref.__gxx_personality_v0,@object
; O32: .size DW.ref.__gxx_personality_v0, 4
; N32: .size DW.ref.__gxx_personality_v0, 4
diff --git a/test/CodeGen/Mips/elf_eflags.ll b/test/CodeGen/Mips/elf_eflags.ll
index 00d8584fdad2..40910d8987d2 100644
--- a/test/CodeGen/Mips/elf_eflags.ll
+++ b/test/CodeGen/Mips/elf_eflags.ll
@@ -23,13 +23,13 @@
; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MICROMIPS %s
; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MICROMIPS_PIC %s
-; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64 %s
-; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips4 %s -o - | FileCheck -check-prefix=CHECK-LE64_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips4 -target-abi n64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips4 -target-abi n64 %s -o - | FileCheck -check-prefix=CHECK-LE64_PIC %s
-; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64 %s
-; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 %s -o - | FileCheck -check-prefix=CHECK-LE64_PIC %s
-; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64R2 %s
-; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 %s -o - | FileCheck -check-prefix=CHECK-LE64R2_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 -target-abi n64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 -target-abi n64 %s -o - | FileCheck -check-prefix=CHECK-LE64_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 -target-abi n64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-LE64R2 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 -target-abi n64 %s -o - | FileCheck -check-prefix=CHECK-LE64R2_PIC %s
; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+mips16 -relocation-model=pic %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MIPS16 %s
diff --git a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
index 54092b4e3ebe..a08b68149a70 100644
--- a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
+++ b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
@@ -1,5 +1,5 @@
; Check that register scavenging spill slot is close to $fp.
-; RUN: llc -march=mipsel -O0 < %s | FileCheck %s
+; RUN: llc -march=mipsel -O0 -relocation-model=pic < %s | FileCheck %s
; CHECK: sw ${{.*}}, 8($sp)
; CHECK: lw ${{.*}}, 8($sp)
diff --git a/test/CodeGen/Mips/emutls_generic.ll b/test/CodeGen/Mips/emutls_generic.ll
index a6cf23aa67ff..cda8dec7d12f 100644
--- a/test/CodeGen/Mips/emutls_generic.ll
+++ b/test/CodeGen/Mips/emutls_generic.ll
@@ -31,13 +31,13 @@ entry:
; MIPS_32-NOT: __emutls_t.external_x
; MIPS_32-NOT: __emutls_v.external_x:
; MIPS_32: .data
-; MIPS_32: .align 2
+; MIPS_32: .p2align 2
; MIPS_32-LABEL: __emutls_v.external_y:
; MIPS_32: .section .rodata,
; MIPS_32-LABEL: __emutls_t.external_y:
; MIPS_32-NEXT: .byte 7
; MIPS_32: .data
-; MIPS_32: .align 2
+; MIPS_32: .p2align 2
; MIPS_32-LABEL: __emutls_v.internal_y:
; MIPS_32-NEXT: .4byte 8
; MIPS_32-NEXT: .4byte 16
@@ -59,7 +59,7 @@ entry:
; MIPS_64-LABEL: __emutls_t.external_y:
; MIPS_64-NEXT: .byte 7
; MIPS_64: .data
-; MIPS_64: .align 3
+; MIPS_64: .p2align 3
; MIPS_64-LABEL: __emutls_v.internal_y:
; MIPS_64-NEXT: .8byte 8
; MIPS_64-NEXT: .8byte 16
diff --git a/test/CodeGen/Mips/fastcc.ll b/test/CodeGen/Mips/fastcc.ll
index 299e0d696cbb..4b6191f9ae89 100644
--- a/test/CodeGen/Mips/fastcc.ll
+++ b/test/CodeGen/Mips/fastcc.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=mipsel | FileCheck %s
-; RUN: llc < %s -mtriple=mipsel-none-nacl-gnu \
+; RUN: llc < %s -march=mipsel -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -mtriple=mipsel-none-nacl-gnu -relocation-model=pic \
; RUN: | FileCheck %s -check-prefix=CHECK-NACL
-; RUN: llc < %s -march=mipsel -mcpu=mips32 -mattr=+nooddspreg | FileCheck %s -check-prefix=NOODDSPREG
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -mattr=+fp64,+nooddspreg | FileCheck %s -check-prefix=FP64-NOODDSPREG
+; RUN: llc < %s -march=mipsel -mcpu=mips32 -mattr=+nooddspreg -relocation-model=pic | FileCheck %s -check-prefix=NOODDSPREG
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -mattr=+fp64,+nooddspreg -relocation-model=pic | FileCheck %s -check-prefix=FP64-NOODDSPREG
@gi0 = external global i32
diff --git a/test/CodeGen/Mips/fcmp.ll b/test/CodeGen/Mips/fcmp.ll
index aa1f09bf7aba..142ee1144bbe 100644
--- a/test/CodeGen/Mips/fcmp.ll
+++ b/test/CodeGen/Mips/fcmp.ll
@@ -1,21 +1,35 @@
; RUN: llc < %s -march=mips -mcpu=mips32 | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32-C
+; RUN: FileCheck %s -check-prefixes=ALL,32-C
; RUN: llc < %s -march=mips -mcpu=mips32r2 | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32-C
+; RUN: FileCheck %s -check-prefixes=ALL,32-C
; RUN: llc < %s -march=mips -mcpu=mips32r6 | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32-CMP
+; RUN: FileCheck %s -check-prefixes=ALL,32-CMP
; RUN: llc < %s -march=mips64 -mcpu=mips4 | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=64-C
+; RUN: FileCheck %s -check-prefixes=ALL,64-C
; RUN: llc < %s -march=mips64 -mcpu=mips64 | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=64-C
+; RUN: FileCheck %s -check-prefixes=ALL,64-C
; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=64-C
+; RUN: FileCheck %s -check-prefixes=ALL,64-C
; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=64-CMP
+; RUN: FileCheck %s -check-prefixes=ALL,64-CMP
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32R3
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MMR6,MM32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MMR6,MM64R6
define i32 @false_f32(float %a, float %b) nounwind {
; ALL-LABEL: false_f32:
-; ALL: addiu $2, $zero, 0
+; 32-C: addiu $2, $zero, 0
+
+; 32-CMP: addiu $2, $zero, 0
+
+; 64-C: addiu $2, $zero, 0
+
+; 64-CMP: addiu $2, $zero, 0
+
+; MM-DAG: lui $2, 0
%1 = fcmp false float %a, %b
%2 = zext i1 %1 to i32
@@ -41,6 +55,16 @@ define i32 @oeq_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.eq.s $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp oeq float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -65,6 +89,16 @@ define i32 @ogt_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ule.s $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.lt.s $[[T0:f[0-9]+]], $f14, $f12
+; MM64R6-DAG: cmp.lt.s $[[T0:f[0-9]+]], $f13, $f12
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ogt float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -89,6 +123,16 @@ define i32 @oge_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ult.s $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.le.s $[[T0:f[0-9]+]], $f14, $f12
+; MM64R6-DAG: cmp.le.s $[[T0:f[0-9]+]], $f13, $f12
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp oge float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -113,6 +157,16 @@ define i32 @olt_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.olt.s $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.lt.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.lt.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp olt float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -137,6 +191,16 @@ define i32 @ole_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ole.s $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.le.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.le.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ole float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -163,6 +227,17 @@ define i32 @one_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
; 64-CMP-DAG: andi $2, $[[T2]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ueq.s $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; MMR6-DAG: andi16 $2, $[[T2]], 1
+
%1 = fcmp one float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -189,6 +264,17 @@ define i32 @ord_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
; 64-CMP-DAG: andi $2, $[[T2]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.un.s $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; MMR6-DAG: andi16 $2, $[[T2]], 1
+
%1 = fcmp ord float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -213,6 +299,16 @@ define i32 @ueq_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ueq.s $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ueq float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -237,6 +333,16 @@ define i32 @ugt_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ole.s $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f14, $f12
+; MM64R6-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f13, $f12
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ugt float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -261,6 +367,16 @@ define i32 @uge_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.olt.s $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f14, $f12
+; MM64R6-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f13, $f12
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp uge float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -285,6 +401,15 @@ define i32 @ult_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ult.s $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
%1 = fcmp ult float %a, %b
%2 = zext i1 %1 to i32
@@ -310,6 +435,16 @@ define i32 @ule_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ule.s $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ule float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -336,6 +471,17 @@ define i32 @une_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
; 64-CMP-DAG: andi $2, $[[T2]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.eq.s $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; MMR6-DAG: andi16 $2, $[[T2]], 1
+
%1 = fcmp une float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -360,6 +506,16 @@ define i32 @uno_f32(float %a, float %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.un.s $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp uno float %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -367,7 +523,15 @@ define i32 @uno_f32(float %a, float %b) nounwind {
define i32 @true_f32(float %a, float %b) nounwind {
; ALL-LABEL: true_f32:
-; ALL: addiu $2, $zero, 1
+; 32-C: addiu $2, $zero, 1
+
+; 32-CMP: addiu $2, $zero, 1
+
+; 64-C: addiu $2, $zero, 1
+
+; 64-CMP: addiu $2, $zero, 1
+
+; MM-DAG: li16 $2, 1
%1 = fcmp true float %a, %b
%2 = zext i1 %1 to i32
@@ -376,7 +540,15 @@ define i32 @true_f32(float %a, float %b) nounwind {
define i32 @false_f64(double %a, double %b) nounwind {
; ALL-LABEL: false_f64:
-; ALL: addiu $2, $zero, 0
+; 32-C: addiu $2, $zero, 0
+
+; 32-CMP: addiu $2, $zero, 0
+
+; 64-C: addiu $2, $zero, 0
+
+; 64-CMP: addiu $2, $zero, 0
+
+; MM-DAG: lui $2, 0
%1 = fcmp false double %a, %b
%2 = zext i1 %1 to i32
@@ -402,6 +574,16 @@ define i32 @oeq_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.eq.d $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp oeq double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -426,6 +608,16 @@ define i32 @ogt_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ule.d $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.lt.d $[[T0:f[0-9]+]], $f14, $f12
+; MM64R6-DAG: cmp.lt.d $[[T0:f[0-9]+]], $f13, $f12
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ogt double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -450,6 +642,16 @@ define i32 @oge_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ult.d $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.le.d $[[T0:f[0-9]+]], $f14, $f12
+; MM64R6-DAG: cmp.le.d $[[T0:f[0-9]+]], $f13, $f12
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp oge double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -474,6 +676,16 @@ define i32 @olt_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.olt.d $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.lt.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.lt.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp olt double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -498,6 +710,16 @@ define i32 @ole_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ole.d $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.le.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.le.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ole double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -524,6 +746,17 @@ define i32 @one_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
; 64-CMP-DAG: andi $2, $[[T2]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ueq.d $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; MMR6-DAG: andi16 $2, $[[T2]], 1
+
%1 = fcmp one double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -550,6 +783,17 @@ define i32 @ord_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
; 64-CMP-DAG: andi $2, $[[T2]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.un.d $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; MMR6-DAG: andi16 $2, $[[T2]], 1
+
%1 = fcmp ord double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -574,6 +818,16 @@ define i32 @ueq_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ueq.d $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ueq double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -598,6 +852,16 @@ define i32 @ugt_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ole.d $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f14, $f12
+; MM64R6-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f13, $f12
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ugt double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -622,6 +886,16 @@ define i32 @uge_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.olt.d $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f14, $f12
+; MM64R6-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f13, $f12
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp uge double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -646,6 +920,16 @@ define i32 @ult_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ult.d $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ult double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -670,6 +954,16 @@ define i32 @ule_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.ule.d $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp ule double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -696,6 +990,17 @@ define i32 @une_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
; 64-CMP-DAG: andi $2, $[[T2]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.eq.d $f12, $f14
+; MM32R3-DAG: movt $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; MMR6-DAG: andi16 $2, $[[T2]], 1
+
%1 = fcmp une double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -720,6 +1025,16 @@ define i32 @uno_f64(double %a, double %b) nounwind {
; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
; 64-CMP-DAG: andi $2, $[[T1]], 1
+; MM32R3-DAG: lui $[[T0:[0-9]+]], 0
+; MM32R3-DAG: li16 $[[T1:[0-9]+]], 1
+; MM32R3-DAG: c.un.d $f12, $f14
+; MM32R3-DAG: movf $[[T1]], $[[T0]], $fcc0
+
+; MM32R6-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f14
+; MM64R6-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f13
+; MMR6-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; MMR6-DAG: andi16 $2, $[[T1]], 1
+
%1 = fcmp uno double %a, %b
%2 = zext i1 %1 to i32
ret i32 %2
@@ -727,7 +1042,15 @@ define i32 @uno_f64(double %a, double %b) nounwind {
define i32 @true_f64(double %a, double %b) nounwind {
; ALL-LABEL: true_f64:
-; ALL: addiu $2, $zero, 1
+; 32-C: addiu $2, $zero, 1
+
+; 32-CMP: addiu $2, $zero, 1
+
+; 64-C: addiu $2, $zero, 1
+
+; 64-CMP: addiu $2, $zero, 1
+
+; MM-DAG: li16 $2, 1
%1 = fcmp true double %a, %b
%2 = zext i1 %1 to i32
@@ -750,7 +1073,7 @@ entry:
; 32-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]]
; FIXME: This instruction is redundant.
; 32-CMP-DAG: andi $[[T4:[0-9]+]], $[[T3]], 1
-; 32-CMP-DAG: bnez $[[T4]],
+; 32-CMP-DAG: bnezc $[[T4]],
; 64-C-DAG: add.s $[[T0:f[0-9]+]], $f13, $f12
; 64-C-DAG: lwc1 $[[T1:f[0-9]+]], %got_ofst($CPI32_0)(
@@ -763,7 +1086,32 @@ entry:
; 64-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]]
; FIXME: This instruction is redundant.
; 64-CMP-DAG: andi $[[T4:[0-9]+]], $[[T3]], 1
-; 64-CMP-DAG: bnez $[[T4]],
+; 64-CMP-DAG: bnezc $[[T4]],
+
+; MM32R3-DAG: add.s $[[T0:f[0-9]+]], $f14, $f12
+; MM32R3-DAG: lui $[[T1:[0-9]+]], %hi($CPI32_0)
+; MM32R3-DAG: lwc1 $[[T2:f[0-9]+]], %lo($CPI32_0)($[[T1]])
+; MM32R3-DAG: c.ole.s $[[T0]], $[[T2]]
+; MM32R3-DAG: bc1t
+
+; MM32R6-DAG: add.s $[[T0:f[0-9]+]], $f14, $f12
+; MM32R6-DAG: lui $[[T1:[0-9]+]], %hi($CPI32_0)
+; MM32R6-DAG: lwc1 $[[T2:f[0-9]+]], %lo($CPI32_0)($[[T1]])
+; MM32R6-DAG: cmp.le.s $[[T3:f[0-9]+]], $[[T0]], $[[T2]]
+; MM32R6-DAG: mfc1 $[[T4:[0-9]+]], $[[T3:f[0-9]+]]
+; MM32R6-DAG: andi16 $[[T5:[0-9]+]], $[[T4]], 1
+; MM32R6-DAG: bnez $[[T5]],
+
+; MM64R6-DAG: lui $[[T0:[0-9]+]], %hi(%neg(%gp_rel(bug1_f32)))
+; MM64R6-DAG: daddu $[[T1:[0-9]+]], $[[T0]], $25
+; MM64R6-DAG: daddiu $[[T2:[0-9]+]], $[[T1]], %lo(%neg(%gp_rel(bug1_f32)))
+; MM64R6-DAG: add.s $[[T3:f[0-9]+]], $f13, $f12
+; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page($CPI32_0)($[[T2]])
+; MM64R6-DAG: lwc1 $[[T5:f[0-9]+]], %got_ofst($CPI32_0)($[[T4]])
+; MM64R6-DAG: cmp.le.s $[[T6:f[0-9]+]], $[[T3]], $[[T5]]
+; MM64R6-DAG: mfc1 $[[T7:[0-9]+]], $[[T6]]
+; MM64R6-DAG: andi16 $[[T8:[0-9]+]], $[[T7]], 1
+; MM64R6-DAG: bnez $[[T8]],
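+; For n64 PIC, $gp is recomputed from $25 ($t9) via %hi/%lo of
+; %neg(%gp_rel(bug1_f32)), and the constant-pool value is then reached through
+; %got_page/%got_ofst, which is what the ld/lwc1 checks above verify.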
%add = fadd fast float %at, %angle
%cmp = fcmp ogt float %add, 1.000000e+00
@@ -794,7 +1142,7 @@ entry:
; 32-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]]
; FIXME: This instruction is redundant.
; 32-CMP-DAG: andi $[[T4:[0-9]+]], $[[T3]], 1
-; 32-CMP-DAG: bnez $[[T4]],
+; 32-CMP-DAG: bnezc $[[T4]],
; 64-C-DAG: add.d $[[T0:f[0-9]+]], $f13, $f12
; 64-C-DAG: ldc1 $[[T1:f[0-9]+]], %got_ofst($CPI33_0)(
@@ -807,7 +1155,32 @@ entry:
; 64-CMP-DAG: mfc1 $[[T3:[0-9]+]], $[[T2]]
; FIXME: This instruction is redundant.
; 64-CMP-DAG: andi $[[T4:[0-9]+]], $[[T3]], 1
-; 64-CMP-DAG: bnez $[[T4]],
+; 64-CMP-DAG: bnezc $[[T4]],
+
+; MM32R3-DAG: add.d $[[T0:f[0-9]+]], $f14, $f12
+; MM32R3-DAG: lui $[[T1:[0-9]+]], %hi($CPI33_0)
+; MM32R3-DAG: ldc1 $[[T2:f[0-9]+]], %lo($CPI33_0)($[[T1]])
+; MM32R3-DAG: c.ole.d $[[T0]], $[[T2]]
+; MM32R3-DAG: bc1t
+
+; MM32R6-DAG: add.d $[[T0:f[0-9]+]], $f14, $f12
+; MM32R6-DAG: lui $[[T1:[0-9]+]], %hi($CPI33_0)
+; MM32R6-DAG: ldc1 $[[T2:f[0-9]+]], %lo($CPI33_0)($[[T1]])
+; MM32R6-DAG: cmp.le.d $[[T3:f[0-9]+]], $[[T0]], $[[T2]]
+; MM32R6-DAG: mfc1 $[[T4:[0-9]+]], $[[T3]]
+; MM32R6-DAG: andi16 $[[T5:[0-9]+]], $[[T4]], 1
+; MM32R6-DAG: bnez $[[T5]],
+
+; MM64R6-DAG: lui $[[T0:[0-9]+]], %hi(%neg(%gp_rel(bug1_f64)))
+; MM64R6-DAG: daddu $[[T1:[0-9]+]], $[[T0]], $25
+; MM64R6-DAG: daddiu $[[T2:[0-9]+]], $[[T1]], %lo(%neg(%gp_rel(bug1_f64)))
+; MM64R6-DAG: add.d $[[T3:f[0-9]+]], $f13, $f12
+; MM64R6-DAG: ld $[[T4:[0-9]+]], %got_page($CPI33_0)($[[T2]])
+; MM64R6-DAG: ldc1 $[[T5:f[0-9]+]], %got_ofst($CPI33_0)($[[T4]])
+; MM64R6-DAG: cmp.le.d $[[T6:f[0-9]+]], $[[T3]], $[[T5]]
+; MM64R6-DAG: mfc1 $[[T7:[0-9]+]], $[[T6]]
+; MM64R6-DAG: andi16 $[[T8:[0-9]+]], $[[T7]], 1
+; MM64R6-DAG: bnez $[[T8]],
%add = fadd fast double %at, %angle
%cmp = fcmp ogt double %add, 1.000000e+00
diff --git a/test/CodeGen/Mips/fcopysign-f32-f64.ll b/test/CodeGen/Mips/fcopysign-f32-f64.ll
index 860bc79956fc..b1f065af0745 100644
--- a/test/CodeGen/Mips/fcopysign-f32-f64.ll
+++ b/test/CodeGen/Mips/fcopysign-f32-f64.ll
@@ -1,6 +1,9 @@
-; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi=n64 | FileCheck %s -check-prefix=64
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | FileCheck %s -check-prefix=64
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s -check-prefix=64R2
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi=n64 | \
+; RUN: FileCheck %s -check-prefixes=ALL,64
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | \
+; RUN: FileCheck %s -check-prefixes=ALL,64
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | \
+; RUN: FileCheck %s -check-prefixes=ALL,64R2
declare double @copysign(double, double) nounwind readnone
@@ -8,7 +11,8 @@ declare float @copysignf(float, float) nounwind readnone
define float @func2(float %d, double %f) nounwind readnone {
entry:
-; 64: func2
+; ALL-LABEL: func2:
+
; 64-DAG: lui $[[T0:[0-9]+]], 32767
; 64-DAG: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
; 64-DAG: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
@@ -18,7 +22,7 @@ entry:
; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[SLL1]]
; 64: mtc1 $[[OR]], $f0
-; 64R2: dext ${{[0-9]+}}, ${{[0-9]+}}, 63, 1
+; 64R2: dextu ${{[0-9]+}}, ${{[0-9]+}}, 63, 1
; 64R2: ins $[[INS:[0-9]+]], ${{[0-9]+}}, 31, 1
; 64R2: mtc1 $[[INS]], $f0
@@ -30,17 +34,18 @@ entry:
define double @func3(double %d, float %f) nounwind readnone {
entry:
-
-; 64: func3
-; 64-DAG: daddiu $[[T0:[0-9]+]], $zero, 1
-; 64-DAG: dsll $[[T1:[0-9]+]], $[[T0]], 63
-; 64-DAG: daddiu $[[MSK0:[0-9]+]], $[[T1]], -1
-; 64-DAG: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 64-DAG: srl $[[SRL:[0-9]+]], ${{[0-9]+}}, 31
-; 64-DAG: sll $[[SLL:[0-9]+]], $[[SRL]], 0
-; 64-DAG: dsll $[[DSLL:[0-9]+]], $[[SLL]], 63
-; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[DSLL]]
-; 64: dmtc1 $[[OR]], $f0
+; ALL-LABEL: func3:
+
+; 64-DAG: mfc1 $[[MFC:[0-9]+]], $f13
+; 64-DAG: srl $[[SRL:[0-9]+]], $[[MFC:[0-9]+]], 31
+; 64: dsll $[[DSLL:[0-9]+]], $[[SRL]], 63
+; 64-DAG: daddiu $[[R1:[0-9]+]], $zero, 1
+; 64-DAG: dsll $[[R2:[0-9]+]], $[[R1]], 63
+; 64-DAG: daddiu $[[R3:[0-9]+]], $[[R2]], -1
+; 64-DAG: dmfc1 $[[R0:[0-9]+]], ${{.*}}
+; 64: and $[[AND0:[0-9]+]], $[[R0]], $[[R3]]
+; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[DSLL]]
+; 64: dmtc1 $[[OR]], $f0
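+; The sign of the float argument is isolated with srl 31 and moved to bit 63
+; with dsll; the magnitude mask (1 << 63) - 1 is built with daddiu/dsll/daddiu,
+; and the two pieces are recombined with and/or before dmtc1.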
; 64R2: ext ${{[0-9]+}}, ${{[0-9]+}}, 31, 1
; 64R2: dins $[[INS:[0-9]+]], ${{[0-9]+}}, 63, 1
@@ -51,4 +56,3 @@ entry:
%call = tail call double @copysign(double %add, double %conv) nounwind readnone
ret double %call
}
-
diff --git a/test/CodeGen/Mips/fcopysign.ll b/test/CodeGen/Mips/fcopysign.ll
index 6928f2fe507f..ffc72a12f23d 100644
--- a/test/CodeGen/Mips/fcopysign.ll
+++ b/test/CodeGen/Mips/fcopysign.ll
@@ -27,7 +27,7 @@ entry:
; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
; 64: dmtc1 $[[OR]], $f0
-; 64R2: dext $[[EXT:[0-9]+]], ${{[0-9]+}}, 63, 1
+; 64R2: dextu $[[EXT:[0-9]+]], ${{[0-9]+}}, 63, 1
; 64R2: dins $[[INS:[0-9]+]], $[[EXT]], 63, 1
; 64R2: dmtc1 $[[INS]], $f0
diff --git a/test/CodeGen/Mips/fmadd1.ll b/test/CodeGen/Mips/fmadd1.ll
index 99d99fada1cf..c155eedd62c4 100644
--- a/test/CodeGen/Mips/fmadd1.ll
+++ b/test/CodeGen/Mips/fmadd1.ll
@@ -5,18 +5,18 @@
; IEEE 754 (1985) and IEEE 754 (2008). These instructions are therefore only
; available when -enable-no-nans-fp-math is given.
-; RUN: llc < %s -march=mipsel -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32 -check-prefix=32-NONAN
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32R2 -check-prefix=32R2-NONAN
-; RUN: llc < %s -march=mipsel -mcpu=mips32r6 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32R6 -check-prefix=32R6-NONAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64 -check-prefix=64-NONAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NONAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NONAN
-; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32 -check-prefix=32-NAN
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32R2 -check-prefix=32R2-NAN
-; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32R6 -check-prefix=32R6-NAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64 -check-prefix=64-NAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NAN
-; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -target-abi=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s -check-prefixes=ALL,32,32-NONAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefixes=ALL,32R2,32R2-NONAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 -enable-no-nans-fp-math | FileCheck %s -check-prefixes=ALL,32R6,32R6-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefixes=ALL,64,64-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefixes=ALL,64R2,64R2-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -target-abi=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefixes=ALL,64R6,64R6-NONAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefixes=ALL,32,32-NAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefixes=ALL,32R2,32R2-NAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefixes=ALL,32R6,32R6-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi=n64 | FileCheck %s -check-prefixes=ALL,64,64-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s -check-prefixes=ALL,64R2,64R2-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -target-abi=n64 | FileCheck %s -check-prefixes=ALL,64R6,64R6-NAN
define float @FOO0float(float %a, float %b, float %c) nounwind readnone {
entry:
diff --git a/test/CodeGen/Mips/fp-indexed-ls.ll b/test/CodeGen/Mips/fp-indexed-ls.ll
index 219ca99d3f94..87fb248e56fd 100644
--- a/test/CodeGen/Mips/fp-indexed-ls.ll
+++ b/test/CodeGen/Mips/fp-indexed-ls.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R1
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R2
-; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R6
-; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
-; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
-; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
-; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64R6
+; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS32R1
+; RUN: llc -march=mipsel -mcpu=mips32r2 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS32R2
+; RUN: llc -march=mipsel -mcpu=mips32r6 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS32R6
+; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS64R6
; Check that [ls][dwu]xc1 are not emitted for nacl.
; RUN: llc -mtriple=mipsel-none-nacl-gnu -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=CHECK-NACL
diff --git a/test/CodeGen/Mips/fp-spill-reload.ll b/test/CodeGen/Mips/fp-spill-reload.ll
index 4a53ad8c8e13..431389ae9acb 100644
--- a/test/CodeGen/Mips/fp-spill-reload.ll
+++ b/test/CodeGen/Mips/fp-spill-reload.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
; check that $fp is not reserved.
define void @foo0(i32* nocapture %b) nounwind {
diff --git a/test/CodeGen/Mips/fp16-promote.ll b/test/CodeGen/Mips/fp16-promote.ll
index 2ac46e028072..f060f6ab03ee 100644
--- a/test/CodeGen/Mips/fp16-promote.ll
+++ b/test/CodeGen/Mips/fp16-promote.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false -mtriple=mipsel-linux-gnueabi < %s | FileCheck %s -check-prefix=CHECK-LIBCALL
+; RUN: llc -asm-verbose=false -mtriple=mipsel-linux-gnueabi -relocation-model=pic < %s | FileCheck %s -check-prefix=CHECK-LIBCALL
; CHECK-LIBCALL-LABEL: test_fadd:
; CHECK-LIBCALL: %call16(__gnu_h2f_ieee)
diff --git a/test/CodeGen/Mips/fp64a.ll b/test/CodeGen/Mips/fp64a.ll
index fadce5cb748b..528a0c816842 100644
--- a/test/CodeGen/Mips/fp64a.ll
+++ b/test/CodeGen/Mips/fp64a.ll
@@ -11,14 +11,14 @@
; incorrectly using this case. We should fix those test cases then add
; this check here.
-; RUN: llc -march=mips -mcpu=mips32r2 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NO-FP64A-BE
-; RUN: llc -march=mips -mcpu=mips32r2 -mattr=fp64,nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FP64A
-; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NO-FP64A-LE
-; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fp64,nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FP64A
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=fp64 < %s | FileCheck %s -check-prefixes=ALL,32R2-NO-FP64A-BE
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=fp64,nooddspreg < %s | FileCheck %s -check-prefixes=ALL,32R2-FP64A
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fp64 < %s | FileCheck %s -check-prefixes=ALL,32R2-NO-FP64A-LE
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fp64,nooddspreg < %s | FileCheck %s -check-prefixes=ALL,32R2-FP64A
-; RUN: llc -march=mips64 -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NO-FP64A
+; RUN: llc -march=mips64 -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefixes=ALL,64-NO-FP64A
; RUN: not llc -march=mips64 -mcpu=mips64 -mattr=fp64,nooddspreg < %s 2>&1 | FileCheck %s -check-prefix=64-FP64A
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NO-FP64A
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefixes=ALL,64-NO-FP64A
; RUN: not llc -march=mips64el -mcpu=mips64 -mattr=fp64,nooddspreg < %s 2>&1 | FileCheck %s -check-prefix=64-FP64A
; 64-FP64A: LLVM ERROR: -mattr=+nooddspreg requires the O32 ABI.
diff --git a/test/CodeGen/Mips/fpbr.ll b/test/CodeGen/Mips/fpbr.ll
index 27d7094376e6..bf1b045dbf28 100644
--- a/test/CodeGen/Mips/fpbr.ll
+++ b/test/CodeGen/Mips/fpbr.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=32-FCC
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=32-FCC
-; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=32-GPR
-; RUN: llc < %s -march=mips64el -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=64-FCC
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=64-FCC
-; RUN: llc < %s -march=mips64el -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=64-GPR
+; RUN: llc < %s -march=mipsel -mcpu=mips32 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,FCC,32-FCC
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,FCC,32-FCC
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,GPR,32-GPR
+; RUN: llc < %s -march=mips64el -mcpu=mips64 | FileCheck %s -check-prefixes=ALL,FCC,64-FCC
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefixes=ALL,FCC,64-FCC
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 | FileCheck %s -check-prefixes=ALL,GPR,64-GPR
define void @func0(float %f2, float %f3) nounwind {
entry:
@@ -18,7 +18,8 @@ entry:
; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
; FIXME: We ought to be able to transform not+bnez -> beqz
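+; (The compare result in the GPR is 0 or all ones, so bnez on the inverted
+; value is equivalent to beqz on the original value.)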
; GPR: not $[[GPRCC]], $[[GPRCC]]
-; GPR: bnez $[[GPRCC]], $BB0_2
+; 32-GPR: bnez $[[GPRCC]], $BB0_2
+; 64-GPR: bnezc $[[GPRCC]], $BB0_2
%cmp = fcmp oeq float %f2, %f3
br i1 %cmp, label %if.then, label %if.else
@@ -51,7 +52,8 @@ entry:
; 64-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f13, $f12
; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
-; GPR: bnez $[[GPRCC]], $BB1_2
+; 32-GPR: bnez $[[GPRCC]], $BB1_2
+; 64-GPR: bnezc $[[GPRCC]], $BB1_2
%cmp = fcmp olt float %f2, %f3
br i1 %cmp, label %if.then, label %if.else
@@ -80,7 +82,8 @@ entry:
; 64-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f13, $f12
; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
-; GPR: beqz $[[GPRCC]], $BB2_2
+; 32-GPR: beqz $[[GPRCC]], $BB2_2
+; 64-GPR: beqzc $[[GPRCC]], $BB2_2
%cmp = fcmp ugt float %f2, %f3
br i1 %cmp, label %if.else, label %if.then
@@ -110,7 +113,8 @@ entry:
; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
; FIXME: We ought to be able to transform not+bnez -> beqz
; GPR: not $[[GPRCC]], $[[GPRCC]]
-; GPR: bnez $[[GPRCC]], $BB3_2
+; 32-GPR: bnez $[[GPRCC]], $BB3_2
+; 64-GPR: bnezc $[[GPRCC]], $BB3_2
%cmp = fcmp oeq double %f2, %f3
br i1 %cmp, label %if.then, label %if.else
@@ -139,7 +143,8 @@ entry:
; 64-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f13, $f12
; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
-; GPR: bnez $[[GPRCC]], $BB4_2
+; 32-GPR: bnez $[[GPRCC]], $BB4_2
+; 64-GPR: bnezc $[[GPRCC]], $BB4_2
%cmp = fcmp olt double %f2, %f3
br i1 %cmp, label %if.then, label %if.else
@@ -168,7 +173,8 @@ entry:
; 64-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f13, $f12
; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
-; GPR: beqz $[[GPRCC]], $BB5_2
+; 32-GPR: beqz $[[GPRCC]], $BB5_2
+; 64-GPR: beqzc $[[GPRCC]], $BB5_2
%cmp = fcmp ugt double %f2, %f3
br i1 %cmp, label %if.else, label %if.then
diff --git a/test/CodeGen/Mips/fpxx.ll b/test/CodeGen/Mips/fpxx.ll
index 5b42ecec53e8..6fdb95efe8ec 100644
--- a/test/CodeGen/Mips/fpxx.ll
+++ b/test/CodeGen/Mips/fpxx.ll
@@ -1,20 +1,20 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-NOFPXX
-; RUN: llc -march=mipsel -mcpu=mips32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-FPXX
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefixes=ALL,32-NOFPXX
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=fpxx < %s | FileCheck %s -check-prefixes=ALL,32-FPXX
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NOFPXX
-; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FPXX
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefixes=ALL,32R2-NOFPXX
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fpxx < %s | FileCheck %s -check-prefixes=ALL,32R2-FPXX
-; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-NOFPXX
+; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefixes=ALL,4-NOFPXX
; RUN: not llc -march=mips64 -mcpu=mips4 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=4-FPXX
-; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NOFPXX
+; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefixes=ALL,64-NOFPXX
; RUN: not llc -march=mips64 -mcpu=mips64 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=64-FPXX
-; RUN-TODO: llc -march=mips64 -mcpu=mips4 -target-abi o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-NOFPXX
-; RUN-TODO: llc -march=mips64 -mcpu=mips4 -target-abi o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-FPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips4 -target-abi o32 < %s | FileCheck %s -check-prefixes=ALL,4-O32-NOFPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips4 -target-abi o32 -mattr=fpxx < %s | FileCheck %s -check-prefixes=ALL,4-O32-FPXX
-; RUN-TODO: llc -march=mips64 -mcpu=mips64 -target-abi o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-NOFPXX
-; RUN-TODO: llc -march=mips64 -mcpu=mips64 -target-abi o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-FPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips64 -target-abi o32 < %s | FileCheck %s -check-prefixes=ALL,64-O32-NOFPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips64 -target-abi o32 -mattr=fpxx < %s | FileCheck %s -check-prefixes=ALL,64-O32-FPXX
declare double @dbl();
diff --git a/test/CodeGen/Mips/gpreg-lazy-binding.ll b/test/CodeGen/Mips/gpreg-lazy-binding.ll
index 800a74f5358f..98849ded7070 100644
--- a/test/CodeGen/Mips/gpreg-lazy-binding.ll
+++ b/test/CodeGen/Mips/gpreg-lazy-binding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | FileCheck %s
+; RUN: llc -march=mipsel -disable-mips-delay-filler -relocation-model=pic < %s | FileCheck %s
@g = external global i32
diff --git a/test/CodeGen/Mips/hf16call32.ll b/test/CodeGen/Mips/hf16call32.ll
index 9fc94cac5175..5159477f2c69 100644
--- a/test/CodeGen/Mips/hf16call32.ll
+++ b/test/CodeGen/Mips/hf16call32.ll
@@ -820,7 +820,7 @@ declare void @v_df_df(double, double) #1
declare float @sf_v() #1
; stel: .section .mips16.call.fp.sf_v,"ax",@progbits
; stel: .ent __call_stub_fp_sf_v
-; stel: move $18, $31
+; stel: move $18, $ra
; stel: jal sf_v
; stel: mfc1 $2, $f0
; stel: jr $18
@@ -898,7 +898,7 @@ declare float @sf_df_df(double, double) #1
declare double @df_v() #1
; stel: .section .mips16.call.fp.df_v,"ax",@progbits
; stel: .ent __call_stub_fp_df_v
-; stel: move $18, $31
+; stel: move $18, $ra
; stel: jal df_v
; stel: mfc1 $2, $f0
; stel: mfc1 $3, $f1
@@ -983,7 +983,7 @@ declare double @df_df_df(double, double) #1
declare { float, float } @sc_v() #1
; stel: .section .mips16.call.fp.sc_v,"ax",@progbits
; stel: .ent __call_stub_fp_sc_v
-; stel: move $18, $31
+; stel: move $18, $ra
; stel: jal sc_v
; stel: mfc1 $2, $f0
; stel: mfc1 $3, $f2
@@ -1004,7 +1004,7 @@ declare { float, float } @sc_sf(float) #1
declare { double, double } @dc_v() #1
; stel: .section .mips16.call.fp.dc_v,"ax",@progbits
; stel: .ent __call_stub_fp_dc_v
-; stel: move $18, $31
+; stel: move $18, $ra
; stel: jal dc_v
; stel: mfc1 $4, $f2
; stel: mfc1 $5, $f3
diff --git a/test/CodeGen/Mips/hf16call32_body.ll b/test/CodeGen/Mips/hf16call32_body.ll
index 1a04fd46f8bd..49ce181b015a 100644
--- a/test/CodeGen/Mips/hf16call32_body.ll
+++ b/test/CodeGen/Mips/hf16call32_body.ll
@@ -20,7 +20,8 @@ entry:
}
; stel: .section .mips16.fn.v_sf,"ax",@progbits
; stel: .ent __fn_stub_v_sf
-; stel: la $25, v_sf
+; stel: lui $25, %hi(v_sf)
+; stel: addiu $25, $25, %lo(v_sf)
; stel: mfc1 $4, $f12
; stel: jr $25
; stel: __fn_local_v_sf = v_sf
@@ -40,7 +41,8 @@ entry:
; stel: .section .mips16.fn.v_df,"ax",@progbits
; stel: .ent __fn_stub_v_df
-; stel: la $25, v_df
+; stel: lui $25, %hi(v_df)
+; stel: addiu $25, $25, %lo(v_df)
; stel: mfc1 $4, $f12
; stel: mfc1 $5, $f13
; stel: jr $25
@@ -63,7 +65,8 @@ entry:
; stel: .section .mips16.fn.v_sf_sf,"ax",@progbits
; stel: .ent __fn_stub_v_sf_sf
-; stel: la $25, v_sf_sf
+; stel: lui $25, %hi(v_sf_sf)
+; stel: addiu $25, $25, %lo(v_sf_sf)
; stel: mfc1 $4, $f12
; stel: mfc1 $5, $f14
; stel: jr $25
@@ -86,7 +89,8 @@ entry:
; stel: .section .mips16.fn.v_sf_df,"ax",@progbits
; stel: .ent __fn_stub_v_sf_df
-; stel: la $25, v_sf_df
+; stel: lui $25, %hi(v_sf_df)
+; stel: addiu $25, $25, %lo(v_sf_df)
; stel: mfc1 $4, $f12
; stel: mfc1 $6, $f14
; stel: mfc1 $7, $f15
@@ -110,7 +114,8 @@ entry:
; stel: .section .mips16.fn.v_df_sf,"ax",@progbits
; stel: .ent __fn_stub_v_df_sf
-; stel: la $25, v_df_sf
+; stel: lui $25, %hi(v_df_sf)
+; stel: addiu $25, $25, %lo(v_df_sf)
; stel: mfc1 $4, $f12
; stel: mfc1 $5, $f13
; stel: mfc1 $6, $f14
@@ -134,7 +139,8 @@ entry:
; stel: .section .mips16.fn.v_df_df,"ax",@progbits
; stel: .ent __fn_stub_v_df_df
-; stel: la $25, v_df_df
+; stel: lui $25, %hi(v_df_df)
+; stel: addiu $25, $25, %lo(v_df_df)
; stel: mfc1 $4, $f12
; stel: mfc1 $5, $f13
; stel: mfc1 $6, $f14
@@ -164,7 +170,8 @@ entry:
; stel: .section .mips16.fn.sf_sf,"ax",@progbits
; stel: .ent __fn_stub_sf_sf
-; stel: la $25, sf_sf
+; stel: lui $25, %hi(sf_sf)
+; stel: addiu $25, $25, %lo(sf_sf)
; stel: mfc1 $4, $f12
; stel: jr $25
; stel: __fn_local_sf_sf = sf_sf
@@ -184,7 +191,8 @@ entry:
; stel: .section .mips16.fn.sf_df,"ax",@progbits
; stel: .ent __fn_stub_sf_df
-; stel: la $25, sf_df
+; stel: lui $25, %hi(sf_df)
+; stel: addiu $25, $25, %lo(sf_df)
; stel: mfc1 $4, $f12
; stel: mfc1 $5, $f13
; stel: jr $25
@@ -208,7 +216,8 @@ entry:
; stel: .section .mips16.fn.sf_sf_sf,"ax",@progbits
; stel: .ent __fn_stub_sf_sf_sf
-; stel: la $25, sf_sf_sf
+; stel: lui $25, %hi(sf_sf_sf)
+; stel: addiu $25, $25, %lo(sf_sf_sf)
; stel: mfc1 $4, $f12
; stel: mfc1 $5, $f14
; stel: jr $25
@@ -232,7 +241,8 @@ entry:
; stel: .section .mips16.fn.sf_sf_df,"ax",@progbits
; stel: .ent __fn_stub_sf_sf_df
-; stel: la $25, sf_sf_df
+; stel: lui $25, %hi(sf_sf_df)
+; stel: addiu $25, $25, %lo(sf_sf_df)
; stel: mfc1 $4, $f12
; stel: mfc1 $6, $f14
; stel: mfc1 $7, $f15
@@ -257,7 +267,8 @@ entry:
; stel: .section .mips16.fn.sf_df_sf,"ax",@progbits
; stel: .ent __fn_stub_sf_df_sf
-; stel: la $25, sf_df_sf
+; stel: lui $25, %hi(sf_df_sf)
+; stel: addiu $25, $25, %lo(sf_df_sf)
; stel: mfc1 $4, $f12
; stel: mfc1 $5, $f13
; stel: mfc1 $6, $f14
@@ -282,7 +293,8 @@ entry:
; stel: .section .mips16.fn.sf_df_df,"ax",@progbits
; stel: .ent __fn_stub_sf_df_df
-; stel: la $25, sf_df_df
+; stel: lui $25, %hi(sf_df_df)
+; stel: addiu $25, $25, %lo(sf_df_df)
; stel: mfc1 $4, $f12
; stel: mfc1 $5, $f13
; stel: mfc1 $6, $f14
diff --git a/test/CodeGen/Mips/hf1_body.ll b/test/CodeGen/Mips/hf1_body.ll
index adf45109d69a..b6469716176f 100644
--- a/test/CodeGen/Mips/hf1_body.ll
+++ b/test/CodeGen/Mips/hf1_body.ll
@@ -1,4 +1,11 @@
-; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=picfp16
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 \
+; RUN: -relocation-model=pic -no-integrated-as < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,GAS
+
+; The integrated assembler expands assembly macros before printing.
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 \
+; RUN: -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,IAS
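+; With -no-integrated-as the unexpanded "la" macro is printed; the integrated
+; assembler expands it, and under PIC that becomes a GOT load of the symbol
+; plus an addiu of the low part, as the GAS/IAS checks below show.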
@x = external global float
@@ -11,11 +18,13 @@ entry:
store float %0, float* @x, align 4
ret void
}
-; picfp16: .ent __fn_stub_v_sf
-; picfp16: .cpload $25
-; picfp16: .set reorder
-; picfp16: .reloc 0, R_MIPS_NONE, v_sf
-; picfp16: la $25, $__fn_local_v_sf
-; picfp16: mfc1 $4, $f12
-; picfp16: jr $25
-; picfp16: .end __fn_stub_v_sf
+; ALL-LABEL: .ent __fn_stub_v_sf
+; ALL: .cpload $25
+; ALL: .set reorder
+; ALL: .reloc 0, R_MIPS_NONE, v_sf
+; GAS: la $25, $__fn_local_v_sf
+; IAS: lw $25, %got($$__fn_local_v_sf)($gp)
+; IAS: addiu $25, $25, %lo($$__fn_local_v_sf)
+; ALL: mfc1 $4, $f12
+; ALL: jr $25
+; ALL: .end __fn_stub_v_sf
diff --git a/test/CodeGen/Mips/i64arg.ll b/test/CodeGen/Mips/i64arg.ll
index 5b2d13518035..22a0c1f51f7d 100644
--- a/test/CodeGen/Mips/i64arg.ll
+++ b/test/CodeGen/Mips/i64arg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mips < %s | FileCheck %s
+; RUN: llc -march=mips -relocation-model=pic < %s | FileCheck %s
define void @f1(i64 %ll1, float %f, i64 %ll, i32 %i, float %f2) nounwind {
entry:
diff --git a/test/CodeGen/Mips/inlineasm-constraint_ZC_2.ll b/test/CodeGen/Mips/inlineasm-constraint_ZC_2.ll
new file mode 100644
index 000000000000..a99cb976eaa9
--- /dev/null
+++ b/test/CodeGen/Mips/inlineasm-constraint_ZC_2.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s --check-prefixes=ALL,R6
+; RUN: llc -march=mips -mcpu=mips64r6 -target-abi=n64 < %s | FileCheck %s --check-prefixes=ALL,R6
+; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s --check-prefixes=ALL,PRER6
+; RUN: llc -march=mips -mcpu=mips64 -target-abi=n64 < %s | FileCheck %s --check-prefixes=ALL,PRER6
+
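+; The ZC constraint requires a memory operand that is addressable with the same
+; addressing mode as ll/sc. R6 ll/sc only take a 9-bit signed offset, so the
+; address is first reduced to a plain base register (the addiu below); pre-R6
+; ll/sc accept a 16-bit offset, so the frame offset can be used directly.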
+
+%struct.anon = type { [63 x i32], i32, i32 }
+
+define i32 @Atomic() {
+; CHECK-LABEL: Atomic:
+entry:
+ %s = alloca %struct.anon, align 4
+ %0 = bitcast %struct.anon* %s to i8*
+ %count = getelementptr inbounds %struct.anon, %struct.anon* %s, i64 0, i32 1
+ store i32 0, i32* %count, align 4
+; R6: addiu $[[R0:[0-9a-z]+]], $sp, {{[0-9]+}}
+
+; ALL: #APP
+
+; R6: ll ${{[0-9a-z]+}}, 0($[[R0]])
+; R6: sc ${{[0-9a-z]+}}, 0($[[R0]])
+
+; PRER6: ll ${{[0-9a-z]+}}, {{[0-9]+}}(${{[0-9a-z]+}})
+; PRER6: sc ${{[0-9a-z]+}}, {{[0-9]+}}(${{[0-9a-z]+}})
+
+; ALL: #NO_APP
+
+ %1 = call { i32, i32 } asm sideeffect ".set push\0A.set noreorder\0A1:\0All $0, $2\0Aaddu $1, $0, $3\0Asc $1, $2\0Abeqz $1, 1b\0Aaddu $1, $0, $3\0A.set pop\0A", "=&r,=&r,=*^ZC,Ir,*^ZC,~{memory},~{$1}"(i32* %count, i32 10, i32* %count)
+ %asmresult1.i = extractvalue { i32, i32 } %1, 1
+ %cmp = icmp ne i32 %asmresult1.i, 10
+ %conv = zext i1 %cmp to i32
+ %call2 = call i32 @f(i32 signext %conv)
+ ret i32 %call2
+}
+
+declare i32 @f(i32 signext)
diff --git a/test/CodeGen/Mips/inlineasm-operand-code.ll b/test/CodeGen/Mips/inlineasm-operand-code.ll
index 6d41385d18de..6b46884e9af7 100644
--- a/test/CodeGen/Mips/inlineasm-operand-code.ll
+++ b/test/CodeGen/Mips/inlineasm-operand-code.ll
@@ -1,9 +1,15 @@
; Positive test for inline register constraints
;
-; RUN: llc -no-integrated-as -march=mipsel < %s | \
-; RUN: FileCheck -check-prefix=ALL -check-prefix=LE32 -check-prefix=GAS %s
-; RUN: llc -no-integrated-as -march=mips < %s | \
-; RUN: FileCheck -check-prefix=ALL -check-prefix=BE32 -check-prefix=GAS %s
+; RUN: llc -no-integrated-as -march=mipsel -relocation-model=pic < %s | \
+; RUN: FileCheck -check-prefixes=ALL,LE32,GAS %s
+; RUN: llc -no-integrated-as -march=mips -relocation-model=pic < %s | \
+; RUN: FileCheck -check-prefixes=ALL,BE32,GAS %s
+
+; IAS might not print in the same way since it parses the assembly.
+; RUN: llc -march=mipsel -relocation-model=pic < %s | \
+; RUN: FileCheck -check-prefixes=ALL,LE32,IAS %s
+; RUN: llc -march=mips -relocation-model=pic < %s | \
+; RUN: FileCheck -check-prefixes=ALL,BE32,IAS %s
%union.u_tag = type { i64 }
%struct.anon = type { i32, i32 }
@@ -15,6 +21,7 @@ entry:
; ALL-LABEL: constraint_X:
; ALL: #APP
; GAS: addiu ${{[0-9]+}}, ${{[0-9]+}}, 0xfffffffffffffffd
+; IAS: addiu ${{[0-9]+}}, ${{[0-9]+}}, -3
; ALL: #NO_APP
tail call i32 asm sideeffect "addiu $0, $1, ${2:X}", "=r,r,I"(i32 7, i32 -3) ;
ret i32 0
@@ -26,6 +33,9 @@ entry:
; ALL-LABEL: constraint_x:
; ALL: #APP
; GAS: addiu ${{[0-9]+}}, ${{[0-9]+}}, 0xfffd
+; This is _also_ -3 because uimm16 values are silently coerced to simm16 when
+; the operand would otherwise fail to match.
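+; (0xfffd is 65533 as a uimm16; reinterpreted as a simm16 it is
+; 65533 - 65536 = -3, so both printings denote the same 16-bit immediate.)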
+; IAS: addiu ${{[0-9]+}}, ${{[0-9]+}}, -3
; ALL: #NO_APP
tail call i32 asm sideeffect "addiu $0, $1, ${2:x}", "=r,r,I"(i32 7, i32 -3) ;
ret i32 0
@@ -54,39 +64,66 @@ entry:
}
; z with -3
-define i32 @constraint_z() nounwind {
+define void @constraint_z_0() nounwind {
entry:
-; ALL-LABEL: constraint_z:
+; ALL-LABEL: constraint_z_0:
; ALL: #APP
; ALL: addiu ${{[0-9]+}}, ${{[0-9]+}}, -3
; ALL: #NO_APP
tail call i32 asm sideeffect "addiu $0, $1, ${2:z}", "=r,r,I"(i32 7, i32 -3) ;
+ ret void
+}
; z with 0
+define void @constraint_z_1() nounwind {
+entry:
+; ALL-LABEL: constraint_z_1:
; ALL: #APP
-; GAS: addiu ${{[0-9]+}}, ${{[0-9]+}}, $0
+; GAS: addu ${{[0-9]+}}, ${{[0-9]+}}, $0
+; IAS: move ${{[0-9]+}}, ${{[0-9]+}}
; ALL: #NO_APP
- tail call i32 asm sideeffect "addiu $0, $1, ${2:z}", "=r,r,I"(i32 7, i32 0) nounwind
+ tail call i32 asm sideeffect "addu $0, $1, ${2:z}", "=r,r,I"(i32 7, i32 0) nounwind
+ ret void
+}
; z with non-zero and the "r"(register) and "J"(integer zero) constraints
+define void @constraint_z_2() nounwind {
+entry:
+; ALL-LABEL: constraint_z_2:
; ALL: #APP
; ALL: mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
; ALL: #NO_APP
call void asm sideeffect "mtc0 ${0:z}, $$12", "Jr"(i32 7) nounwind
+ ret void
+}
; z with zero and the "r"(register) and "J"(integer zero) constraints
+define void @constraint_z_3() nounwind {
+entry:
+; ALL-LABEL: constraint_z_3:
; ALL: #APP
-; ALL: mtc0 $0, ${{[0-9]+}}
+; GAS: mtc0 $0, ${{[0-9]+}}
+; IAS: mtc0 $zero, ${{[0-9]+}}, 0
; ALL: #NO_APP
call void asm sideeffect "mtc0 ${0:z}, $$12", "Jr"(i32 0) nounwind
+ ret void
+}
; z with non-zero and just the "r"(register) constraint
+define void @constraint_z_4() nounwind {
+entry:
+; ALL-LABEL: constraint_z_4:
; ALL: #APP
; ALL: mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
; ALL: #NO_APP
call void asm sideeffect "mtc0 ${0:z}, $$12", "r"(i32 7) nounwind
+ ret void
+}
; z with zero and just the "r"(register) constraint
+define void @constraint_z_5() nounwind {
+entry:
+; ALL-LABEL: constraint_z_5:
; FIXME: Check for $0, instead of other registers.
; We should be using $0 directly in this case, not real registers.
; When the materialization of 0 gets fixed, this test will fail.
@@ -94,7 +131,7 @@ entry:
; ALL: mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
; ALL: #NO_APP
call void asm sideeffect "mtc0 ${0:z}, $$12", "r"(i32 0) nounwind
- ret i32 0
+ ret void
}
; A long long in 32 bit mode (used to assert)
diff --git a/test/CodeGen/Mips/inlineasm_constraint.ll b/test/CodeGen/Mips/inlineasm_constraint.ll
index a6ac07182ff5..164d28f733e4 100644
--- a/test/CodeGen/Mips/inlineasm_constraint.ll
+++ b/test/CodeGen/Mips/inlineasm_constraint.ll
@@ -1,5 +1,6 @@
; RUN: llc -no-integrated-as -march=mipsel < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=GAS
+; RUN: FileCheck %s -check-prefixes=ALL,GAS
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefixes=ALL,IAS
define void @constraint_I() nounwind {
; First I with short
@@ -31,6 +32,7 @@ define void @constraint_K() nounwind {
; Now K with 64
; ALL: #APP
; GAS: addu ${{[0-9]+}}, ${{[0-9]+}}, 64
+; IAS: addiu ${{[0-9]+}}, ${{[0-9]+}}, 64
; ALL: #NO_APP
tail call i16 asm sideeffect "addu $0, $1, $2\0A\09 ", "=r,r,K"(i16 7, i16 64) nounwind
ret void
diff --git a/test/CodeGen/Mips/inlineasm_constraint_R.ll b/test/CodeGen/Mips/inlineasm_constraint_R.ll
index c4105ae6b22c..9c7611ba81d5 100644
--- a/test/CodeGen/Mips/inlineasm_constraint_R.ll
+++ b/test/CodeGen/Mips/inlineasm_constraint_R.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
@data = global [8193 x i32] zeroinitializer
diff --git a/test/CodeGen/Mips/inlineasm_constraint_ZC.ll b/test/CodeGen/Mips/inlineasm_constraint_ZC.ll
index c1746a67564f..59778df3b423 100644
--- a/test/CodeGen/Mips/inlineasm_constraint_ZC.ll
+++ b/test/CodeGen/Mips/inlineasm_constraint_ZC.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=09BIT
-; RUN: llc -march=mipsel -mattr=+micromips < %s | FileCheck %s -check-prefix=ALL -check-prefix=12BIT
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=ALL -check-prefix=16BIT
+; RUN: llc -march=mipsel -mcpu=mips32r6 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,09BIT
+; RUN: llc -march=mipsel -mattr=+micromips -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,12BIT
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,16BIT
@data = global [8193 x i32] zeroinitializer
diff --git a/test/CodeGen/Mips/inlineasm_constraint_m.ll b/test/CodeGen/Mips/inlineasm_constraint_m.ll
index 00053ad3c105..11ef8341cbdb 100644
--- a/test/CodeGen/Mips/inlineasm_constraint_m.ll
+++ b/test/CodeGen/Mips/inlineasm_constraint_m.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
@data = global [8193 x i32] zeroinitializer
diff --git a/test/CodeGen/Mips/inlineasmmemop.ll b/test/CodeGen/Mips/inlineasmmemop.ll
index bdf3ae55b802..61cbf93e667b 100644
--- a/test/CodeGen/Mips/inlineasmmemop.ll
+++ b/test/CodeGen/Mips/inlineasmmemop.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
; Simple memory
@g1 = external global i32
diff --git a/test/CodeGen/Mips/internalfunc.ll b/test/CodeGen/Mips/internalfunc.ll
index 2b4a0397f45f..b6b1c96c5f3b 100644
--- a/test/CodeGen/Mips/internalfunc.ll
+++ b/test/CodeGen/Mips/internalfunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=mipsel | FileCheck %s
+; RUN: llc < %s -march=mipsel -relocation-model=pic | FileCheck %s
@caller.sf1 = internal unnamed_addr global void (...)* null, align 4
@gf1 = external global void (...)*
diff --git a/test/CodeGen/Mips/interrupt-attr-64-error.ll b/test/CodeGen/Mips/interrupt-attr-64-error.ll
index 830c199d91d9..9626bda45f51 100644
--- a/test/CodeGen/Mips/interrupt-attr-64-error.ll
+++ b/test/CodeGen/Mips/interrupt-attr-64-error.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mcpu=mips64r6 -march=mipsel -relocation-model=static < %s 2>%t
+; RUN: not llc -mcpu=mips64r6 -march=mipsel -target-abi n64 -relocation-model=static < %s 2>%t
; RUN: FileCheck %s < %t
; CHECK: LLVM ERROR: "interrupt" attribute is only supported for the O32 ABI on MIPS32R2+ at the present time.
diff --git a/test/CodeGen/Mips/largeimm1.ll b/test/CodeGen/Mips/largeimm1.ll
index 06c4d6bd9603..b4d15f9e1e8c 100644
--- a/test/CodeGen/Mips/largeimm1.ll
+++ b/test/CodeGen/Mips/largeimm1.ll
@@ -1,13 +1,19 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
-; CHECK: lui ${{[0-9]+}}, 49152
-; CHECK: lui ${{[0-9]+}}, 16384
define void @f() nounwind {
entry:
%a1 = alloca [1073741824 x i8], align 1
%arrayidx = getelementptr inbounds [1073741824 x i8], [1073741824 x i8]* %a1, i32 0, i32 1048676
call void @f2(i8* %arrayidx) nounwind
ret void
+; CHECK-LABEL: f:
+
+; CHECK: lui $[[R0:[a-z0-9]+]], 16384
+; CHECK: addiu $[[R1:[a-z0-9]+]], $[[R0]], 24
+; CHECK: subu $sp, $sp, $[[R1]]
+
+; CHECK: lui $[[R2:[a-z0-9]+]], 16384
+; CHECK: addu ${{[0-9]+}}, $sp, $[[R2]]
}
declare void @f2(i8*)
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index a53a953a7883..f27e11425b97 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
-; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 < %s | \
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 -relocation-model=pic < %s | \
; RUN: FileCheck %s -check-prefix=64
-; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 < %s | \
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 -relocation-model=pic < %s | \
; RUN: FileCheck %s -check-prefix=64
%struct.S1 = type { [65536 x i8] }
@@ -10,21 +10,19 @@
define void @f() nounwind {
entry:
-; 32: lui $[[R0:[0-9]+]], 65535
-; 32: addiu $[[R0]], $[[R0]], -24
-; 32: addu $sp, $sp, $[[R0]]
-; 32: lui $[[R1:[0-9]+]], 1
-; 32: addu $[[R1]], $sp, $[[R1]]
-; 32: sw $ra, 20($[[R1]])
-; 64: daddiu $[[R0:[0-9]+]], $zero, 1
-; 64: dsll $[[R0]], $[[R0]], 48
-; 64: daddiu $[[R0]], $[[R0]], -1
-; 64: dsll $[[R0]], $[[R0]], 16
-; 64: daddiu $[[R0]], $[[R0]], -32
-; 64: daddu $sp, $sp, $[[R0]]
-; 64: lui $[[R1:[0-9]+]], 1
-; 64: daddu $[[R1]], $sp, $[[R1]]
-; 64: sd $ra, 24($[[R1]])
+; 32: lui $[[R0:[0-9]+]], 1
+; 32: addiu $[[R0]], $[[R0]], 24
+; 32: subu $sp, $sp, $[[R0]]
+; 32: lui $[[R1:[0-9]+]], 1
+; 32: addu $[[R1]], $sp, $[[R1]]
+; 32: sw $ra, 20($[[R1]])
+
+; 64: lui $[[R0:[0-9]+]], 1
+; 64: daddiu $[[R0]], $[[R0]], 32
+; 64: dsubu $sp, $sp, $[[R0]]
+; 64: lui $[[R1:[0-9]+]], 1
+; 64: daddu $[[R1]], $sp, $[[R1]]
+; 64: sd $ra, 24($[[R1]])
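+
+; The frame holds the 65536-byte %struct.S1 plus the saved-register area, so
+; the frame size no longer fits a signed 16-bit immediate and is built with
+; lui plus addiu/daddiu before being subtracted from $sp.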
%agg.tmp = alloca %struct.S1, align 1
%tmp = getelementptr inbounds %struct.S1, %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/Mips/lazy-binding.ll b/test/CodeGen/Mips/lazy-binding.ll
index 839155adad9a..87040cc2b3ac 100644
--- a/test/CodeGen/Mips/lazy-binding.ll
+++ b/test/CodeGen/Mips/lazy-binding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
; CHECK-LABEL: foo6:
; CHECK: %while.body
diff --git a/test/CodeGen/Mips/lcb5.ll b/test/CodeGen/Mips/lcb5.ll
index ec4c3da6515c..83f5fa0fb1c2 100644
--- a/test/CodeGen/Mips/lcb5.ll
+++ b/test/CodeGen/Mips/lcb5.ll
@@ -212,7 +212,7 @@ if.end: ; preds = %if.then, %entry
; ci: btnez $BB7_1 # 16 bit inst
; ci: jal $BB7_2 # branch
; ci: nop
-; ci: .align 2
+; ci: .p2align 2
; ci: $BB7_1:
; ci: .end z4
diff --git a/test/CodeGen/Mips/llvm-ir/add.ll b/test/CodeGen/Mips/llvm-ir/add.ll
index 6cccc7df19f9..7a60585d8fbf 100644
--- a/test/CodeGen/Mips/llvm-ir/add.ll
+++ b/test/CodeGen/Mips/llvm-ir/add.ll
@@ -1,37 +1,51 @@
; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32
; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=ALL,R2-R6,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=ALL,R2-R6,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=ALL,R2-R6,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=ALL,R2-R6,GP32
; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=ALL,R2-R6,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=ALL,R2-R6,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=ALL,R2-R6,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=ALL,R2-R6,GP64
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -O2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -O2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -O2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM64
define signext i1 @add_i1(i1 signext %a, i1 signext %b) {
entry:
; ALL-LABEL: add_i1:
- ; ALL: addu $[[T0:[0-9]+]], $4, $5
- ; ALL: sll $[[T0]], $[[T0]], 31
- ; ALL: sra $2, $[[T0]], 31
+ ; NOT-R2-R6: addu $[[T0:[0-9]+]], $4, $5
+ ; NOT-R2-R6: sll $[[T0]], $[[T0]], 31
+ ; NOT-R2-R6: sra $2, $[[T0]], 31
+
+ ; R2-R6: addu $[[T0:[0-9]+]], $4, $5
+ ; R2-R6: sll $[[T0]], $[[T0]], 31
+ ; R2-R6: sra $2, $[[T0]], 31
+
+ ; MMR6: addu16 $[[T0:[0-9]+]], $4, $5
+ ; MMR6: sll $[[T1:[0-9]+]], $[[T0]], 31
+ ; MMR6: sra $2, $[[T1]], 31
%r = add i1 %a, %b
ret i1 %r
@@ -45,8 +59,11 @@ entry:
; NOT-R2-R6: sll $[[T0]], $[[T0]], 24
; NOT-R2-R6: sra $2, $[[T0]], 24
- ; R2-R6: addu $[[T0:[0-9]+]], $4, $5
- ; R2-R6: seb $2, $[[T0:[0-9]+]]
+ ; R2-R6: addu $[[T0:[0-9]+]], $4, $5
+ ; R2-R6: seb $2, $[[T0:[0-9]+]]
+
+ ; MMR6: addu16 $[[T0:[0-9]+]], $4, $5
+ ; MMR6: seb $2, $[[T0]]
%r = add i8 %a, %b
ret i8 %r
@@ -60,8 +77,11 @@ entry:
; NOT-R2-R6: sll $[[T0]], $[[T0]], 16
; NOT-R2-R6: sra $2, $[[T0]], 16
- ; R2-R6: addu $[[T0:[0-9]+]], $4, $5
- ; R2-R6: seh $2, $[[T0:[0-9]+]]
+ ; R2-R6: addu $[[T0:[0-9]+]], $4, $5
+ ; R2-R6: seh $2, $[[T0]]
+
+ ; MMR6: addu16 $[[T0:[0-9]+]], $4, $5
+ ; MMR6: seh $2, $[[T0]]
%r = add i16 %a, %b
ret i16 %r
@@ -71,7 +91,10 @@ define signext i32 @add_i32(i32 signext %a, i32 signext %b) {
entry:
; ALL-LABEL: add_i32:
- ; ALL: addu $2, $4, $5
+ ; NOT-R2-R6: addu $2, $4, $5
+ ; R2-R6: addu $2, $4, $5
+
+ ; MMR6: addu16 $[[T0:[0-9]+]], $4, $5
%r = add i32 %a, %b
ret i32 %r
@@ -88,6 +111,13 @@ entry:
; GP64: daddu $2, $4, $5
+ ; MM32: addu $3, $5, $7
+ ; MM32: sltu $[[T0:[0-9]+]], $3, $7
+ ; MM32: addu $[[T1:[0-9]+]], $[[T0]], $6
+ ; MM32: addu $2, $4, $[[T1]]
+
+ ; MM64: daddu $2, $4, $5
+
%r = add i64 %a, %b
ret i64 %r
}
@@ -102,13 +132,13 @@ entry:
; GP32: lw $[[T3:[0-9]+]], 24($sp)
; GP32: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]]
; GP32: addu $[[T5:[0-9]+]], $6, $[[T4]]
- ; GP32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]]
+ ; GP32: lw $[[T6:[0-9]+]], 16($sp)
; GP32: lw $[[T7:[0-9]+]], 20($sp)
- ; GP32: addu $[[T8:[0-9]+]], $[[T6]], $[[T7]]
- ; GP32: lw $[[T9:[0-9]+]], 16($sp)
+ ; GP32: sltu $[[T8:[0-9]+]], $[[T5]], $[[T3]]
+ ; GP32: addu $[[T9:[0-9]+]], $[[T8]], $[[T7]]
; GP32: addu $3, $5, $[[T8]]
; GP32: sltu $[[T10:[0-9]+]], $3, $[[T7]]
- ; GP32: addu $[[T11:[0-9]+]], $[[T10]], $[[T9]]
+ ; GP32: addu $[[T11:[0-9]+]], $[[T10]], $[[T6]]
; GP32: addu $2, $4, $[[T11]]
; GP32: move $4, $[[T5]]
; GP32: move $5, $[[T1]]
@@ -118,6 +148,285 @@ entry:
; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6
; GP64: daddu $2, $4, $[[T1]]
+ ; MM32: lw $[[T0:[0-9]+]], 28($sp)
+ ; MM32: addu $[[T1:[0-9]+]], $7, $[[T0]]
+ ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+ ; MM32: lw $[[T3:[0-9]+]], 24($sp)
+ ; MM32: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+ ; MM32: addu $[[T5:[0-9]+]], $6, $[[T4]]
+ ; MM32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]]
+ ; MM32: lw $[[T7:[0-9]+]], 20($sp)
+ ; MM32: addu $[[T8:[0-9]+]], $[[T6]], $[[T7]]
+ ; MM32: addu $[[T9:[0-9]+]], $5, $[[T8]]
+ ; MM32: lw $[[T10:[0-9]+]], 16($sp)
+ ; MM32: sltu $[[T11:[0-9]+]], $[[T9]], $[[T7]]
+ ; MM32: addu $[[T12:[0-9]+]], $[[T11]], $[[T10]]
+ ; MM32: addu $[[T13:[0-9]+]], $4, $[[T12]]
+ ; MM32: move $4, $[[T5]]
+ ; MM32: move $5, $[[T1]]
+
+ ; MM64: daddu $3, $5, $7
+ ; MM64: sltu $[[T0:[0-9]+]], $3, $7
+ ; MM64: daddu $[[T1:[0-9]+]], $[[T0]], $6
+ ; MM64: daddu $2, $4, $[[T1]]
+
%r = add i128 %a, %b
ret i128 %r
}
+
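+; In the 32-bit check blocks above, each word of the wide additions is formed
+; with addu and the carry into the next word is recovered with sltu (the sum is
+; smaller than an operand exactly when the add wrapped) before being added in.
+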
+define signext i1 @add_i1_4(i1 signext %a) {
+; ALL-LABEL: add_i1_4:
+
+ ; ALL: move $2, $4
+
+ %r = add i1 4, %a
+ ret i1 %r
+}
+
+define signext i8 @add_i8_4(i8 signext %a) {
+; ALL-LABEL: add_i8_4:
+
+ ; NOT-R2-R6: sll $[[T0:[0-9]+]], $4, 24
+ ; NOT-R2-R6: lui $[[T1:[0-9]+]], 1024
+ ; NOT-R2-R6: addu $[[T0]], $[[T0]], $[[T1]]
+ ; NOT-R2-R6: sra $2, $[[T0]], 24
+
+ ; R2-R6: addiu $[[T0:[0-9]+]], $4, 4
+ ; R2-R6: seb $2, $[[T0]]
+
+ ; MM32: addiur2 $[[T0:[0-9]+]], $4, 4
+ ; MM32: seb $2, $[[T0]]
+
+ ; MM64: addiur2 $[[T0:[0-9]+]], $4, 4
+ ; MM64: seb $2, $[[T0]]
+
+ %r = add i8 4, %a
+ ret i8 %r
+}
+
+define signext i16 @add_i16_4(i16 signext %a) {
+; ALL-LABEL: add_i16_4:
+
+ ; NOT-R2-R6: sll $[[T0:[0-9]+]], $4, 16
+ ; NOT-R2-R6: lui $[[T1:[0-9]+]], 4
+ ; NOT-R2-R6: addu $[[T0]], $[[T0]], $[[T1]]
+ ; NOT-R2-R6: sra $2, $[[T0]], 16
+
+ ; R2-R6: addiu $[[T0:[0-9]+]], $4, 4
+ ; R2-R6: seh $2, $[[T0]]
+
+ ; MM32: addiur2 $[[T0:[0-9]+]], $4, 4
+ ; MM32: seh $2, $[[T0]]
+
+ ; MM64: addiur2 $[[T0:[0-9]+]], $4, 4
+ ; MM64: seh $2, $[[T0]]
+
+ %r = add i16 4, %a
+ ret i16 %r
+}
+
+define signext i32 @add_i32_4(i32 signext %a) {
+; ALL-LABEL: add_i32_4:
+
+ ; GP32: addiu $2, $4, 4
+
+ ; GP64: addiu $2, $4, 4
+
+ ; MM32: addiur2 $2, $4, 4
+
+ ; MM64: addiur2 $2, $4, 4
+
+ %r = add i32 4, %a
+ ret i32 %r
+}
+
+define signext i64 @add_i64_4(i64 signext %a) {
+; ALL-LABEL: add_i64_4:
+
+ ; GP32: addiu $[[T0:[0-9]+]], $5, 4
+ ; GP32: addiu $[[T1:[0-9]+]], $zero, 4
+ ; GP32: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; GP32: addu $2, $4, $[[T1]]
+
+ ; GP64: daddiu $2, $4, 4
+
+ ; MM32: addiu $[[T0:[0-9]+]], $5, 4
+ ; MM32: li16 $[[T1:[0-9]+]], 4
+ ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]]
+ ; MM32: addu $2, $4, $[[T2]]
+
+ ; MM64: daddiu $2, $4, 4
+
+ %r = add i64 4, %a
+ ret i64 %r
+}
+
+define signext i128 @add_i128_4(i128 signext %a) {
+; ALL-LABEL: add_i128_4:
+
+ ; GP32: addiu $[[T0:[0-9]+]], $7, 4
+ ; GP32: addiu $[[T1:[0-9]+]], $zero, 4
+ ; GP32: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]]
+ ; GP32: sltu $[[T1]], $[[T2]], $zero
+ ; GP32: addu $[[T3:[0-9]+]], $5, $[[T1]]
+ ; GP32: sltu $[[T1]], $[[T3]], $zero
+ ; GP32: addu $[[T1]], $4, $[[T1]]
+ ; GP32: move $4, $[[T2]]
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: daddiu $[[T0:[0-9]+]], $5, 4
+ ; GP64: daddiu $[[T1:[0-9]+]], $zero, 4
+ ; GP64: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; GP64: daddu $2, $4, $[[T1]]
+
+ ; MM32: addiu $[[T0:[0-9]+]], $7, 4
+ ; MM32: li16 $[[T1:[0-9]+]], 4
+ ; MM32: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; MM32: addu $[[T2:[0-9]+]], $6, $[[T1]]
+ ; MM32: lui $[[T1]], 0
+ ; MM32: sltu $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+ ; MM32: addu $[[T3]], $5, $[[T3]]
+ ; MM32: sltu $[[T1]], $[[T3]], $[[T1]]
+ ; MM32: addu $[[T1]], $4, $[[T1]]
+ ; MM32: move $4, $[[T2]]
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: daddiu $[[T0:[0-9]+]], $5, 4
+ ; MM64: daddiu $[[T1:[0-9]+]], $zero, 4
+ ; MM64: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; MM64: daddu $2, $4, $[[T1]]
+
+ %r = add i128 4, %a
+ ret i128 %r
+}
+
+define signext i1 @add_i1_3(i1 signext %a) {
+; ALL-LABEL: add_i1_3:
+
+ ; ALL: sll $[[T0:[0-9]+]], $4, 31
+ ; ALL: lui $[[T1:[0-9]+]], 32768
+
+ ; GP32: addu $[[T0]], $[[T0]], $[[T1]]
+ ; GP32: sra $[[T1]], $[[T0]], 31
+
+ ; GP64: addu $[[T0]], $[[T0]], $[[T1]]
+ ; GP64: sra $[[T1]], $[[T0]], 31
+
+ ; MMR6: addu16 $[[T0]], $[[T0]], $[[T1]]
+ ; MMR6: sra $[[T0]], $[[T0]], 31
+
+ %r = add i1 3, %a
+ ret i1 %r
+}
+
+define signext i8 @add_i8_3(i8 signext %a) {
+; ALL-LABEL: add_i8_3:
+
+ ; NOT-R2-R6: sll $[[T0:[0-9]+]], $4, 24
+ ; NOT-R2-R6: lui $[[T1:[0-9]+]], 768
+ ; NOT-R2-R6: addu $[[T0]], $[[T0]], $[[T1]]
+ ; NOT-R2-R6: sra $2, $[[T0]], 24
+
+ ; R2-R6: addiu $[[T0:[0-9]+]], $4, 3
+ ; R2-R6: seb $2, $[[T0]]
+
+ ; MMR6: addius5 $[[T0:[0-9]+]], 3
+ ; MMR6: seb $2, $[[T0]]
+
+ %r = add i8 3, %a
+ ret i8 %r
+}
+
+define signext i16 @add_i16_3(i16 signext %a) {
+; ALL-LABEL: add_i16_3:
+
+ ; NOT-R2-R6: sll $[[T0:[0-9]+]], $4, 16
+ ; NOT-R2-R6: lui $[[T1:[0-9]+]], 3
+ ; NOT-R2-R6: addu $[[T0]], $[[T0]], $[[T1]]
+ ; NOT-R2-R6: sra $2, $[[T0]], 16
+
+ ; R2-R6: addiu $[[T0:[0-9]+]], $4, 3
+ ; R2-R6: seh $2, $[[T0]]
+
+ ; MMR6: addius5 $[[T0:[0-9]+]], 3
+ ; MMR6: seh $2, $[[T0]]
+
+ %r = add i16 3, %a
+ ret i16 %r
+}
+
+define signext i32 @add_i32_3(i32 signext %a) {
+; ALL-LABEL: add_i32_3:
+
+ ; NOT-R2-R6: addiu $2, $4, 3
+
+ ; R2-R6: addiu $2, $4, 3
+
+ ; MMR6: addius5 $[[T0:[0-9]+]], 3
+ ; MMR6: move $2, $[[T0]]
+
+ %r = add i32 3, %a
+ ret i32 %r
+}
+
+define signext i64 @add_i64_3(i64 signext %a) {
+; ALL-LABEL: add_i64_3:
+
+ ; GP32: addiu $[[T0:[0-9]+]], $5, 3
+ ; GP32: addiu $[[T1:[0-9]+]], $zero, 3
+ ; GP32: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; GP32: addu $2, $4, $[[T1]]
+
+ ; GP64: daddiu $2, $4, 3
+
+ ; MM32: addiu $[[T0:[0-9]+]], $5, 3
+ ; MM32: li16 $[[T1:[0-9]+]], 3
+ ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]]
+ ; MM32: addu $2, $4, $[[T2]]
+
+ ; MM64: daddiu $2, $4, 3
+
+ %r = add i64 3, %a
+ ret i64 %r
+}
+
+define signext i128 @add_i128_3(i128 signext %a) {
+; ALL-LABEL: add_i128_3:
+
+ ; GP32: addiu $[[T0:[0-9]+]], $7, 3
+ ; GP32: addiu $[[T1:[0-9]+]], $zero, 3
+ ; GP32: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]]
+ ; GP32: sltu $[[T3:[0-9]+]], $[[T2]], $zero
+ ; GP32: addu $[[T4:[0-9]+]], $5, $[[T3]]
+ ; GP32: sltu $[[T5:[0-9]+]], $[[T4]], $zero
+ ; GP32: addu $[[T5]], $4, $[[T5]]
+ ; GP32: move $4, $[[T2]]
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: daddiu $[[T0:[0-9]+]], $5, 3
+ ; GP64: daddiu $[[T1:[0-9]+]], $zero, 3
+ ; GP64: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; GP64: daddu $2, $4, $[[T1]]
+
+ ; MM32: addiu $[[T0:[0-9]+]], $7, 3
+ ; MM32: li16 $[[T1:[0-9]+]], 3
+ ; MM32: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; MM32: addu $[[T2:[0-9]+]], $6, $[[T1]]
+ ; MM32: lui $[[T3:[0-9]+]], 0
+ ; MM32: sltu $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+ ; MM32: addu $[[T4]], $5, $[[T4]]
+ ; MM32: sltu $[[T5:[0-9]+]], $[[T4]], $[[T3]]
+ ; MM32: addu $[[T5]], $4, $[[T5]]
+ ; MM32: move $4, $[[T2]]
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: daddiu $[[T0:[0-9]+]], $5, 3
+ ; MM64: daddiu $[[T1:[0-9]+]], $zero, 3
+ ; MM64: sltu $[[T1]], $[[T0]], $[[T1]]
+ ; MM64: daddu $2, $4, $[[T1]]
+
+ %r = add i128 3, %a
+ ret i128 %r
+}
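
The recurring mechanical change across these Mips/llvm-ir tests is the RUN-line rewrite from repeated -check-prefix options to a single -check-prefixes list, alongside the new microMIPS configurations. As a minimal reference sketch (not part of the patch; the function name sample_add is invented for illustration), the two FileCheck invocations below enable the same ALL and GP32 prefixes, and the expected addu mirrors the add_i32 check above:

; Both RUN lines are equivalent; -check-prefixes simply folds the repeated
; -check-prefix options into one comma-separated list.
; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=GP32
; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefixes=ALL,GP32

define signext i32 @sample_add(i32 signext %a, i32 signext %b) {
entry:
; ALL-LABEL: sample_add:
; GP32: addu $2, $4, $5
  %r = add i32 %a, %b
  ret i32 %r
}
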
diff --git a/test/CodeGen/Mips/llvm-ir/and.ll b/test/CodeGen/Mips/llvm-ir/and.ll
index c4121701ec15..d320ce60f291 100644
--- a/test/CodeGen/Mips/llvm-ir/and.ll
+++ b/test/CodeGen/Mips/llvm-ir/and.ll
@@ -1,35 +1,46 @@
; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM64
define signext i1 @and_i1(i1 signext %a, i1 signext %b) {
entry:
; ALL-LABEL: and_i1:
- ; ALL: and $2, $4, $5
+ ; GP32: and $2, $4, $5
+
+ ; GP64: and $2, $4, $5
+
+ ; MM: and16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = and i1 %a, %b
ret i1 %r
@@ -39,7 +50,12 @@ define signext i8 @and_i8(i8 signext %a, i8 signext %b) {
entry:
; ALL-LABEL: and_i8:
- ; ALL: and $2, $4, $5
+ ; GP32: and $2, $4, $5
+
+ ; GP64: and $2, $4, $5
+
+ ; MM: and16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = and i8 %a, %b
ret i8 %r
@@ -49,7 +65,12 @@ define signext i16 @and_i16(i16 signext %a, i16 signext %b) {
entry:
; ALL-LABEL: and_i16:
- ; ALL: and $2, $4, $5
+ ; GP32: and $2, $4, $5
+
+ ; GP64: and $2, $4, $5
+
+ ; MM: and16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = and i16 %a, %b
ret i16 %r
@@ -64,6 +85,12 @@ entry:
; GP64: and $[[T0:[0-9]+]], $4, $5
; GP64: sll $2, $[[T0]], 0
+ ; MM32: and16 $[[T0:[0-9]+]], $5
+ ; MM32: move $2, $[[T0]]
+
+ ; MM64: and $[[T0:[0-9]+]], $4, $5
+ ; MM64: sll $2, $[[T0]], 0
+
%r = and i32 %a, %b
ret i32 %r
}
@@ -77,6 +104,13 @@ entry:
; GP64: and $2, $4, $5
+ ; MM32: and16 $[[T0:[0-9]+]], $6
+ ; MM32: and16 $[[T1:[0-9]+]], $7
+ ; MM32: move $2, $[[T0]]
+ ; MM32: move $3, $[[T1]]
+
+ ; MM64: and $2, $4, $5
+
%r = and i64 %a, %b
ret i64 %r
}
@@ -97,6 +131,573 @@ entry:
; GP64: and $2, $4, $6
; GP64: and $3, $5, $7
+ ; MM32: lw $[[T0:[0-9]+]], 20($sp)
+ ; MM32: lw $[[T1:[0-9]+]], 16($sp)
+ ; MM32: and16 $[[T1]], $4
+ ; MM32: and16 $[[T0]], $5
+ ; MM32: lw $[[T2:[0-9]+]], 24($sp)
+ ; MM32: and16 $[[T2]], $6
+ ; MM32: lw $[[T3:[0-9]+]], 28($sp)
+ ; MM32: and16 $[[T3]], $7
+
+ ; MM64: and $2, $4, $6
+ ; MM64: and $3, $5, $7
+
%r = and i128 %a, %b
ret i128 %r
}
+
+define signext i1 @and_i1_4(i1 signext %b) {
+entry:
+; ALL-LABEL: and_i1_4:
+
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: addiu $2, $zero, 0
+
+ ; MM: lui $2, 0
+
+ %r = and i1 4, %b
+ ret i1 %r
+}
+
+define signext i8 @and_i8_4(i8 signext %b) {
+entry:
+; ALL-LABEL: and_i8_4:
+
+ ; GP32: andi $2, $4, 4
+
+ ; GP64: andi $2, $4, 4
+
+ ; MM: andi16 $2, $4, 4
+
+ %r = and i8 4, %b
+ ret i8 %r
+}
+
+define signext i16 @and_i16_4(i16 signext %b) {
+entry:
+; ALL-LABEL: and_i16_4:
+
+ ; GP32: andi $2, $4, 4
+
+ ; GP64: andi $2, $4, 4
+
+ ; MM: andi16 $2, $4, 4
+
+ %r = and i16 4, %b
+ ret i16 %r
+}
+
+define signext i32 @and_i32_4(i32 signext %b) {
+entry:
+; ALL-LABEL: and_i32_4:
+
+ ; GP32: andi $2, $4, 4
+
+ ; GP64: andi $2, $4, 4
+
+ ; MM: andi16 $2, $4, 4
+
+ %r = and i32 4, %b
+ ret i32 %r
+}
+
+define signext i64 @and_i64_4(i64 signext %b) {
+entry:
+; ALL-LABEL: and_i64_4:
+
+ ; GP32: andi $3, $5, 4
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: andi $2, $4, 4
+
+ ; MM32: andi16 $3, $5, 4
+ ; MM32: lui $2, 0
+
+ ; MM64: andi $2, $4, 4
+
+ %r = and i64 4, %b
+ ret i64 %r
+}
+
+define signext i128 @and_i128_4(i128 signext %b) {
+entry:
+; ALL-LABEL: and_i128_4:
+
+ ; GP32: andi $5, $7, 4
+ ; GP32: addiu $2, $zero, 0
+ ; GP32: addiu $3, $zero, 0
+ ; GP32: addiu $4, $zero, 0
+
+ ; GP64: andi $3, $5, 4
+ ; GP64: daddiu $2, $zero, 0
+
+ ; MM32: andi16 $5, $7, 4
+ ; MM32: lui $2, 0
+ ; MM32: lui $3, 0
+ ; MM32: lui $4, 0
+
+ ; MM64: andi $3, $5, 4
+ ; MM64: daddiu $2, $zero, 0
+
+ %r = and i128 4, %b
+ ret i128 %r
+}
+
+define signext i1 @and_i1_31(i1 signext %b) {
+entry:
+; ALL-LABEL: and_i1_31:
+
+ ; ALL: move $2, $4
+
+ %r = and i1 31, %b
+ ret i1 %r
+}
+
+define signext i8 @and_i8_31(i8 signext %b) {
+entry:
+; ALL-LABEL: and_i8_31:
+
+ ; GP32: andi $2, $4, 31
+
+ ; GP64: andi $2, $4, 31
+
+ ; MM: andi16 $2, $4, 31
+
+ %r = and i8 31, %b
+ ret i8 %r
+}
+
+define signext i16 @and_i16_31(i16 signext %b) {
+entry:
+; ALL-LABEL: and_i16_31:
+
+ ; GP32: andi $2, $4, 31
+
+ ; GP64: andi $2, $4, 31
+
+ ; MM: andi16 $2, $4, 31
+
+ %r = and i16 31, %b
+ ret i16 %r
+}
+
+define signext i32 @and_i32_31(i32 signext %b) {
+entry:
+; ALL-LABEL: and_i32_31:
+
+ ; GP32: andi $2, $4, 31
+
+ ; GP64: andi $2, $4, 31
+
+ ; MM: andi16 $2, $4, 31
+
+ %r = and i32 31, %b
+ ret i32 %r
+}
+
+define signext i64 @and_i64_31(i64 signext %b) {
+entry:
+; ALL-LABEL: and_i64_31:
+
+ ; GP32: andi $3, $5, 31
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: andi $2, $4, 31
+
+ ; MM32: andi16 $3, $5, 31
+ ; MM32: lui $2, 0
+
+ ; MM64: andi $2, $4, 31
+
+ %r = and i64 31, %b
+ ret i64 %r
+}
+
+define signext i128 @and_i128_31(i128 signext %b) {
+entry:
+; ALL-LABEL: and_i128_31:
+
+ ; GP32: andi $5, $7, 31
+ ; GP32: addiu $2, $zero, 0
+ ; GP32: addiu $3, $zero, 0
+ ; GP32: addiu $4, $zero, 0
+
+ ; GP64: andi $3, $5, 31
+ ; GP64: daddiu $2, $zero, 0
+
+ ; MM32: andi16 $5, $7, 31
+ ; MM32: lui $2, 0
+ ; MM32: lui $3, 0
+ ; MM32: lui $4, 0
+
+ ; MM64: andi $3, $5, 31
+ ; MM64: daddiu $2, $zero, 0
+
+ %r = and i128 31, %b
+ ret i128 %r
+}
+
+define signext i1 @and_i1_255(i1 signext %b) {
+entry:
+; ALL-LABEL: and_i1_255:
+
+ ; ALL: move $2, $4
+
+ %r = and i1 255, %b
+ ret i1 %r
+}
+
+define signext i8 @and_i8_255(i8 signext %b) {
+entry:
+; ALL-LABEL: and_i8_255:
+
+ ; ALL: move $2, $4
+
+ %r = and i8 255, %b
+ ret i8 %r
+}
+
+define signext i16 @and_i16_255(i16 signext %b) {
+entry:
+; ALL-LABEL: and_i16_255:
+
+ ; GP32: andi $2, $4, 255
+
+ ; GP64: andi $2, $4, 255
+
+ ; MM: andi16 $2, $4, 255
+
+ %r = and i16 255, %b
+ ret i16 %r
+}
+
+define signext i32 @and_i32_255(i32 signext %b) {
+entry:
+; ALL-LABEL: and_i32_255:
+
+ ; GP32: andi $2, $4, 255
+
+ ; GP64: andi $2, $4, 255
+
+ ; MM: andi16 $2, $4, 255
+
+ %r = and i32 255, %b
+ ret i32 %r
+}
+
+define signext i64 @and_i64_255(i64 signext %b) {
+entry:
+; ALL-LABEL: and_i64_255:
+
+ ; GP32: andi $3, $5, 255
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: andi $2, $4, 255
+
+ ; MM32: andi16 $3, $5, 255
+ ; MM32: lui $2, 0
+
+ ; MM64: andi $2, $4, 255
+
+ %r = and i64 255, %b
+ ret i64 %r
+}
+
+define signext i128 @and_i128_255(i128 signext %b) {
+entry:
+; ALL-LABEL: and_i128_255:
+
+ ; GP32: andi $5, $7, 255
+ ; GP32: addiu $2, $zero, 0
+ ; GP32: addiu $3, $zero, 0
+ ; GP32: addiu $4, $zero, 0
+
+ ; GP64: andi $3, $5, 255
+ ; GP64: daddiu $2, $zero, 0
+
+ ; MM32: andi16 $5, $7, 255
+ ; MM32: lui $2, 0
+ ; MM32: lui $3, 0
+ ; MM32: lui $4, 0
+
+ ; MM64: andi $3, $5, 255
+ ; MM64: daddiu $2, $zero, 0
+
+ %r = and i128 255, %b
+ ret i128 %r
+}
+
+define signext i1 @and_i1_32768(i1 signext %b) {
+entry:
+; ALL-LABEL: and_i1_32768:
+
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: addiu $2, $zero, 0
+
+ ; MM: lui $2, 0
+
+ %r = and i1 32768, %b
+ ret i1 %r
+}
+
+define signext i8 @and_i8_32768(i8 signext %b) {
+entry:
+; ALL-LABEL: and_i8_32768:
+
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: addiu $2, $zero, 0
+
+ ; MM: lui $2, 0
+
+ %r = and i8 32768, %b
+ ret i8 %r
+}
+
+define signext i16 @and_i16_32768(i16 signext %b) {
+entry:
+; ALL-LABEL: and_i16_32768:
+
+ ; GP32: addiu $[[T0:[0-9]+]], $zero, -32768
+ ; GP32: and $2, $4, $[[T0]]
+
+ ; GP64: addiu $[[T0:[0-9]+]], $zero, -32768
+ ; GP64: and $2, $4, $[[T0]]
+
+ ; MM: addiu $2, $zero, -32768
+ ; MM: and16 $2, $4
+
+ %r = and i16 32768, %b
+ ret i16 %r
+}
+
+define signext i32 @and_i32_32768(i32 signext %b) {
+entry:
+; ALL-LABEL: and_i32_32768:
+
+ ; GP32: andi $2, $4, 32768
+
+ ; GP64: andi $2, $4, 32768
+
+ ; MM: andi16 $2, $4, 32768
+
+ %r = and i32 32768, %b
+ ret i32 %r
+}
+
+define signext i64 @and_i64_32768(i64 signext %b) {
+entry:
+; ALL-LABEL: and_i64_32768:
+
+ ; GP32: andi $3, $5, 32768
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: andi $2, $4, 32768
+
+ ; MM32: andi16 $3, $5, 32768
+ ; MM32: lui $2, 0
+
+ ; MM64: andi $2, $4, 32768
+
+ %r = and i64 32768, %b
+ ret i64 %r
+}
+
+define signext i128 @and_i128_32768(i128 signext %b) {
+entry:
+; ALL-LABEL: and_i128_32768:
+
+ ; GP32: andi $5, $7, 32768
+ ; GP32: addiu $2, $zero, 0
+ ; GP32: addiu $3, $zero, 0
+ ; GP32: addiu $4, $zero, 0
+
+ ; GP64: andi $3, $5, 32768
+ ; GP64: daddiu $2, $zero, 0
+
+ ; MM32: andi16 $5, $7, 32768
+ ; MM32: lui $2, 0
+ ; MM32: lui $3, 0
+ ; MM32: lui $4, 0
+
+ ; MM64: andi $3, $5, 32768
+ ; MM64: daddiu $2, $zero, 0
+
+ %r = and i128 32768, %b
+ ret i128 %r
+}
+
+define signext i1 @and_i1_65(i1 signext %b) {
+entry:
+; ALL-LABEL: and_i1_65:
+
+ ; ALL: move $2, $4
+
+ %r = and i1 65, %b
+ ret i1 %r
+}
+
+define signext i8 @and_i8_65(i8 signext %b) {
+entry:
+; ALL-LABEL: and_i8_65:
+
+ ; ALL: andi $2, $4, 65
+
+ %r = and i8 65, %b
+ ret i8 %r
+}
+
+define signext i16 @and_i16_65(i16 signext %b) {
+entry:
+; ALL-LABEL: and_i16_65:
+
+ ; ALL: andi $2, $4, 65
+
+ %r = and i16 65, %b
+ ret i16 %r
+}
+
+define signext i32 @and_i32_65(i32 signext %b) {
+entry:
+; ALL-LABEL: and_i32_65:
+
+ ; ALL: andi $2, $4, 65
+
+ %r = and i32 65, %b
+ ret i32 %r
+}
+
+define signext i64 @and_i64_65(i64 signext %b) {
+entry:
+; ALL-LABEL: and_i64_65:
+
+ ; GP32: andi $3, $5, 65
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: andi $2, $4, 65
+
+ ; MM32: andi $3, $5, 65
+ ; MM32: lui $2, 0
+
+ ; MM64: andi $2, $4, 65
+
+ %r = and i64 65, %b
+ ret i64 %r
+}
+
+define signext i128 @and_i128_65(i128 signext %b) {
+entry:
+; ALL-LABEL: and_i128_65:
+
+ ; GP32: andi $5, $7, 65
+ ; GP32: addiu $2, $zero, 0
+ ; GP32: addiu $3, $zero, 0
+ ; GP32: addiu $4, $zero, 0
+
+ ; GP64: andi $3, $5, 65
+ ; GP64: daddiu $2, $zero, 0
+
+ ; MM32: andi $5, $7, 65
+ ; MM32: lui $2, 0
+ ; MM32: lui $3, 0
+ ; MM32: lui $4, 0
+
+ ; MM64: andi $3, $5, 65
+ ; MM64: daddiu $2, $zero, 0
+
+ %r = and i128 65, %b
+ ret i128 %r
+}
+
+define signext i1 @and_i1_256(i1 signext %b) {
+entry:
+; ALL-LABEL: and_i1_256:
+
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: addiu $2, $zero, 0
+
+ ; MM: lui $2, 0
+
+ %r = and i1 256, %b
+ ret i1 %r
+}
+
+define signext i8 @and_i8_256(i8 signext %b) {
+entry:
+; ALL-LABEL: and_i8_256:
+
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: addiu $2, $zero, 0
+
+ ; MM: lui $2, 0
+
+ %r = and i8 256, %b
+ ret i8 %r
+}
+
+define signext i16 @and_i16_256(i16 signext %b) {
+entry:
+; ALL-LABEL: and_i16_256:
+
+ ; ALL: andi $2, $4, 256
+
+ %r = and i16 256, %b
+ ret i16 %r
+}
+
+define signext i32 @and_i32_256(i32 signext %b) {
+entry:
+; ALL-LABEL: and_i32_256:
+
+ ; ALL: andi $2, $4, 256
+
+ %r = and i32 256, %b
+ ret i32 %r
+}
+
+define signext i64 @and_i64_256(i64 signext %b) {
+entry:
+; ALL-LABEL: and_i64_256:
+
+ ; GP32: andi $3, $5, 256
+ ; GP32: addiu $2, $zero, 0
+
+ ; GP64: andi $2, $4, 256
+
+ ; MM32: andi $3, $5, 256
+ ; MM32: lui $2, 0
+
+ ; MM64: andi $2, $4, 256
+
+ %r = and i64 256, %b
+ ret i64 %r
+}
+
+define signext i128 @and_i128_256(i128 signext %b) {
+entry:
+; ALL-LABEL: and_i128_256:
+
+ ; GP32: andi $5, $7, 256
+ ; GP32: addiu $2, $zero, 0
+ ; GP32: addiu $3, $zero, 0
+ ; GP32: addiu $4, $zero, 0
+
+ ; GP64: andi $3, $5, 256
+ ; GP64: daddiu $2, $zero, 0
+
+ ; MM32: andi $5, $7, 256
+ ; MM32: lui $2, 0
+ ; MM32: lui $3, 0
+ ; MM32: lui $4, 0
+
+ ; MM64: andi $3, $5, 256
+ ; MM64: daddiu $2, $zero, 0
+
+ %r = and i128 256, %b
+ ret i128 %r
+}
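
The rewritten checks above also rely on FileCheck pattern variables: [[T0:[0-9]+]] captures whichever register llc chose, and a later bare [[T0]] must match that same capture, whereas re-spelling the regex would re-bind the variable instead of checking it. A self-contained sketch of the convention (not part of the patch; sketch_and_i8 is an invented name, and the expected lines are taken from the and_i8 microMIPS checks above):

; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s

define signext i8 @sketch_and_i8(i8 signext %a, i8 signext %b) {
entry:
; CHECK-LABEL: sketch_and_i8:
; CHECK: and16 $[[T0:[0-9]+]], $5
; CHECK: move $2, $[[T0]]
  %r = and i8 %a, %b
  ret i8 %r
}
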
diff --git a/test/CodeGen/Mips/llvm-ir/ashr.ll b/test/CodeGen/Mips/llvm-ir/ashr.ll
index cad4a39d7743..af9b81f9203f 100644
--- a/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -1,42 +1,33 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=M2
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R6
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=M3
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=64R6
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,M2
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,M3
+; RUN: llc < %s -march=mips64 -mcpu=mips4 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,64R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MMR3
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MMR6
define signext i1 @ashr_i1(i1 signext %a, i1 signext %b) {
entry:
@@ -53,7 +44,9 @@ entry:
; ALL-LABEL: ashr_i8:
; FIXME: The andi instruction is redundant.
- ; ALL: andi $[[T0:[0-9]+]], $5, 255
+ ; GP32: andi $[[T0:[0-9]+]], $5, 255
+ ; GP64: andi $[[T0:[0-9]+]], $5, 255
+ ; MM: andi16 $[[T0:[0-9]+]], $5, 255
; ALL: srav $2, $4, $[[T0]]
%r = ashr i8 %a, %b
@@ -65,7 +58,9 @@ entry:
; ALL-LABEL: ashr_i16:
; FIXME: The andi instruction is redundant.
- ; ALL: andi $[[T0:[0-9]+]], $5, 65535
+ ; GP32: andi $[[T0:[0-9]+]], $5, 65535
+ ; GP64: andi $[[T0:[0-9]+]], $5, 65535
+ ; MM: andi16 $[[T0:[0-9]+]], $5, 65535
; ALL: srav $2, $4, $[[T0]]
%r = ashr i16 %a, %b
@@ -133,6 +128,32 @@ entry:
; GP64: dsrav $2, $4, $5
+ ; MMR3: srlv $[[T0:[0-9]+]], $5, $7
+ ; MMR3: sll16 $[[T1:[0-9]+]], $4, 1
+ ; MMR3: not16 $[[T2:[0-9]+]], $7
+ ; MMR3: sllv $[[T3:[0-9]+]], $[[T1]], $[[T2]]
+ ; MMR3: or16 $[[T4:[0-9]+]], $[[T0]]
+ ; MMR3: srav $[[T5:[0-9]+]], $4, $7
+ ; MMR3: andi16 $[[T6:[0-9]+]], $7, 32
+ ; MMR3: movn $[[T7:[0-9]+]], $[[T5]], $[[T6]]
+ ; MMR3: sra $[[T8:[0-9]+]], $4, 31
+ ; MMR3: movn $2, $[[T8]], $[[T6]]
+
+ ; MMR6: srav $[[T0:[0-9]+]], $4, $7
+ ; MMR6: andi16 $[[T1:[0-9]+]], $7, 32
+ ; MMR6: seleqz $[[T2:[0-9]+]], $[[T0]], $[[T1]]
+ ; MMR6: sra $[[T3:[0-9]+]], $4, 31
+ ; MMR6: selnez $[[T4:[0-9]+]], $[[T3]], $[[T1]]
+ ; MMR6: or $[[T5:[0-9]+]], $[[T4]], $[[T2]]
+ ; MMR6: srlv $[[T6:[0-9]+]], $5, $7
+ ; MMR6: sll16 $[[T7:[0-9]+]], $4, 1
+ ; MMR6: not16 $[[T8:[0-9]+]], $7
+ ; MMR6: sllv $[[T9:[0-9]+]], $[[T7]], $[[T8]]
+ ; MMR6: or16 $[[T10:[0-9]+]], $[[T6]]
+ ; MMR6: seleqz $[[T11:[0-9]+]], $[[T10]], $[[T1]]
+ ; MMR6: selnez $[[T12:[0-9]+]], $[[T0]], $[[T1]]
+ ; MMR6: or $3, $[[T12]], $[[T11]]
+
%r = ashr i64 %a, %b
ret i64 %r
}
@@ -192,6 +213,8 @@ entry:
; 64R6: jr $ra
; 64R6: or $3, $[[T13]], $[[T12]]
+ ; MM: lw $25, %call16(__ashrti3)($2)
+
%r = ashr i128 %a, %b
ret i128 %r
}
diff --git a/test/CodeGen/Mips/llvm-ir/call.ll b/test/CodeGen/Mips/llvm-ir/call.ll
index a4b03405f72b..0d524d439600 100644
--- a/test/CodeGen/Mips/llvm-ir/call.ll
+++ b/test/CodeGen/Mips/llvm-ir/call.ll
@@ -1,18 +1,18 @@
; Test the 'call' instruction and the tailcall variant.
; FIXME: We should remove the need for -enable-mips-tail-calls
-; RUN: llc -march=mips -mcpu=mips32 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
-; RUN: llc -march=mips -mcpu=mips32r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
-; RUN: llc -march=mips -mcpu=mips32r3 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
-; RUN: llc -march=mips -mcpu=mips32r5 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
-; RUN: llc -march=mips -mcpu=mips32r6 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
-; RUN: llc -march=mips -mcpu=mips32r6 -mattr=+fp64,+nooddspreg -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
-; RUN: llc -march=mips64 -mcpu=mips4 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
-; RUN: llc -march=mips64 -mcpu=mips64 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
-; RUN: llc -march=mips64 -mcpu=mips64r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
-; RUN: llc -march=mips64 -mcpu=mips64r3 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
-; RUN: llc -march=mips64 -mcpu=mips64r5 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
-; RUN: llc -march=mips64 -mcpu=mips64r6 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
+; RUN: llc -march=mips -mcpu=mips32 -relocation-model=pic -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,O32,NOT-R6C
+; RUN: llc -march=mips -mcpu=mips32r2 -relocation-model=pic -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,O32,NOT-R6C
+; RUN: llc -march=mips -mcpu=mips32r3 -relocation-model=pic -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,O32,NOT-R6C
+; RUN: llc -march=mips -mcpu=mips32r5 -relocation-model=pic -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,O32,NOT-R6C
+; RUN: llc -march=mips -mcpu=mips32r6 -relocation-model=pic -disable-mips-delay-filler -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,O32,R6C
+; RUN: llc -march=mips -mcpu=mips32r6 -relocation-model=pic -mattr=+fp64,+nooddspreg -disable-mips-delay-filler -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,O32,R6C
+; RUN: llc -march=mips64 -mcpu=mips4 -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,N64,NOT-R6C
+; RUN: llc -march=mips64 -mcpu=mips64 -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,N64,NOT-R6C
+; RUN: llc -march=mips64 -mcpu=mips64r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,N64,NOT-R6C
+; RUN: llc -march=mips64 -mcpu=mips64r3 -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,N64,NOT-R6C
+; RUN: llc -march=mips64 -mcpu=mips64r5 -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,N64,NOT-R6C
+; RUN: llc -march=mips64 -mcpu=mips64r6 -disable-mips-delay-filler -enable-mips-tail-calls < %s | FileCheck %s -check-prefixes=ALL,N64,R6C
declare void @extern_void_void()
declare i32 @extern_i32_void()
@@ -25,9 +25,11 @@ define i32 @call_void_void() {
; N64: ld $[[TGT:[0-9]+]], %call16(extern_void_void)($gp)
-; ALL: jalr $[[TGT]]
+; NOT-R6C: jalr $[[TGT]]
+; R6C: jalrc $[[TGT]]
call void @extern_void_void()
+; R6C: jrc $ra
ret i32 0
}
@@ -38,10 +40,12 @@ define i32 @call_i32_void() {
; N64: ld $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp)
-; ALL: jalr $[[TGT]]
+; NOT-R6C: jalr $[[TGT]]
+; R6C: jalrc $[[TGT]]
%1 = call i32 @extern_i32_void()
%2 = add i32 %1, 1
+; R6C: jrc $ra
ret i32 %2
}
@@ -55,12 +59,13 @@ define float @call_float_void() {
; N64: ld $[[TGT:[0-9]+]], %call16(extern_float_void)($gp)
-; ALL: jalr $[[TGT]]
+; NOT-R6C: jalr $[[TGT]]
+; R6C: jalrc $[[TGT]]
-; O32: move $gp, $[[GP]]
%1 = call float @extern_float_void()
%2 = fadd float %1, 1.0
+; R6C: jrc $ra
ret float %2
}
@@ -71,8 +76,7 @@ define void @musttail_call_void_void() {
; N64: ld $[[TGT:[0-9]+]], %call16(extern_void_void)($gp)
-; NOT-R6: jr $[[TGT]]
-; R6: r6.jr $[[TGT]]
+; ALL: jr $[[TGT]]
musttail call void @extern_void_void()
ret void
@@ -85,8 +89,7 @@ define i32 @musttail_call_i32_void() {
; N64: ld $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp)
-; NOT-R6: jr $[[TGT]]
-; R6: r6.jr $[[TGT]]
+; ALL: jr $[[TGT]]
%1 = musttail call i32 @extern_i32_void()
ret i32 %1
@@ -99,8 +102,7 @@ define float @musttail_call_float_void() {
; N64: ld $[[TGT:[0-9]+]], %call16(extern_float_void)($gp)
-; NOT-R6: jr $[[TGT]]
-; R6: r6.jr $[[TGT]]
+; ALL: jr $[[TGT]]
%1 = musttail call float @extern_float_void()
ret float %1
@@ -110,9 +112,11 @@ define i32 @indirect_call_void_void(void ()* %addr) {
; ALL-LABEL: indirect_call_void_void:
; ALL: move $25, $4
-; ALL: jalr $25
+; NOT-R6C: jalr $25
+; R6C: jalrc $25
call void %addr()
+; R6C: jrc $ra
ret i32 0
}
@@ -120,10 +124,13 @@ define i32 @indirect_call_i32_void(i32 ()* %addr) {
; ALL-LABEL: indirect_call_i32_void:
; ALL: move $25, $4
-; ALL: jalr $25
+; NOT-R6C: jalr $25
+; R6C: jalrc $25
+
%1 = call i32 %addr()
%2 = add i32 %1, 1
+; R6C: jrc $ra
ret i32 %2
}
@@ -131,10 +138,13 @@ define float @indirect_call_float_void(float ()* %addr) {
; ALL-LABEL: indirect_call_float_void:
; ALL: move $25, $4
-; ALL: jalr $25
+; NOT-R6C: jalr $25
+; R6C: jalrc $25
+
%1 = call float %addr()
%2 = fadd float %1, 1.0
+; R6C: jrc $ra
ret float %2
}
@@ -178,7 +188,8 @@ define hidden void @thunk_undef_double(i32 %this, double %volume) unnamed_addr a
; ALL-LABEL: thunk_undef_double:
; O32: # implicit-def: %A2
; O32: # implicit-def: %A3
-; ALL: jr $25
+; ALL: jr $25
+
tail call void @undef_double(i32 undef, double undef) #8
ret void
}
@@ -190,10 +201,12 @@ define i32 @jal_only_allows_symbols() {
; ALL-NOT: {{jal }}
; ALL: addiu $[[TGT:[0-9]+]], $zero, 1234
; ALL-NOT: {{jal }}
-; ALL: jalr $[[TGT]]
+; NOT-R6C: jalr $[[TGT]]
+; R6C: jalrc $[[TGT]]
; ALL-NOT: {{jal }}
call void () inttoptr (i32 1234 to void ()*)()
+; R6C: jrc $ra
ret i32 0
}
diff --git a/test/CodeGen/Mips/llvm-ir/indirectbr.ll b/test/CodeGen/Mips/llvm-ir/indirectbr.ll
index debfeb35b213..d982b570d7c2 100644
--- a/test/CodeGen/Mips/llvm-ir/indirectbr.ll
+++ b/test/CodeGen/Mips/llvm-ir/indirectbr.ll
@@ -1,30 +1,33 @@
; Test all important variants of the unconditional 'br' instruction.
-; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips -mcpu=mips32r3 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips -mcpu=mips32r5 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=R6
-; RUN: llc -march=mips64 -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64r3 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64r5 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=R6
+; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r3 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r5 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,R6C
+; RUN: llc -march=mips64 -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r3 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r5 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,R6
define i32 @br(i8 *%addr) {
; ALL-LABEL: br:
; NOT-R6: jr $4 # <MCInst #{{[0-9]+}} JR
-; R6: jr $4 # <MCInst #{{[0-9]+}} JALR
+; R6C: jrc $4 # <MCInst #{{[0-9]+}} JIC
+
; ALL: $BB0_1: # %L1
; NOT-R6: jr $ra # <MCInst #{{[0-9]+}} JR
; R6: jr $ra # <MCInst #{{[0-9]+}} JALR
+; R6C: jr $ra # <MCInst #{{[0-9]+}} JALR
; ALL: addiu $2, $zero, 0
; ALL: $BB0_2: # %L2
; NOT-R6: jr $ra # <MCInst #{{[0-9]+}} JR
; R6: jr $ra # <MCInst #{{[0-9]+}} JALR
+; R6C: jr $ra # <MCInst #{{[0-9]+}} JALR
; ALL: addiu $2, $zero, 1
entry:
diff --git a/test/CodeGen/Mips/llvm-ir/lh_lhu.ll b/test/CodeGen/Mips/llvm-ir/lh_lhu.ll
new file mode 100644
index 000000000000..fadcfdb0fb4f
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/lh_lhu.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -mattr=+micromips -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -relocation-model=pic | FileCheck %s
+
+@us = global i16 0, align 2
+
+define i32 @lhfunc() {
+entry:
+; CHECK-LABEL: lhfunc
+; CHECK: lh $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+ %0 = load i16, i16* @us, align 2
+ %conv = sext i16 %0 to i32
+ ret i32 %conv
+}
+
+define i16 @lhfunc_atomic() {
+entry:
+; CHECK-LABEL: lhfunc_atomic
+; CHECK: lh $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+ %0 = load atomic i16, i16* @us acquire, align 2
+ ret i16 %0
+}
+
+define i32 @lhufunc() {
+entry:
+; CHECK-LABEL: lhufunc
+; CHECK: lhu $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+ %0 = load i16, i16* @us, align 2
+ %conv = zext i16 %0 to i32
+ ret i32 %conv
+}
diff --git a/test/CodeGen/Mips/llvm-ir/load-atomic.ll b/test/CodeGen/Mips/llvm-ir/load-atomic.ll
index a44b00bff586..baf9a74a2c54 100644
--- a/test/CodeGen/Mips/llvm-ir/load-atomic.ll
+++ b/test/CodeGen/Mips/llvm-ir/load-atomic.ll
@@ -1,9 +1,9 @@
; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL
; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL
; RUN: llc -march=mips64 -mcpu=mips64r2 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=M64
+; RUN: FileCheck %s -check-prefixes=ALL,M64
; RUN: llc -march=mips64 -mcpu=mips64r6 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=M64
+; RUN: FileCheck %s -check-prefixes=ALL,M64
define i8 @load_i8(i8* %ptr) {
; ALL-LABEL: load_i8
diff --git a/test/CodeGen/Mips/llvm-ir/lshr.ll b/test/CodeGen/Mips/llvm-ir/lshr.ll
index 3a7029fa5b7a..10748b9c803a 100644
--- a/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -1,42 +1,33 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=M2
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R6
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=M3
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=64R6
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,M2
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,M3
+; RUN: llc < %s -march=mips64 -mcpu=mips4 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,64R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MMR3
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MMR6
define signext i1 @lshr_i1(i1 signext %a, i1 signext %b) {
entry:
@@ -53,7 +44,9 @@ entry:
; ALL-LABEL: lshr_i8:
; ALL: srlv $[[T0:[0-9]+]], $4, $5
- ; ALL: andi $2, $[[T0]], 255
+ ; GP32: andi $2, $[[T0]], 255
+ ; GP64: andi $2, $[[T0]], 255
+ ; MM: andi16 $2, $[[T0]], 255
%r = lshr i8 %a, %b
ret i8 %r
@@ -64,7 +57,9 @@ entry:
; ALL-LABEL: lshr_i16:
; ALL: srlv $[[T0:[0-9]+]], $4, $5
- ; ALL: andi $2, $[[T0]], 65535
+ ; GP32: andi $2, $[[T0]], 65535
+ ; GP64: andi $2, $[[T0]], 65535
+ ; MM: andi16 $2, $[[T0]], 65535
%r = lshr i16 %a, %b
ret i16 %r
@@ -127,6 +122,29 @@ entry:
; GP64: dsrlv $2, $4, $5
+ ; MMR3: srlv $[[T0:[0-9]+]], $5, $7
+ ; MMR3: sll16 $[[T1:[0-9]+]], $4, 1
+ ; MMR3: not16 $[[T2:[0-9]+]], $7
+ ; MMR3: sllv $[[T3:[0-9]+]], $[[T1]], $[[T2]]
+ ; MMR3: or16 $[[T4:[0-9]+]], $[[T0]]
+ ; MMR3: srlv $[[T5:[0-9]+]], $4, $7
+ ; MMR3: andi16 $[[T6:[0-9]+]], $7, 32
+ ; MMR3: movn $[[T7:[0-9]+]], $[[T5]], $[[T6]]
+ ; MMR3: lui $[[T8:[0-9]+]], 0
+ ; MMR3: movn $2, $[[T8]], $[[T6]]
+
+ ; MMR6: srlv $[[T0:[0-9]+]], $5, $7
+ ; MMR6: sll16 $[[T1:[0-9]+]], $4, 1
+ ; MMR6: not16 $[[T2:[0-9]+]], $7
+ ; MMR6: sllv $[[T3:[0-9]+]], $[[T1]], $[[T2]]
+ ; MMR6: or16 $[[T4:[0-9]+]], $[[T0]]
+ ; MMR6: andi16 $[[T5:[0-9]+]], $7, 32
+ ; MMR6: seleqz $[[T6:[0-9]+]], $[[T4]], $[[T5]]
+ ; MMR6: srlv $[[T7:[0-9]+]], $4, $7
+ ; MMR6: selnez $[[T8:[0-9]+]], $[[T7]], $[[T5]]
+ ; MMR6: or $3, $[[T8]], $[[T6]]
+ ; MMR6: seleqz $2, $[[T7]], $[[T5]]
+
%r = lshr i64 %a, %b
ret i64 %r
}
@@ -182,6 +200,8 @@ entry:
; 64R6: jr $ra
; 64R6: seleqz $2, $[[T9]], $[[T7]]
+ ; MM: lw $25, %call16(__lshrti3)($2)
+
%r = lshr i128 %a, %b
ret i128 %r
}
diff --git a/test/CodeGen/Mips/llvm-ir/mul.ll b/test/CodeGen/Mips/llvm-ir/mul.ll
index a7582805dd74..8d63e496806c 100644
--- a/test/CodeGen/Mips/llvm-ir/mul.ll
+++ b/test/CodeGen/Mips/llvm-ir/mul.ll
@@ -1,27 +1,33 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=M2 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=32R1-R5 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=32R1-R5 -check-prefix=32R2-R5 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=32R1-R5 -check-prefix=32R2-R5 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=32R1-R5 -check-prefix=32R2-R5 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=32R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=M4 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=64R1-R5 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=64R1-R5 -check-prefix=GP64 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=64R1-R5 -check-prefix=GP64 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=64R1-R5 -check-prefix=GP64 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=64R6
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,M2,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R1-R5,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R1-R5,32R2-R5,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R1-R5,32R2-R5,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R1-R5,32R2-R5,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R6,GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips4 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,M4,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,64R1-R5,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,64R1-R5,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,64R1-R5,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,64R1-R5,GP64,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=ALL,64R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=MM32,MM32R3
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefixes=MM32,MM32R6
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic | \
+; RUN: FileCheck %s -check-prefix=64R6
define signext i1 @mul_i1(i1 signext %a, i1 signext %b) {
entry:
@@ -53,6 +59,10 @@ entry:
; 64R6: sll $[[T0]], $[[T0]], 31
; 64R6: sra $2, $[[T0]], 31
+ ; MM32: mul $[[T0:[0-9]+]], $4, $5
+ ; MM32: sll $[[T0]], $[[T0]], 31
+ ; MM32: sra $2, $[[T0]], 31
+
%r = mul i1 %a, %b
ret i1 %r
}
@@ -90,6 +100,10 @@ entry:
; 64R6: mul $[[T0:[0-9]+]], $4, $5
; 64R6: seb $2, $[[T0]]
+
+ ; MM32: mul $[[T0:[0-9]+]], $4, $5
+ ; MM32: seb $2, $[[T0]]
+
%r = mul i8 %a, %b
ret i8 %r
}
@@ -127,6 +141,10 @@ entry:
; 64R6: mul $[[T0:[0-9]+]], $4, $5
; 64R6: seh $2, $[[T0]]
+
+ ; MM32: mul $[[T0:[0-9]+]], $4, $5
+ ; MM32: seh $2, $[[T0]]
+
%r = mul i16 %a, %b
ret i16 %r
}
@@ -143,6 +161,9 @@ entry:
; 64R1-R5: mul $2, $4, $5
; 64R6: mul $2, $4, $5
+
+ ; MM32: mul $2, $4, $5
+
%r = mul i32 %a, %b
ret i32 %r
}
@@ -169,12 +190,12 @@ entry:
; 32R1-R5: addu $[[T0]], $[[T0]], $[[T2:[0-9]+]]
; 32R1-R5: addu $2, $[[T0]], $[[T1]]
- ; 32R6: mul $[[T0:[0-9]+]], $5, $6
- ; 32R6: muhu $[[T1:[0-9]+]], $5, $7
- ; 32R6: addu $[[T0]], $[[T1]], $[[T0]]
- ; 32R6: mul $[[T2:[0-9]+]], $4, $7
- ; 32R6: addu $2, $[[T0]], $[[T2]]
- ; 32R6: mul $3, $5, $7
+ ; 32R6-DAG: mul $3, $5, $7
+ ; 32R6-DAG: mul $[[T0:[0-9]+]], $4, $7
+ ; 32R6-DAG: mul $[[T1:[0-9]+]], $5, $6
+ ; 32R6: muhu $[[T2:[0-9]+]], $5, $7
+ ; 32R6: addu $[[T1]], $[[T2]], $[[T1]]
+ ; 32R6: addu $2, $[[T1]], $[[T0]]
; M4: dmult $4, $5
; M4: mflo $2
@@ -184,6 +205,21 @@ entry:
; 64R6: dmul $2, $4, $5
+ ; MM32R3: multu $[[T0:[0-9]+]], $7
+ ; MM32R3: mflo $[[T1:[0-9]+]]
+ ; MM32R3: mfhi $[[T2:[0-9]+]]
+ ; MM32R3: mul $[[T3:[0-9]+]], $4, $7
+ ; MM32R3: mul $[[T0]], $[[T0]], $6
+ ; MM32R3: addu16 $[[T2]], $[[T2]], $[[T0]]
+ ; MM32R3: addu16 $2, $[[T2]], $[[T3]]
+
+ ; MM32R6: mul $[[T0:[0-9]+]], $5, $7
+ ; MM32R6: mul $[[T1:[0-9]+]], $4, $7
+ ; MM32R6: mul $[[T2:[0-9]+]], $5, $6
+ ; MM32R6: muhu $[[T3:[0-9]+]], $5, $7
+ ; MM32R6: addu16 $[[T2]], $[[T3]], $[[T2]]
+ ; MM32R6: addu16 $2, $[[T2]], $[[T1]]
+
%r = mul i64 %a, %b
ret i64 %r
}
@@ -204,12 +240,14 @@ entry:
; GP64-NOT-R6: daddu $[[T3:[0-9]+]], $[[T2]], $[[T1]]
; GP64-NOT-R6: daddu $2, $[[T3:[0-9]+]], $[[T0]]
- ; 64R6: dmul $[[T0:[0-9]+]], $5, $6
- ; 64R6: dmuhu $[[T1:[0-9]+]], $5, $7
- ; 64R6: daddu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
- ; 64R6: dmul $[[T3:[0-9]+]], $4, $7
- ; 64R6: daddu $2, $[[T2]], $[[T3]]
- ; 64R6: dmul $3, $5, $7
+ ; 64R6-DAG: dmul $3, $5, $7
+ ; 64R6-DAG: dmul $[[T0:[0-9]+]], $4, $7
+ ; 64R6-DAG: dmul $[[T1:[0-9]+]], $5, $6
+ ; 64R6: dmuhu $[[T2:[0-9]+]], $5, $7
+ ; 64R6: daddu $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+ ; 64R6: daddu $2, $[[T1]], $[[T0]]
+
+ ; MM32: lw $25, %call16(__multi3)($2)
%r = mul i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/not.ll b/test/CodeGen/Mips/llvm-ir/not.ll
new file mode 100644
index 000000000000..5f7374f6dfbb
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/not.ll
@@ -0,0 +1,239 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM64
+
+define signext i1 @not_i1(i1 signext %a) {
+entry:
+; ALL-LABEL: not_i1:
+
+ ; GP32: not $2, $4
+
+ ; GP64: not $2, $4
+
+ ; MM: not16 $2, $4
+
+ %r = xor i1 %a, -1
+ ret i1 %r
+}
+
+define signext i8 @not_i8(i8 signext %a) {
+entry:
+; ALL-LABEL: not_i8:
+
+ ; GP32: not $2, $4
+
+ ; GP64: not $2, $4
+
+ ; MM: not16 $2, $4
+
+ %r = xor i8 %a, -1
+ ret i8 %r
+}
+
+define signext i16 @not_i16(i16 signext %a) {
+entry:
+; ALL-LABEL: not_i16:
+
+ ; GP32: not $2, $4
+
+ ; GP64: not $2, $4
+
+ ; MM: not16 $2, $4
+
+ %r = xor i16 %a, -1
+ ret i16 %r
+}
+
+define signext i32 @not_i32(i32 signext %a) {
+entry:
+; ALL-LABEL: not_i32:
+
+ ; GP32: not $2, $4
+
+ ; GP64: not $2, $4
+
+ ; MM: not16 $2, $4
+
+ %r = xor i32 %a, -1
+ ret i32 %r
+}
+
+define signext i64 @not_i64(i64 signext %a) {
+entry:
+; ALL-LABEL: not_i64:
+
+ ; GP32: not $2, $4
+ ; GP32: not $3, $5
+
+ ; GP64: daddiu $[[T0:[0-9]+]], $zero, -1
+ ; GP64: xor $2, $4, $[[T0]]
+
+ ; MM32: not16 $2, $4
+ ; MM32: not16 $3, $5
+
+ ; MM64: daddiu $[[T0:[0-9]+]], $zero, -1
+ ; MM64: xor $2, $4, $[[T0]]
+
+ %r = xor i64 %a, -1
+ ret i64 %r
+}
+
+define signext i128 @not_i128(i128 signext %a) {
+entry:
+; ALL-LABEL: not_i128:
+
+ ; GP32: not $2, $4
+ ; GP32: not $3, $5
+ ; GP32: not $4, $6
+ ; GP32: not $5, $7
+
+ ; GP64: daddiu $[[T0:[0-9]+]], $zero, -1
+ ; GP64: xor $2, $4, $[[T0]]
+ ; GP64: xor $3, $5, $[[T0]]
+
+ ; MM32: not16 $2, $4
+ ; MM32: not16 $3, $5
+ ; MM32: not16 $4, $6
+ ; MM32: not16 $5, $7
+
+ ; MM64: daddiu $[[T0:[0-9]+]], $zero, -1
+ ; MM64: xor $2, $4, $[[T0]]
+ ; MM64: xor $3, $5, $[[T0]]
+
+ %r = xor i128 %a, -1
+ ret i128 %r
+}
+
+define signext i1 @nor_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: nor_i1:
+
+ ; ALL: nor $2, $5, $4
+
+ %or = or i1 %b, %a
+ %r = xor i1 %or, -1
+ ret i1 %r
+}
+
+define signext i8 @nor_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: nor_i8:
+
+ ; ALL: nor $2, $5, $4
+
+ %or = or i8 %b, %a
+ %r = xor i8 %or, -1
+ ret i8 %r
+}
+
+define signext i16 @nor_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: nor_i16:
+
+ ; ALL: nor $2, $5, $4
+
+ %or = or i16 %b, %a
+ %r = xor i16 %or, -1
+ ret i16 %r
+}
+
+define signext i32 @nor_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: nor_i32:
+
+ ; GP32: nor $2, $5, $4
+
+ ; GP64: or $[[T0:[0-9]+]], $5, $4
+ ; GP64: sll $[[T1:[0-9]+]], $[[T0]], 0
+ ; GP64: not $2, $[[T1]]
+
+ ; MM32: nor $2, $5, $4
+
+ ; MM64: or $[[T0:[0-9]+]], $5, $4
+ ; MM64: sll $[[T1:[0-9]+]], $[[T0]], 0
+ ; MM64: not16 $2, $[[T1]]
+
+ %or = or i32 %b, %a
+ %r = xor i32 %or, -1
+ ret i32 %r
+}
+
+
+define signext i64 @nor_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: nor_i64:
+
+ ; GP32: nor $2, $6, $4
+ ; GP32: nor $3, $7, $5
+
+ ; GP64: nor $2, $5, $4
+
+ ; MM32: nor $2, $6, $4
+ ; MM32: nor $3, $7, $5
+
+ ; MM64: nor $2, $5, $4
+
+ %or = or i64 %b, %a
+ %r = xor i64 %or, -1
+ ret i64 %r
+}
+
+define signext i128 @nor_i128(i128 signext %a, i128 signext %b) {
+entry:
+; ALL-LABEL: nor_i128:
+
+ ; GP32: lw $[[T0:[0-9]+]], 24($sp)
+ ; GP32: lw $[[T1:[0-9]+]], 20($sp)
+ ; GP32: lw $[[T2:[0-9]+]], 16($sp)
+ ; GP32: nor $2, $[[T2]], $4
+ ; GP32: nor $3, $[[T1]], $5
+ ; GP32: nor $4, $[[T0]], $6
+ ; GP32: lw $[[T3:[0-9]+]], 28($sp)
+ ; GP32: nor $5, $[[T3]], $7
+
+ ; GP64: nor $2, $6, $4
+ ; GP64: nor $3, $7, $5
+
+ ; MM32: lw $[[T0:[0-9]+]], 20($sp)
+ ; MM32: lw $[[T1:[0-9]+]], 16($sp)
+ ; MM32: nor $2, $[[T1]], $4
+ ; MM32: nor $3, $[[T0]], $5
+ ; MM32: lw $[[T2:[0-9]+]], 24($sp)
+ ; MM32: nor $4, $[[T2]], $6
+ ; MM32: lw $[[T3:[0-9]+]], 28($sp)
+ ; MM32: nor $5, $[[T3]], $7
+
+ ; MM64: nor $2, $6, $4
+ ; MM64: nor $3, $7, $5
+
+ %or = or i128 %b, %a
+ %r = xor i128 %or, -1
+ ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/or.ll b/test/CodeGen/Mips/llvm-ir/or.ll
index 8509d6ce93f3..192e5de62301 100644
--- a/test/CodeGen/Mips/llvm-ir/or.ll
+++ b/test/CodeGen/Mips/llvm-ir/or.ll
@@ -1,35 +1,33 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM64
define signext i1 @or_i1(i1 signext %a, i1 signext %b) {
entry:
; ALL-LABEL: or_i1:
- ; ALL: or $2, $4, $5
+ ; GP32: or $2, $4, $5
+
+ ; GP64: or $2, $4, $5
+
+ ; MM: or16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = or i1 %a, %b
ret i1 %r
@@ -39,7 +37,12 @@ define signext i8 @or_i8(i8 signext %a, i8 signext %b) {
entry:
; ALL-LABEL: or_i8:
- ; ALL: or $2, $4, $5
+ ; GP32: or $2, $4, $5
+
+ ; GP64: or $2, $4, $5
+
+ ; MM: or16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = or i8 %a, %b
ret i8 %r
@@ -49,7 +52,12 @@ define signext i16 @or_i16(i16 signext %a, i16 signext %b) {
entry:
; ALL-LABEL: or_i16:
- ; ALL: or $2, $4, $5
+ ; GP32: or $2, $4, $5
+
+ ; GP64: or $2, $4, $5
+
+ ; MM: or16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = or i16 %a, %b
ret i16 %r
@@ -59,12 +67,18 @@ define signext i32 @or_i32(i32 signext %a, i32 signext %b) {
entry:
; ALL-LABEL: or_i32:
- ; GP32: or $2, $4, $5
+ ; GP32: or $2, $4, $5
- ; GP64: or $[[T0:[0-9]+]], $4, $5
+ ; GP64: or $[[T0:[0-9]+]], $4, $5
; FIXME: The sll instruction below is redundant.
; GP64: sll $2, $[[T0]], 0
+ ; MM32: or16 $[[T0:[0-9]+]], $5
+ ; MM32: move $2, $[[T0]]
+
+ ; MM64: or $[[T0:[0-9]+]], $4, $5
+ ; MM64: sll $2, $[[T0]], 0
+
%r = or i32 %a, %b
ret i32 %r
}
@@ -73,10 +87,17 @@ define signext i64 @or_i64(i64 signext %a, i64 signext %b) {
entry:
; ALL-LABEL: or_i64:
- ; GP32: or $2, $4, $6
- ; GP32: or $3, $5, $7
+ ; GP32: or $2, $4, $6
+ ; GP32: or $3, $5, $7
+
+ ; GP64: or $2, $4, $5
- ; GP64: or $2, $4, $5
+ ; MM32: or16 $[[T0:[0-9]+]], $6
+ ; MM32: or16 $[[T1:[0-9]+]], $7
+ ; MM32: move $2, $[[T0]]
+ ; MM32: move $3, $[[T1]]
+
+ ; MM64: or $2, $4, $5
%r = or i64 %a, %b
ret i64 %r
@@ -86,18 +107,557 @@ define signext i128 @or_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: or_i128:
- ; GP32: lw $[[T0:[0-9]+]], 24($sp)
- ; GP32: lw $[[T1:[0-9]+]], 20($sp)
- ; GP32: lw $[[T2:[0-9]+]], 16($sp)
- ; GP32: or $2, $4, $[[T2]]
- ; GP32: or $3, $5, $[[T1]]
- ; GP32: or $4, $6, $[[T0]]
- ; GP32: lw $[[T3:[0-9]+]], 28($sp)
- ; GP32: or $5, $7, $[[T3]]
+ ; GP32: lw $[[T0:[0-9]+]], 24($sp)
+ ; GP32: lw $[[T1:[0-9]+]], 20($sp)
+ ; GP32: lw $[[T2:[0-9]+]], 16($sp)
+ ; GP32: or $2, $4, $[[T2]]
+ ; GP32: or $3, $5, $[[T1]]
+ ; GP32: or $4, $6, $[[T0]]
+ ; GP32: lw $[[T3:[0-9]+]], 28($sp)
+ ; GP32: or $5, $7, $[[T3]]
+
+ ; GP64: or $2, $4, $6
+ ; GP64: or $3, $5, $7
+
+ ; MM32: lw $[[T0:[0-9]+]], 20($sp)
+ ; MM32: lw $[[T1:[0-9]+]], 16($sp)
+ ; MM32: or16 $[[T1]], $4
+ ; MM32: or16 $[[T0]], $5
+ ; MM32: lw $[[T2:[0-9]+]], 24($sp)
+ ; MM32: or16 $[[T2]], $6
+ ; MM32: lw $[[T3:[0-9]+]], 28($sp)
+ ; MM32: or16 $[[T3]], $7
- ; GP64: or $2, $4, $6
- ; GP64: or $3, $5, $7
+ ; MM64: or $2, $4, $6
+ ; MM64: or $3, $5, $7
%r = or i128 %a, %b
ret i128 %r
}
+
+define signext i1 @or_i1_4(i1 signext %b) {
+entry:
+; ALL-LABEL: or_i1_4:
+
+ ; ALL: move $2, $4
+
+ %r = or i1 4, %b
+ ret i1 %r
+}
+
+define signext i8 @or_i8_4(i8 signext %b) {
+entry:
+; ALL-LABEL: or_i8_4:
+
+ ; ALL: ori $2, $4, 4
+
+ %r = or i8 4, %b
+ ret i8 %r
+}
+
+define signext i16 @or_i16_4(i16 signext %b) {
+entry:
+; ALL-LABEL: or_i16_4:
+
+ ; ALL: ori $2, $4, 4
+
+ %r = or i16 4, %b
+ ret i16 %r
+}
+
+define signext i32 @or_i32_4(i32 signext %b) {
+entry:
+; ALL-LABEL: or_i32_4:
+
+ ; ALL: ori $2, $4, 4
+
+ %r = or i32 4, %b
+ ret i32 %r
+}
+
+define signext i64 @or_i64_4(i64 signext %b) {
+entry:
+; ALL-LABEL: or_i64_4:
+
+ ; GP32: ori $3, $5, 4
+ ; GP32: move $2, $4
+
+ ; GP64: ori $2, $4, 4
+
+ ; MM32: ori $3, $5, 4
+ ; MM32: move $2, $4
+
+ ; MM64: ori $2, $4, 4
+
+ %r = or i64 4, %b
+ ret i64 %r
+}
+
+define signext i128 @or_i128_4(i128 signext %b) {
+entry:
+; ALL-LABEL: or_i128_4:
+
+ ; GP32: ori $[[T0:[0-9]+]], $7, 4
+ ; GP32: move $2, $4
+ ; GP32: move $3, $5
+ ; GP32: move $4, $6
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: ori $3, $5, 4
+ ; GP64: move $2, $4
+
+ ; MM32: ori $[[T0:[0-9]+]], $7, 4
+ ; MM32: move $2, $4
+ ; MM32: move $3, $5
+ ; MM32: move $4, $6
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: ori $3, $5, 4
+ ; MM64: move $2, $4
+
+ %r = or i128 4, %b
+ ret i128 %r
+}
+
+define signext i1 @or_i1_31(i1 signext %b) {
+entry:
+; ALL-LABEL: or_i1_31:
+
+ ; GP32: addiu $2, $zero, -1
+
+ ; GP64: addiu $2, $zero, -1
+
+ ; MM: li16 $2, -1
+
+ %r = or i1 31, %b
+ ret i1 %r
+}
+
+define signext i8 @or_i8_31(i8 signext %b) {
+entry:
+; ALL-LABEL: or_i8_31:
+
+ ; ALL: ori $2, $4, 31
+
+ %r = or i8 31, %b
+ ret i8 %r
+}
+
+define signext i16 @or_i16_31(i16 signext %b) {
+entry:
+; ALL-LABEL: or_i16_31:
+
+ ; ALL: ori $2, $4, 31
+
+ %r = or i16 31, %b
+ ret i16 %r
+}
+
+define signext i32 @or_i32_31(i32 signext %b) {
+entry:
+; ALL-LABEL: or_i32_31:
+
+ ; ALL: ori $2, $4, 31
+
+ %r = or i32 31, %b
+ ret i32 %r
+}
+
+define signext i64 @or_i64_31(i64 signext %b) {
+entry:
+; ALL-LABEL: or_i64_31:
+
+ ; GP32: ori $3, $5, 31
+ ; GP32: move $2, $4
+
+ ; GP64: ori $2, $4, 31
+
+ ; MM32: ori $3, $5, 31
+ ; MM32: move $2, $4
+
+ ; MM64: ori $2, $4, 31
+
+ %r = or i64 31, %b
+ ret i64 %r
+}
+
+define signext i128 @or_i128_31(i128 signext %b) {
+entry:
+; ALL-LABEL: or_i128_31:
+
+ ; GP32: ori $[[T0:[0-9]+]], $7, 31
+ ; GP32: move $2, $4
+ ; GP32: move $3, $5
+ ; GP32: move $4, $6
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: ori $3, $5, 31
+ ; GP64: move $2, $4
+
+ ; MM32: ori $[[T0:[0-9]+]], $7, 31
+ ; MM32: move $2, $4
+ ; MM32: move $3, $5
+ ; MM32: move $4, $6
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: ori $3, $5, 31
+ ; MM64: move $2, $4
+
+ %r = or i128 31, %b
+ ret i128 %r
+}
+
+define signext i1 @or_i1_255(i1 signext %b) {
+entry:
+; ALL-LABEL: or_i1_255:
+
+ ; GP32: addiu $2, $zero, -1
+
+ ; GP64: addiu $2, $zero, -1
+
+ ; MM: li16 $2, -1
+
+ %r = or i1 255, %b
+ ret i1 %r
+}
+
+define signext i8 @or_i8_255(i8 signext %b) {
+entry:
+; ALL-LABEL: or_i8_255:
+
+ ; GP32: addiu $2, $zero, -1
+
+ ; GP64: addiu $2, $zero, -1
+
+ ; MM: li16 $2, -1
+
+ %r = or i8 255, %b
+ ret i8 %r
+}
+
+define signext i16 @or_i16_255(i16 signext %b) {
+entry:
+; ALL-LABEL: or_i16_255:
+
+ ; ALL: ori $2, $4, 255
+
+ %r = or i16 255, %b
+ ret i16 %r
+}
+
+define signext i32 @or_i32_255(i32 signext %b) {
+entry:
+; ALL-LABEL: or_i32_255:
+
+ ; ALL: ori $2, $4, 255
+
+ %r = or i32 255, %b
+ ret i32 %r
+}
+
+define signext i64 @or_i64_255(i64 signext %b) {
+entry:
+; ALL-LABEL: or_i64_255:
+
+ ; GP32: ori $3, $5, 255
+ ; GP32: move $2, $4
+
+ ; GP64: ori $2, $4, 255
+
+ ; MM32: ori $3, $5, 255
+ ; MM32: move $2, $4
+
+ ; MM64: ori $2, $4, 255
+
+ %r = or i64 255, %b
+ ret i64 %r
+}
+
+define signext i128 @or_i128_255(i128 signext %b) {
+entry:
+; ALL-LABEL: or_i128_255:
+
+ ; GP32: ori $[[T0:[0-9]+]], $7, 255
+ ; GP32: move $2, $4
+ ; GP32: move $3, $5
+ ; GP32: move $4, $6
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: ori $3, $5, 255
+ ; GP64: move $2, $4
+
+ ; MM32: ori $[[T0:[0-9]+]], $7, 255
+ ; MM32: move $2, $4
+ ; MM32: move $3, $5
+ ; MM32: move $4, $6
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: ori $3, $5, 255
+ ; MM64: move $2, $4
+
+ %r = or i128 255, %b
+ ret i128 %r
+}
+
+define signext i1 @or_i1_32768(i1 signext %b) {
+entry:
+; ALL-LABEL: or_i1_32768:
+
+ ; ALL: move $2, $4
+
+ %r = or i1 32768, %b
+ ret i1 %r
+}
+
+define signext i8 @or_i8_32768(i8 signext %b) {
+entry:
+; ALL-LABEL: or_i8_32768:
+
+ ; ALL: move $2, $4
+
+ %r = or i8 32768, %b
+ ret i8 %r
+}
+
+define signext i16 @or_i16_32768(i16 signext %b) {
+entry:
+; ALL-LABEL: or_i16_32768:
+
+ ; GP32: addiu $[[T0:[0-9]+]], $zero, -32768
+ ; GP32: or $2, $4, $[[T0]]
+
+ ; GP64: addiu $[[T0:[0-9]+]], $zero, -32768
+ ; GP64: or $2, $4, $[[T0]]
+
+ ; MM: addiu $2, $zero, -32768
+ ; MM: or16 $2, $4
+
+ %r = or i16 32768, %b
+ ret i16 %r
+}
+
+define signext i32 @or_i32_32768(i32 signext %b) {
+entry:
+; ALL-LABEL: or_i32_32768:
+
+ ; ALL: ori $2, $4, 32768
+
+ %r = or i32 32768, %b
+ ret i32 %r
+}
+
+define signext i64 @or_i64_32768(i64 signext %b) {
+entry:
+; ALL-LABEL: or_i64_32768:
+
+ ; GP32: ori $3, $5, 32768
+ ; GP32: move $2, $4
+
+ ; GP64: ori $2, $4, 32768
+
+ ; MM32: ori $3, $5, 32768
+ ; MM32: move $2, $4
+
+ ; MM64: ori $2, $4, 32768
+
+ %r = or i64 32768, %b
+ ret i64 %r
+}
+
+define signext i128 @or_i128_32768(i128 signext %b) {
+entry:
+; ALL-LABEL: or_i128_32768:
+
+ ; GP32: ori $[[T0:[0-9]+]], $7, 32768
+ ; GP32: move $2, $4
+ ; GP32: move $3, $5
+ ; GP32: move $4, $6
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: ori $3, $5, 32768
+ ; GP64: move $2, $4
+
+ ; MM32: ori $[[T0:[0-9]+]], $7, 32768
+ ; MM32: move $2, $4
+ ; MM32: move $3, $5
+ ; MM32: move $4, $6
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: ori $3, $5, 32768
+ ; MM64: move $2, $4
+
+ %r = or i128 32768, %b
+ ret i128 %r
+}
+
+define signext i1 @or_i1_65(i1 signext %b) {
+entry:
+; ALL-LABEL: or_i1_65:
+
+ ; GP32: addiu $2, $zero, -1
+
+ ; GP64: addiu $2, $zero, -1
+
+ ; MM: li16 $2, -1
+
+ %r = or i1 65, %b
+ ret i1 %r
+}
+
+define signext i8 @or_i8_65(i8 signext %b) {
+entry:
+; ALL-LABEL: or_i8_65:
+
+ ; ALL: ori $2, $4, 65
+
+ %r = or i8 65, %b
+ ret i8 %r
+}
+
+define signext i16 @or_i16_65(i16 signext %b) {
+entry:
+; ALL-LABEL: or_i16_65:
+
+ ; ALL: ori $2, $4, 65
+
+ %r = or i16 65, %b
+ ret i16 %r
+}
+
+define signext i32 @or_i32_65(i32 signext %b) {
+entry:
+; ALL-LABEL: or_i32_65:
+
+ ; ALL: ori $2, $4, 65
+
+ %r = or i32 65, %b
+ ret i32 %r
+}
+
+define signext i64 @or_i64_65(i64 signext %b) {
+entry:
+; ALL-LABEL: or_i64_65:
+
+ ; GP32: ori $3, $5, 65
+ ; GP32: move $2, $4
+
+ ; GP64: ori $2, $4, 65
+
+ ; MM32: ori $3, $5, 65
+ ; MM32: move $2, $4
+
+ ; MM64: ori $2, $4, 65
+
+ %r = or i64 65, %b
+ ret i64 %r
+}
+
+define signext i128 @or_i128_65(i128 signext %b) {
+entry:
+; ALL-LABEL: or_i128_65:
+
+ ; GP32: ori $[[T0:[0-9]+]], $7, 65
+ ; GP32: move $2, $4
+ ; GP32: move $3, $5
+ ; GP32: move $4, $6
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: ori $3, $5, 65
+ ; GP64: move $2, $4
+
+ ; MM32: ori $[[T0:[0-9]+]], $7, 65
+ ; MM32: move $2, $4
+ ; MM32: move $3, $5
+ ; MM32: move $4, $6
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: ori $3, $5, 65
+ ; MM64: move $2, $4
+
+ %r = or i128 65, %b
+ ret i128 %r
+}
+
+define signext i1 @or_i1_256(i1 signext %b) {
+entry:
+; ALL-LABEL: or_i1_256:
+
+ ; ALL: move $2, $4
+
+ %r = or i1 256, %b
+ ret i1 %r
+}
+
+define signext i8 @or_i8_256(i8 signext %b) {
+entry:
+; ALL-LABEL: or_i8_256:
+
+ ; ALL: move $2, $4
+
+ %r = or i8 256, %b
+ ret i8 %r
+}
+
+define signext i16 @or_i16_256(i16 signext %b) {
+entry:
+; ALL-LABEL: or_i16_256:
+
+ ; ALL: ori $2, $4, 256
+
+ %r = or i16 256, %b
+ ret i16 %r
+}
+
+define signext i32 @or_i32_256(i32 signext %b) {
+entry:
+; ALL-LABEL: or_i32_256:
+
+ ; ALL: ori $2, $4, 256
+
+ %r = or i32 256, %b
+ ret i32 %r
+}
+
+define signext i64 @or_i64_256(i64 signext %b) {
+entry:
+; ALL-LABEL: or_i64_256:
+
+ ; GP32: ori $3, $5, 256
+ ; GP32: move $2, $4
+
+ ; GP64: ori $2, $4, 256
+
+ ; MM32: ori $3, $5, 256
+ ; MM32: move $2, $4
+
+ ; MM64: ori $2, $4, 256
+
+ %r = or i64 256, %b
+ ret i64 %r
+}
+
+define signext i128 @or_i128_256(i128 signext %b) {
+entry:
+; ALL-LABEL: or_i128_256:
+
+ ; GP32: ori $[[T0:[0-9]+]], $7, 256
+ ; GP32: move $2, $4
+ ; GP32: move $3, $5
+ ; GP32: move $4, $6
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: ori $3, $5, 256
+ ; GP64: move $2, $4
+
+ ; MM32: ori $[[T0:[0-9]+]], $7, 256
+ ; MM32: move $2, $4
+ ; MM32: move $3, $5
+ ; MM32: move $4, $6
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: ori $3, $5, 256
+ ; MM64: move $2, $4
+
+ %r = or i128 256, %b
+ ret i128 %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/ret.ll b/test/CodeGen/Mips/llvm-ir/ret.ll
index 0561c24219ce..9be80dc200fa 100644
--- a/test/CodeGen/Mips/llvm-ir/ret.ll
+++ b/test/CodeGen/Mips/llvm-ir/ret.ll
@@ -7,23 +7,30 @@
; affects it and it's undesirable to repeat the non-pointer returns for each
; relocation model.
-; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=NO-MTHC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips -mcpu=mips32r3 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips -mcpu=mips32r5 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=R6
-; RUN: llc -march=mips64 -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64r3 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64r5 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
-; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=R6
+; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR32,NO-MTHC1,NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR32,MTHC1,NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r3 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR32,MTHC1,NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r5 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR32,MTHC1,NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR32,MTHC1,R6C
+; RUN: llc -march=mips64 -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR64,DMTC1,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR64,DMTC1,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR64,DMTC1,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r3 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR64,DMTC1,NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r5 -asm-show-inst < %s | FileCheck %s -check-prefixes=ALL,GPR64,DMTC1,NOT-R6
+
+; FIXME: for the test ret_double_0x0, the delay slot of jr cannot be filled
+; as mthc1 has unmodeled side effects. This is an artifact of our backend.
+; Force the delay slot filler off to check that the sequence jr $ra; nop is
+; turned into jic 0, $ra.
+
+; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst -disable-mips-delay-filler < %s | FileCheck %s -check-prefixes=ALL,GPR64,DMTC1,R6C
define void @ret_void() {
; ALL-LABEL: ret_void:
; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+; R6C-DAG: jrc $ra # <MCInst #{{[0-9]+}} JIC
ret void
}
@@ -173,6 +180,7 @@ define float @ret_float_0x3() {
; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+; R6C-DAG: jrc $ra # <MCInst #{{[0-9]+}} JIC
; float constants are written as double constants
ret float 0x36b8000000000000
@@ -191,6 +199,7 @@ define double @ret_double_0x0() {
; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+; R6C-DAG: jrc $ra # <MCInst #{{[0-9]+}} JIC
ret double 0x0000000000000000
}
@@ -204,6 +213,7 @@ define double @ret_double_0x3() {
; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+; R6C-DAG: jrc $ra # <MCInst #{{[0-9]+}} JIC
ret double 0x0000000000000003
}
diff --git a/test/CodeGen/Mips/llvm-ir/sdiv.ll b/test/CodeGen/Mips/llvm-ir/sdiv.ll
index 929ee88bb7f7..2d2b8ff12c04 100644
--- a/test/CodeGen/Mips/llvm-ir/sdiv.ll
+++ b/test/CodeGen/Mips/llvm-ir/sdiv.ll
@@ -1,29 +1,37 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=R2-R5 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=R6 -check-prefix=64R6
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,NOT-R2-R6,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,NOT-R2-R6,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,R2-R5,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,R2-R5,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,R2-R5,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R6,GP32
+
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,NOT-R2-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,NOT-R2-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,NOT-R2-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,R2-R5,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,R2-R5,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,R2-R5,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R6,64R6
+
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR3,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM64
define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) {
entry:
@@ -42,6 +50,17 @@ entry:
; R6: sll $[[T1:[0-9]+]], $[[T0]], 31
; R6: sra $2, $[[T1]], 31
+ ; MMR3: div $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mflo $[[T0:[0-9]+]]
+ ; MMR3: sll $[[T1:[0-9]+]], $[[T0]], 31
+ ; MMR3: sra $2, $[[T1]], 31
+
+ ; MMR6: div $[[T0:[0-9]+]], $4, $5
+ ; MMR6: teq $5, $zero, 7
+ ; MMR6: sll $[[T1:[0-9]+]], $[[T0]], 31
+ ; MMR6: sra $2, $[[T1]], 31
+
%r = sdiv i1 %a, %b
ret i1 %r
}
@@ -68,6 +87,15 @@ entry:
; FIXME: This instruction is redundant.
; R6: seb $2, $[[T0]]
+ ; MMR3: div $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mflo $[[T0:[0-9]+]]
+ ; MMR3: seb $2, $[[T0]]
+
+ ; MMR6: div $[[T0:[0-9]+]], $4, $5
+ ; MMR6: teq $5, $zero, 7
+ ; MMR6: seb $2, $[[T0]]
+
%r = sdiv i8 %a, %b
ret i8 %r
}
@@ -94,6 +122,15 @@ entry:
  ; FIXME: This instruction is redundant since div is signed.
; R6: seh $2, $[[T0]]
+ ; MMR3: div $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mflo $[[T0:[0-9]+]]
+ ; MMR3: seh $2, $[[T0]]
+
+ ; MMR6: div $[[T0:[0-9]+]], $4, $5
+ ; MMR6: teq $5, $zero, 7
+ ; MMR6: seh $2, $[[T0]]
+
%r = sdiv i16 %a, %b
ret i16 %r
}
@@ -109,6 +146,13 @@ entry:
; R6: div $2, $4, $5
; R6: teq $5, $zero, 7
+ ; MMR3: div $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mflo $2
+
+ ; MMR6: div $2, $4, $5
+ ; MMR6: teq $5, $zero, 7
+
%r = sdiv i32 %a, %b
ret i32 %r
}
@@ -126,6 +170,11 @@ entry:
; 64R6: ddiv $2, $4, $5
; 64R6: teq $5, $zero, 7
+ ; MM32: lw $25, %call16(__divdi3)($2)
+
+ ; MM64: ddiv $2, $4, $5
+ ; MM64: teq $5, $zero, 7
+
%r = sdiv i64 %a, %b
ret i64 %r
}
@@ -134,11 +183,15 @@ define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: sdiv_i128:
- ; GP32: lw $25, %call16(__divti3)($gp)
+ ; GP32: lw $25, %call16(__divti3)($gp)
+
+ ; GP64-NOT-R6: ld $25, %call16(__divti3)($gp)
+ ; 64R6: ld $25, %call16(__divti3)($gp)
+
+ ; MM32: lw $25, %call16(__divti3)($2)
- ; GP64-NOT-R6: ld $25, %call16(__divti3)($gp)
- ; 64R6: ld $25, %call16(__divti3)($gp)
+ ; MM64: ld $25, %call16(__divti3)($2)
- %r = sdiv i128 %a, %b
- ret i128 %r
+ %r = sdiv i128 %a, %b
+ ret i128 %r
}
diff --git a/test/CodeGen/Mips/llvm-ir/select-dbl.ll b/test/CodeGen/Mips/llvm-ir/select-dbl.ll
new file mode 100644
index 000000000000..1ca5b4e054ba
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/select-dbl.ll
@@ -0,0 +1,358 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,M2,M2-M3
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R1
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN: -check-prefixes=ALL,SEL-32,32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,M3,M2-M3
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN: -check-prefixes=ALL,SEL-64,64R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM32R3
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM32R6,SEL-32
+
+define double @tst_select_i1_double(i1 signext %s, double %x, double %y) {
+entry:
+ ; ALL-LABEL: tst_select_i1_double:
+
+ ; M2: andi $[[T0:[0-9]+]], $4, 1
+ ; M2: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M2: nop
+ ; M2: ldc1 $f0, 16($sp)
+ ; M2: jr $ra
+ ; M2: nop
+ ; M2: $[[BB0]]:
+ ; M2: mtc1 $7, $f0
+ ; M2: jr $ra
+ ; M2: mtc1 $6, $f1
+
+ ; CMOV-32: mtc1 $7, $[[F0:f[0-9]+]]
+ ; CMOV-32R1: mtc1 $6, $f{{[0-9]+}}
+ ; CMOV-32R2-R5: mthc1 $6, $[[F0]]
+ ; CMOV-32: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV-32: ldc1 $f0, 16($sp)
+ ; CMOV-32: movn.d $f0, $[[F0]], $[[T0]]
+
+ ; SEL-32: mtc1 $7, $[[F0:f[0-9]+]]
+ ; SEL-32: mthc1 $6, $[[F0]]
+ ; SEL-32: ldc1 $[[F1:f[0-9]+]], 16($sp)
+ ; SEL-32: mtc1 $4, $f0
+ ; SEL-32: sel.d $f0, $[[F1]], $[[F0]]
+
+ ; M3: andi $[[T0:[0-9]+]], $4, 1
+ ; M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M3: nop
+ ; M3: mov.d $f13, $f14
+ ; M3: $[[BB0]]:
+ ; M3: jr $ra
+ ; M3: mov.d $f0, $f13
+
+ ; CMOV-64: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV-64: movn.d $f14, $f13, $[[T0]]
+ ; CMOV-64: mov.d $f0, $f14
+
+ ; SEL-64: mtc1 $4, $f0
+ ; SEL-64: sel.d $f0, $f14, $f13
+
+ ; MM32R3: mtc1 $7, $[[F0:f[0-9]+]]
+ ; MM32R3: mthc1 $6, $[[F0]]
+ ; MM32R3: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MM32R3: ldc1 $f0, 16($sp)
+ ; MM32R3: movn.d $f0, $[[F0]], $[[T0]]
+
+ %r = select i1 %s, double %x, double %y
+ ret double %r
+}
+
+define double @tst_select_i1_double_reordered(double %x, double %y,
+ i1 signext %s) {
+entry:
+ ; ALL-LABEL: tst_select_i1_double_reordered:
+
+ ; M2: lw $[[T0:[0-9]+]], 16($sp)
+ ; M2: andi $[[T1:[0-9]+]], $[[T0]], 1
+ ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]]
+ ; M2: nop
+ ; M2: mov.d $f12, $f14
+ ; M2: $[[BB0]]:
+ ; M2: jr $ra
+ ; M2: mov.d $f0, $f12
+
+ ; CMOV-32: lw $[[T0:[0-9]+]], 16($sp)
+ ; CMOV-32: andi $[[T1:[0-9]+]], $[[T0]], 1
+ ; CMOV-32: movn.d $f14, $f12, $[[T1]]
+ ; CMOV-32: mov.d $f0, $f14
+
+ ; SEL-32: lw $[[T0:[0-9]+]], 16($sp)
+ ; SEL-32: mtc1 $[[T0]], $f0
+ ; SEL-32: sel.d $f0, $f14, $f12
+
+ ; M3: andi $[[T0:[0-9]+]], $6, 1
+ ; M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M3: nop
+ ; M3: mov.d $f12, $f13
+ ; M3: $[[BB0]]:
+ ; M3: jr $ra
+ ; M3: mov.d $f0, $f12
+
+ ; CMOV-64: andi $[[T0:[0-9]+]], $6, 1
+ ; CMOV-64: movn.d $f13, $f12, $[[T0]]
+ ; CMOV-64: mov.d $f0, $f13
+
+ ; SEL-64: mtc1 $6, $f0
+ ; SEL-64: sel.d $f0, $f13, $f12
+
+ ; MM32R3: lw $[[T0:[0-9]+]], 16($sp)
+ ; MM32R3: andi16 $[[T1:[0-9]+]], $[[T0:[0-9]+]], 1
+ ; MM32R3: movn.d $f14, $f12, $[[T1]]
+ ; MM32R3: mov.d $f0, $f14
+
+ %r = select i1 %s, double %x, double %y
+ ret double %r
+}
+
+define double @tst_select_fcmp_olt_double(double %x, double %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_olt_double:
+
+ ; M2: c.olt.d $f12, $f14
+ ; M3: c.olt.d $f12, $f13
+ ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.d $f12, $f14
+ ; M3: mov.d $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.d $f0, $f12
+
+ ; CMOV-32: c.olt.d $f12, $f14
+ ; CMOV-32: movt.d $f14, $f12, $fcc0
+ ; CMOV-32: mov.d $f0, $f14
+
+ ; SEL-32: cmp.lt.d $f0, $f12, $f14
+ ; SEL-32: sel.d $f0, $f14, $f12
+
+ ; CMOV-64: c.olt.d $f12, $f13
+ ; CMOV-64: movt.d $f13, $f12, $fcc0
+ ; CMOV-64: mov.d $f0, $f13
+
+ ; SEL-64: cmp.lt.d $f0, $f12, $f13
+ ; SEL-64: sel.d $f0, $f13, $f12
+
+ ; MM32R3: c.olt.d $f12, $f14
+ ; MM32R3: movt.d $f14, $f12, $fcc0
+ ; MM32R3: mov.d $f0, $f14
+
+ %s = fcmp olt double %x, %y
+ %r = select i1 %s, double %x, double %y
+ ret double %r
+}
+
+define double @tst_select_fcmp_ole_double(double %x, double %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_ole_double:
+
+ ; M2: c.ole.d $f12, $f14
+ ; M3: c.ole.d $f12, $f13
+ ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.d $f12, $f14
+ ; M3: mov.d $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.d $f0, $f12
+
+ ; CMOV-32: c.ole.d $f12, $f14
+ ; CMOV-32: movt.d $f14, $f12, $fcc0
+ ; CMOV-32: mov.d $f0, $f14
+
+ ; SEL-32: cmp.le.d $f0, $f12, $f14
+ ; SEL-32: sel.d $f0, $f14, $f12
+
+ ; CMOV-64: c.ole.d $f12, $f13
+ ; CMOV-64: movt.d $f13, $f12, $fcc0
+ ; CMOV-64: mov.d $f0, $f13
+
+ ; SEL-64: cmp.le.d $f0, $f12, $f13
+ ; SEL-64: sel.d $f0, $f13, $f12
+
+ ; MM32R3: c.ole.d $f12, $f14
+ ; MM32R3: movt.d $f14, $f12, $fcc0
+ ; MM32R3: mov.d $f0, $f14
+
+ %s = fcmp ole double %x, %y
+ %r = select i1 %s, double %x, double %y
+ ret double %r
+}
+
+define double @tst_select_fcmp_ogt_double(double %x, double %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_ogt_double:
+
+ ; M2: c.ule.d $f12, $f14
+ ; M3: c.ule.d $f12, $f13
+ ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.d $f12, $f14
+ ; M3: mov.d $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.d $f0, $f12
+
+ ; CMOV-32: c.ule.d $f12, $f14
+ ; CMOV-32: movf.d $f14, $f12, $fcc0
+ ; CMOV-32: mov.d $f0, $f14
+
+ ; SEL-32: cmp.lt.d $f0, $f14, $f12
+ ; SEL-32: sel.d $f0, $f14, $f12
+
+ ; CMOV-64: c.ule.d $f12, $f13
+ ; CMOV-64: movf.d $f13, $f12, $fcc0
+ ; CMOV-64: mov.d $f0, $f13
+
+ ; SEL-64: cmp.lt.d $f0, $f13, $f12
+ ; SEL-64: sel.d $f0, $f13, $f12
+
+ ; MM32R3: c.ule.d $f12, $f14
+ ; MM32R3: movf.d $f14, $f12, $fcc0
+ ; MM32R3: mov.d $f0, $f14
+
+ %s = fcmp ogt double %x, %y
+ %r = select i1 %s, double %x, double %y
+ ret double %r
+}
+
+define double @tst_select_fcmp_oge_double(double %x, double %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_oge_double:
+
+ ; M2: c.ult.d $f12, $f14
+ ; M3: c.ult.d $f12, $f13
+ ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.d $f12, $f14
+ ; M3: mov.d $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.d $f0, $f12
+
+ ; CMOV-32: c.ult.d $f12, $f14
+ ; CMOV-32: movf.d $f14, $f12, $fcc0
+ ; CMOV-32: mov.d $f0, $f14
+
+ ; SEL-32: cmp.le.d $f0, $f14, $f12
+ ; SEL-32: sel.d $f0, $f14, $f12
+
+ ; CMOV-64: c.ult.d $f12, $f13
+ ; CMOV-64: movf.d $f13, $f12, $fcc0
+ ; CMOV-64: mov.d $f0, $f13
+
+ ; SEL-64: cmp.le.d $f0, $f13, $f12
+ ; SEL-64: sel.d $f0, $f13, $f12
+
+ ; MM32R3: c.ult.d $f12, $f14
+ ; MM32R3: movf.d $f14, $f12, $fcc0
+ ; MM32R3: mov.d $f0, $f14
+
+ %s = fcmp oge double %x, %y
+ %r = select i1 %s, double %x, double %y
+ ret double %r
+}
+
+define double @tst_select_fcmp_oeq_double(double %x, double %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_oeq_double:
+
+ ; M2: c.eq.d $f12, $f14
+ ; M3: c.eq.d $f12, $f13
+ ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.d $f12, $f14
+ ; M3: mov.d $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.d $f0, $f12
+
+ ; CMOV-32: c.eq.d $f12, $f14
+ ; CMOV-32: movt.d $f14, $f12, $fcc0
+ ; CMOV-32: mov.d $f0, $f14
+
+ ; SEL-32: cmp.eq.d $f0, $f12, $f14
+ ; SEL-32: sel.d $f0, $f14, $f12
+
+ ; CMOV-64: c.eq.d $f12, $f13
+ ; CMOV-64: movt.d $f13, $f12, $fcc0
+ ; CMOV-64: mov.d $f0, $f13
+
+ ; SEL-64: cmp.eq.d $f0, $f12, $f13
+ ; SEL-64: sel.d $f0, $f13, $f12
+
+ ; MM32R3: c.eq.d $f12, $f14
+ ; MM32R3: movt.d $f14, $f12, $fcc0
+ ; MM32R3: mov.d $f0, $f14
+
+ %s = fcmp oeq double %x, %y
+ %r = select i1 %s, double %x, double %y
+ ret double %r
+}
+
+define double @tst_select_fcmp_one_double(double %x, double %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_one_double:
+
+ ; M2: c.ueq.d $f12, $f14
+ ; M3: c.ueq.d $f12, $f13
+ ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.d $f12, $f14
+ ; M3: mov.d $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.d $f0, $f12
+
+ ; CMOV-32: c.ueq.d $f12, $f14
+ ; CMOV-32: movf.d $f14, $f12, $fcc0
+ ; CMOV-32: mov.d $f0, $f14
+
+ ; SEL-32: cmp.ueq.d $f0, $f12, $f14
+ ; SEL-32: mfc1 $[[T0:[0-9]+]], $f0
+ ; SEL-32: not $[[T0]], $[[T0]]
+ ; SEL-32: mtc1 $[[T0:[0-9]+]], $f0
+ ; SEL-32: sel.d $f0, $f14, $f12
+
+ ; CMOV-64: c.ueq.d $f12, $f13
+ ; CMOV-64: movf.d $f13, $f12, $fcc0
+ ; CMOV-64: mov.d $f0, $f13
+
+ ; SEL-64: cmp.ueq.d $f0, $f12, $f13
+ ; SEL-64: mfc1 $[[T0:[0-9]+]], $f0
+ ; SEL-64: not $[[T0]], $[[T0]]
+ ; SEL-64: mtc1 $[[T0:[0-9]+]], $f0
+ ; SEL-64: sel.d $f0, $f13, $f12
+
+ ; MM32R3: c.ueq.d $f12, $f14
+ ; MM32R3: movf.d $f14, $f12, $fcc0
+ ; MM32R3: mov.d $f0, $f14
+
+ %s = fcmp one double %x, %y
+ %r = select i1 %s, double %x, double %y
+ ret double %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/select-flt.ll b/test/CodeGen/Mips/llvm-ir/select-flt.ll
new file mode 100644
index 000000000000..6a0334da4833
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/select-flt.ll
@@ -0,0 +1,335 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,M2,M2-M3
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R1
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN: -check-prefixes=ALL,SEL-32,32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,M3,M2-M3
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN: -check-prefixes=ALL,SEL-64,64R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM32R3
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM32R6,SEL-32
+
+define float @tst_select_i1_float(i1 signext %s, float %x, float %y) {
+entry:
+ ; ALL-LABEL: tst_select_i1_float:
+
+ ; M2-M3: andi $[[T0:[0-9]+]], $4, 1
+ ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: jr $ra
+ ; M2: mtc1 $6, $f0
+ ; M3: mov.s $f13, $f14
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2: mtc1 $5, $f0
+ ; M3: mov.s $f0, $f13
+
+ ; CMOV-32: mtc1 $6, $f0
+ ; CMOV-32: mtc1 $5, $f1
+ ; CMOV-32: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV-32: movn.s $f0, $f1, $[[T0]]
+
+ ; SEL-32: mtc1 $5, $[[F0:f[0-9]+]]
+ ; SEL-32: mtc1 $6, $[[F1:f[0-9]+]]
+ ; SEL-32: mtc1 $4, $f0
+ ; SEL-32: sel.s $f0, $[[F1]], $[[F0]]
+
+ ; CMOV-64: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV-64: movn.s $f14, $f13, $[[T0]]
+ ; CMOV-64: mov.s $f0, $f14
+
+ ; SEL-64: mtc1 $4, $f0
+ ; SEL-64: sel.s $f0, $f14, $f13
+
+ ; MM32R3: mtc1 $6, $[[F0:f[0-9]+]]
+ ; MM32R3: mtc1 $5, $[[F1:f[0-9]+]]
+ ; MM32R3: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MM32R3: movn.s $f0, $[[F1]], $[[T0]]
+
+ %r = select i1 %s, float %x, float %y
+ ret float %r
+}
+
+define float @tst_select_i1_float_reordered(float %x, float %y,
+ i1 signext %s) {
+entry:
+ ; ALL-LABEL: tst_select_i1_float_reordered:
+
+ ; M2-M3: andi $[[T0:[0-9]+]], $6, 1
+ ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.s $f12, $f14
+ ; M3: mov.s $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.s $f0, $f12
+
+ ; CMOV-32: andi $[[T0:[0-9]+]], $6, 1
+ ; CMOV-32: movn.s $f14, $f12, $[[T0]]
+ ; CMOV-32: mov.s $f0, $f14
+
+ ; SEL-32: mtc1 $6, $f0
+ ; SEL-32: sel.s $f0, $f14, $f12
+
+ ; CMOV-64: andi $[[T0:[0-9]+]], $6, 1
+ ; CMOV-64: movn.s $f13, $f12, $[[T0]]
+ ; CMOV-64: mov.s $f0, $f13
+
+ ; SEL-64: mtc1 $6, $f0
+ ; SEL-64: sel.s $f0, $f13, $f12
+
+ ; MM32R3: andi16 $[[T0:[0-9]+]], $6, 1
+ ; MM32R3: movn.s $[[F0:f[0-9]+]], $f12, $[[T0]]
+ ; MM32R3: mov.s $f0, $[[F0]]
+
+ %r = select i1 %s, float %x, float %y
+ ret float %r
+}
+
+define float @tst_select_fcmp_olt_float(float %x, float %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_olt_float:
+
+ ; M2: c.olt.s $f12, $f14
+ ; M3: c.olt.s $f12, $f13
+ ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.s $f12, $f14
+ ; M3: mov.s $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.s $f0, $f12
+
+ ; CMOV-32: c.olt.s $f12, $f14
+ ; CMOV-32: movt.s $f14, $f12, $fcc0
+ ; CMOV-32: mov.s $f0, $f14
+
+ ; SEL-32: cmp.lt.s $f0, $f12, $f14
+ ; SEL-32: sel.s $f0, $f14, $f12
+
+ ; CMOV-64: c.olt.s $f12, $f13
+ ; CMOV-64: movt.s $f13, $f12, $fcc0
+ ; CMOV-64: mov.s $f0, $f13
+
+ ; SEL-64: cmp.lt.s $f0, $f12, $f13
+ ; SEL-64: sel.s $f0, $f13, $f12
+
+ ; MM32R3: c.olt.s $f12, $f14
+ ; MM32R3: movt.s $f14, $f12, $fcc0
+ ; MM32R3: mov.s $f0, $f14
+
+ %s = fcmp olt float %x, %y
+ %r = select i1 %s, float %x, float %y
+ ret float %r
+}
+
+define float @tst_select_fcmp_ole_float(float %x, float %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_ole_float:
+
+ ; M2: c.ole.s $f12, $f14
+ ; M3: c.ole.s $f12, $f13
+ ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.s $f12, $f14
+ ; M3: mov.s $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.s $f0, $f12
+
+ ; CMOV-32: c.ole.s $f12, $f14
+ ; CMOV-32: movt.s $f14, $f12, $fcc0
+ ; CMOV-32: mov.s $f0, $f14
+
+ ; SEL-32: cmp.le.s $f0, $f12, $f14
+ ; SEL-32: sel.s $f0, $f14, $f12
+
+ ; CMOV-64: c.ole.s $f12, $f13
+ ; CMOV-64: movt.s $f13, $f12, $fcc0
+ ; CMOV-64: mov.s $f0, $f13
+
+ ; SEL-64: cmp.le.s $f0, $f12, $f13
+ ; SEL-64: sel.s $f0, $f13, $f12
+
+ ; MM32R3: c.ole.s $f12, $f14
+ ; MM32R3: movt.s $f14, $f12, $fcc0
+ ; MM32R3: mov.s $f0, $f14
+
+ %s = fcmp ole float %x, %y
+ %r = select i1 %s, float %x, float %y
+ ret float %r
+}
+
+define float @tst_select_fcmp_ogt_float(float %x, float %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_ogt_float:
+
+ ; M2: c.ule.s $f12, $f14
+ ; M3: c.ule.s $f12, $f13
+ ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.s $f12, $f14
+ ; M3: mov.s $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.s $f0, $f12
+
+ ; CMOV-32: c.ule.s $f12, $f14
+ ; CMOV-32: movf.s $f14, $f12, $fcc0
+ ; CMOV-32: mov.s $f0, $f14
+
+ ; SEL-32: cmp.lt.s $f0, $f14, $f12
+ ; SEL-32: sel.s $f0, $f14, $f12
+
+ ; CMOV-64: c.ule.s $f12, $f13
+ ; CMOV-64: movf.s $f13, $f12, $fcc0
+ ; CMOV-64: mov.s $f0, $f13
+
+ ; SEL-64: cmp.lt.s $f0, $f13, $f12
+ ; SEL-64: sel.s $f0, $f13, $f12
+
+ ; MM32R3: c.ule.s $f12, $f14
+ ; MM32R3: movf.s $f14, $f12, $fcc0
+ ; MM32R3: mov.s $f0, $f14
+
+ %s = fcmp ogt float %x, %y
+ %r = select i1 %s, float %x, float %y
+ ret float %r
+}
+
+define float @tst_select_fcmp_oge_float(float %x, float %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_oge_float:
+
+ ; M2: c.ult.s $f12, $f14
+ ; M3: c.ult.s $f12, $f13
+ ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.s $f12, $f14
+ ; M3: mov.s $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.s $f0, $f12
+
+ ; CMOV-32: c.ult.s $f12, $f14
+ ; CMOV-32: movf.s $f14, $f12, $fcc0
+ ; CMOV-32: mov.s $f0, $f14
+
+ ; SEL-32: cmp.le.s $f0, $f14, $f12
+ ; SEL-32: sel.s $f0, $f14, $f12
+
+ ; CMOV-64: c.ult.s $f12, $f13
+ ; CMOV-64: movf.s $f13, $f12, $fcc0
+ ; CMOV-64: mov.s $f0, $f13
+
+ ; SEL-64: cmp.le.s $f0, $f13, $f12
+ ; SEL-64: sel.s $f0, $f13, $f12
+
+ ; MM32R3: c.ult.s $f12, $f14
+ ; MM32R3: movf.s $f14, $f12, $fcc0
+ ; MM32R3: mov.s $f0, $f14
+
+ %s = fcmp oge float %x, %y
+ %r = select i1 %s, float %x, float %y
+ ret float %r
+}
+
+define float @tst_select_fcmp_oeq_float(float %x, float %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_oeq_float:
+
+ ; M2: c.eq.s $f12, $f14
+ ; M3: c.eq.s $f12, $f13
+ ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.s $f12, $f14
+ ; M3: mov.s $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.s $f0, $f12
+
+ ; CMOV-32: c.eq.s $f12, $f14
+ ; CMOV-32: movt.s $f14, $f12, $fcc0
+ ; CMOV-32: mov.s $f0, $f14
+
+ ; SEL-32: cmp.eq.s $f0, $f12, $f14
+ ; SEL-32: sel.s $f0, $f14, $f12
+
+ ; CMOV-64: c.eq.s $f12, $f13
+ ; CMOV-64: movt.s $f13, $f12, $fcc0
+ ; CMOV-64: mov.s $f0, $f13
+
+ ; SEL-64: cmp.eq.s $f0, $f12, $f13
+ ; SEL-64: sel.s $f0, $f13, $f12
+
+ ; MM32R3: c.eq.s $f12, $f14
+ ; MM32R3: movt.s $f14, $f12, $fcc0
+ ; MM32R3: mov.s $f0, $f14
+
+ %s = fcmp oeq float %x, %y
+ %r = select i1 %s, float %x, float %y
+ ret float %r
+}
+
+define float @tst_select_fcmp_one_float(float %x, float %y) {
+entry:
+ ; ALL-LABEL: tst_select_fcmp_one_float:
+
+ ; M2: c.ueq.s $f12, $f14
+ ; M3: c.ueq.s $f12, $f13
+ ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2: mov.s $f12, $f14
+ ; M3: mov.s $f12, $f13
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: mov.s $f0, $f12
+
+ ; CMOV-32: c.ueq.s $f12, $f14
+ ; CMOV-32: movf.s $f14, $f12, $fcc0
+ ; CMOV-32: mov.s $f0, $f14
+
+ ; SEL-32: cmp.ueq.s $f0, $f12, $f14
+ ; SEL-32: mfc1 $[[T0:[0-9]+]], $f0
+ ; SEL-32: not $[[T0]], $[[T0]]
+ ; SEL-32: mtc1 $[[T0:[0-9]+]], $f0
+ ; SEL-32: sel.s $f0, $f14, $f12
+
+ ; CMOV-64: c.ueq.s $f12, $f13
+ ; CMOV-64: movf.s $f13, $f12, $fcc0
+ ; CMOV-64: mov.s $f0, $f13
+
+ ; SEL-64: cmp.ueq.s $f0, $f12, $f13
+ ; SEL-64: mfc1 $[[T0:[0-9]+]], $f0
+ ; SEL-64: not $[[T0]], $[[T0]]
+ ; SEL-64: mtc1 $[[T0:[0-9]+]], $f0
+ ; SEL-64: sel.s $f0, $f13, $f12
+
+ ; MM32R3: c.ueq.s $f12, $f14
+ ; MM32R3: movf.s $f14, $f12, $fcc0
+ ; MM32R3: mov.s $f0, $f14
+
+ %s = fcmp one float %x, %y
+ %r = select i1 %s, float %x, float %y
+ ret float %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/select-int.ll b/test/CodeGen/Mips/llvm-ir/select-int.ll
new file mode 100644
index 000000000000..e8f78ffdcb6a
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/select-int.ll
@@ -0,0 +1,270 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,M2,M2-M3
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R1
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-32,CMOV-32R2-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN: -check-prefixes=ALL,SEL,SEL-32
+; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,M3,M2-M3
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
+; RUN: -check-prefixes=ALL,CMOV,CMOV-64
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN: -check-prefixes=ALL,SEL,SEL-64
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM32R3
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM32R6
+
+define signext i1 @tst_select_i1_i1(i1 signext %s,
+ i1 signext %x, i1 signext %y) {
+entry:
+ ; ALL-LABEL: tst_select_i1_i1:
+
+ ; M2-M3: andi $[[T0:[0-9]+]], $4, 1
+ ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2-M3: move $5, $6
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: move $2, $5
+
+ ; CMOV: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV: movn $6, $5, $[[T0]]
+ ; CMOV: move $2, $6
+
+ ; SEL: andi $[[T0:[0-9]+]], $4, 1
+ ; SEL: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
+ ; SEL: selnez $[[T2:[0-9]+]], $5, $[[T0]]
+ ; SEL: or $2, $[[T2]], $[[T1]]
+
+ ; MM32R3: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MM32R3: movn $[[T1:[0-9]+]], $5, $[[T0]]
+ ; MM32R3: move $2, $[[T1]]
+
+ ; MMR6: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MMR6: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
+ ; MMR6: selnez $[[T2:[0-9]+]], $5, $[[T0]]
+ ; MMR6: or $2, $[[T2]], $[[T1]]
+
+ %r = select i1 %s, i1 %x, i1 %y
+ ret i1 %r
+}
+
+define signext i8 @tst_select_i1_i8(i1 signext %s,
+ i8 signext %x, i8 signext %y) {
+entry:
+ ; ALL-LABEL: tst_select_i1_i8:
+
+ ; M2-M3: andi $[[T0:[0-9]+]], $4, 1
+ ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2-M3: move $5, $6
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: move $2, $5
+
+ ; CMOV: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV: movn $6, $5, $[[T0]]
+ ; CMOV: move $2, $6
+
+ ; SEL: andi $[[T0:[0-9]+]], $4, 1
+ ; SEL: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
+ ; SEL: selnez $[[T2:[0-9]+]], $5, $[[T0]]
+ ; SEL: or $2, $[[T2]], $[[T1]]
+
+ ; MM32R3: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MM32R3: movn $[[T1:[0-9]+]], $5, $[[T0]]
+ ; MM32R3: move $2, $[[T1]]
+
+ ; MMR6: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MMR6: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
+ ; MMR6: selnez $[[T2:[0-9]+]], $5, $[[T0]]
+ ; MMR6: or $2, $[[T2]], $[[T1]]
+
+ %r = select i1 %s, i8 %x, i8 %y
+ ret i8 %r
+}
+
+define signext i32 @tst_select_i1_i32(i1 signext %s,
+ i32 signext %x, i32 signext %y) {
+entry:
+ ; ALL-LABEL: tst_select_i1_i32:
+
+ ; M2-M3: andi $[[T0:[0-9]+]], $4, 1
+ ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M2-M3: nop
+ ; M2-M3: move $5, $6
+ ; M2-M3: $[[BB0]]:
+ ; M2-M3: jr $ra
+ ; M2-M3: move $2, $5
+
+ ; CMOV: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV: movn $6, $5, $[[T0]]
+ ; CMOV: move $2, $6
+
+ ; SEL: andi $[[T0:[0-9]+]], $4, 1
+ ; SEL: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
+ ; SEL: selnez $[[T2:[0-9]+]], $5, $[[T0]]
+ ; SEL: or $2, $[[T2]], $[[T1]]
+
+ ; MM32R3: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MM32R3: movn $[[T1:[0-9]+]], $5, $[[T0]]
+ ; MM32R3: move $2, $[[T1]]
+
+ ; MMR6: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MMR6: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
+ ; MMR6: selnez $[[T2:[0-9]+]], $5, $[[T0]]
+ ; MMR6: or $2, $[[T2]], $[[T1]]
+
+ %r = select i1 %s, i32 %x, i32 %y
+ ret i32 %r
+}
+
+define signext i64 @tst_select_i1_i64(i1 signext %s,
+ i64 signext %x, i64 signext %y) {
+entry:
+ ; ALL-LABEL: tst_select_i1_i64:
+
+ ; M2: andi $[[T0:[0-9]+]], $4, 1
+ ; M2: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M2: nop
+ ; M2: lw $[[T1:[0-9]+]], 16($sp)
+ ; M2: $[[BB0]]:
+ ; FIXME: This branch is redundant
+ ; M2: bnez $[[T0]], $[[BB1:BB[0-9_]+]]
+ ; M2: nop
+ ; M2: lw $[[T2:[0-9]+]], 20($sp)
+ ; M2: $[[BB1]]:
+ ; M2: move $2, $[[T1]]
+ ; M2: jr $ra
+ ; M2: move $3, $[[T2]]
+
+ ; CMOV-32: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV-32: lw $2, 16($sp)
+ ; CMOV-32: movn $2, $6, $[[T0]]
+ ; CMOV-32: lw $3, 20($sp)
+ ; CMOV-32: movn $3, $7, $[[T0]]
+
+ ; SEL-32: andi $[[T0:[0-9]+]], $4, 1
+ ; SEL-32: selnez $[[T1:[0-9]+]], $6, $[[T0]]
+ ; SEL-32: lw $[[T2:[0-9]+]], 16($sp)
+ ; SEL-32: seleqz $[[T3:[0-9]+]], $[[T2]], $[[T0]]
+ ; SEL-32: or $2, $[[T1]], $[[T3]]
+ ; SEL-32: selnez $[[T4:[0-9]+]], $7, $[[T0]]
+ ; SEL-32: lw $[[T5:[0-9]+]], 20($sp)
+ ; SEL-32: seleqz $[[T6:[0-9]+]], $[[T5]], $[[T0]]
+ ; SEL-32: or $3, $[[T4]], $[[T6]]
+
+ ; M3: andi $[[T0:[0-9]+]], $4, 1
+ ; M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
+ ; M3: nop
+ ; M3: move $5, $6
+ ; M3: $[[BB0]]:
+ ; M3: jr $ra
+ ; M3: move $2, $5
+
+ ; CMOV-64: andi $[[T0:[0-9]+]], $4, 1
+ ; CMOV-64: movn $6, $5, $[[T0]]
+ ; CMOV-64: move $2, $6
+
+ ; SEL-64: andi $[[T0:[0-9]+]], $4, 1
+ ; FIXME: This shift is redundant
+ ; SEL-64: sll $[[T0]], $[[T0]], 0
+ ; SEL-64: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
+ ; SEL-64: selnez $[[T0]], $5, $[[T0]]
+ ; SEL-64: or $2, $[[T0]], $[[T1]]
+
+ ; MM32R3: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MM32R3: lw $2, 16($sp)
+ ; MM32R3: movn $2, $6, $[[T0]]
+ ; MM32R3: lw $3, 20($sp)
+ ; MM32R3: movn $3, $7, $[[T0]]
+
+ ; MM32R6: andi16 $[[T0:[0-9]+]], $4, 1
+ ; MM32R6: lw $[[T1:[0-9]+]], 16($sp)
+ ; MM32R6: seleqz $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+ ; MM32R6: selnez $[[T3:[0-9]+]], $6, $[[T0]]
+ ; MM32R6: or $2, $[[T3]], $[[T2]]
+ ; MM32R6: lw $[[T4:[0-9]+]], 20($sp)
+ ; MM32R6: seleqz $[[T5:[0-9]+]], $[[T4]], $[[T0]]
+ ; MM32R6: selnez $[[T6:[0-9]+]], $7, $[[T0]]
+ ; MM32R6: or $3, $[[T6]], $[[T5]]
+
+ %r = select i1 %s, i64 %x, i64 %y
+ ret i64 %r
+}
+
+define i8* @tst_select_word_cst(i8* %a, i8* %b) {
+ ; ALL-LABEL: tst_select_word_cst:
+
+ ; M2: addiu $[[T0:[0-9]+]], $zero, -1
+ ; M2: xor $[[T1:[0-9]+]], $5, $[[T0]]
+ ; M2: sltu $[[T2:[0-9]+]], $zero, $[[T1]]
+ ; M2: bnez $[[T2]], $[[BB0:BB[0-9_]+]]
+ ; M2: addiu $2, $zero, 0
+ ; M2: move $2, $4
+ ; M2: $[[BB0]]:
+ ; M2: jr $ra
+
+ ; M3: daddiu $[[T0:[0-9]+]], $zero, -1
+ ; M3: xor $[[T1:[0-9]+]], $5, $[[T0]]
+ ; M3: sltu $[[T2:[0-9]+]], $zero, $[[T1]]
+ ; M3: bnez $[[T2]], $[[BB0:BB[0-9_]+]]
+ ; M3: daddiu $2, $zero, 0
+ ; M3: move $2, $4
+ ; M3: $[[BB0]]:
+ ; M3: jr $ra
+
+ ; CMOV-32: addiu $[[T0:[0-9]+]], $zero, -1
+ ; CMOV-32: xor $[[T1:[0-9]+]], $5, $[[T0]]
+ ; CMOV-32: movn $[[T2:[0-9]+]], $zero, $[[T1]]
+ ; CMOV-32: jr $ra
+ ; CMOV-32: move $2, $[[T2]]
+
+ ; SEL-32: addiu $[[T0:[0-9]+]], $zero, -1
+ ; SEL-32: xor $[[T1:[0-9]+]], $5, $[[T0]]
+ ; SEL-32: sltu $[[T2:[0-9]+]], $zero, $[[T1]]
+ ; SEL-32: jr $ra
+ ; SEL-32: seleqz $2, $4, $[[T2]]
+
+ ; CMOV-64: daddiu $[[T0:[0-9]+]], $zero, -1
+ ; CMOV-64: xor $[[T1:[0-9]+]], $5, $[[T0]]
+ ; CMOV-64: movn $[[T2:[0-9]+]], $zero, $[[T1]]
+ ; CMOV-64: move $2, $[[T2]]
+
+ ; SEL-64: daddiu $[[T0:[0-9]+]], $zero, -1
+ ; SEL-64: xor $[[T1:[0-9]+]], $5, $[[T0]]
+ ; SEL-64: sltu $[[T2:[0-9]+]], $zero, $[[T1]]
+ ; FIXME: This shift is redundant.
+ ; SEL-64: sll $[[T2]], $[[T2]], 0
+ ; SEL-64: seleqz $2, $4, $[[T2]]
+
+ ; MM32R3: li16 $[[T0:[0-9]+]], -1
+ ; MM32R3: xor $[[T1:[0-9]+]], $5, $[[T0]]
+ ; MM32R3: lui $[[T2:[0-9]+]], 0
+ ; MM32R3: movn $[[T3:[0-9]+]], $[[T2]], $[[T1]]
+ ; MM32R3: move $2, $[[T3]]
+
+ ; MM32R6: li16 $[[T0:[0-9]+]], -1
+ ; MM32R6: xor $[[T1:[0-9]+]], $5, $[[T0]]
+ ; MM32R6: sltu $[[T2:[0-9]+]], $zero, $[[T1]]
+ ; MM32R6: seleqz $2, $4, $[[T2]]
+
+ %cmp = icmp eq i8* %b, inttoptr (i64 -1 to i8*)
+ %r = select i1 %cmp, i8* %a, i8* null
+ ret i8* %r
+}
diff --git a/test/CodeGen/Mips/llvm-ir/select.ll b/test/CodeGen/Mips/llvm-ir/select.ll
deleted file mode 100644
index f17670adca33..000000000000
--- a/test/CodeGen/Mips/llvm-ir/select.ll
+++ /dev/null
@@ -1,712 +0,0 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=M2 -check-prefix=M2-M3
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV \
-; RUN: -check-prefix=CMOV-32 -check-prefix=CMOV-32R1
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV \
-; RUN: -check-prefix=CMOV-32 -check-prefix=CMOV-32R2-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV \
-; RUN: -check-prefix=CMOV-32 -check-prefix=CMOV-32R2-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV \
-; RUN: -check-prefix=CMOV-32 -check-prefix=CMOV-32R2-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=SEL -check-prefix=SEL-32
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=M3 -check-prefix=M2-M3
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=CMOV -check-prefix=CMOV-64
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=SEL -check-prefix=SEL-64
-
-define signext i1 @tst_select_i1_i1(i1 signext %s,
- i1 signext %x, i1 signext %y) {
-entry:
- ; ALL-LABEL: tst_select_i1_i1:
-
- ; M2-M3: andi $[[T0:[0-9]+]], $4, 1
- ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2-M3: move $5, $6
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: move $2, $5
-
- ; CMOV: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV: movn $6, $5, $[[T0]]
- ; CMOV: move $2, $6
-
- ; SEL: andi $[[T0:[0-9]+]], $4, 1
- ; SEL: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
- ; SEL: selnez $[[T2:[0-9]+]], $5, $[[T0]]
- ; SEL: or $2, $[[T2]], $[[T1]]
- %r = select i1 %s, i1 %x, i1 %y
- ret i1 %r
-}
-
-define signext i8 @tst_select_i1_i8(i1 signext %s,
- i8 signext %x, i8 signext %y) {
-entry:
- ; ALL-LABEL: tst_select_i1_i8:
-
- ; M2-M3: andi $[[T0:[0-9]+]], $4, 1
- ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2-M3: move $5, $6
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: move $2, $5
-
- ; CMOV: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV: movn $6, $5, $[[T0]]
- ; CMOV: move $2, $6
-
- ; SEL: andi $[[T0:[0-9]+]], $4, 1
- ; SEL: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
- ; SEL: selnez $[[T2:[0-9]+]], $5, $[[T0]]
- ; SEL: or $2, $[[T2]], $[[T1]]
- %r = select i1 %s, i8 %x, i8 %y
- ret i8 %r
-}
-
-define signext i32 @tst_select_i1_i32(i1 signext %s,
- i32 signext %x, i32 signext %y) {
-entry:
- ; ALL-LABEL: tst_select_i1_i32:
-
- ; M2-M3: andi $[[T0:[0-9]+]], $4, 1
- ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2-M3: move $5, $6
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: move $2, $5
-
- ; CMOV: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV: movn $6, $5, $[[T0]]
- ; CMOV: move $2, $6
-
- ; SEL: andi $[[T0:[0-9]+]], $4, 1
- ; SEL: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
- ; SEL: selnez $[[T2:[0-9]+]], $5, $[[T0]]
- ; SEL: or $2, $[[T2]], $[[T1]]
- %r = select i1 %s, i32 %x, i32 %y
- ret i32 %r
-}
-
-define signext i64 @tst_select_i1_i64(i1 signext %s,
- i64 signext %x, i64 signext %y) {
-entry:
- ; ALL-LABEL: tst_select_i1_i64:
-
- ; M2: andi $[[T0:[0-9]+]], $4, 1
- ; M2: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M2: nop
- ; M2: lw $[[T1:[0-9]+]], 16($sp)
- ; M2: $[[BB0]]:
- ; FIXME: This branch is redundant
- ; M2: bnez $[[T0]], $[[BB1:BB[0-9_]+]]
- ; M2: nop
- ; M2: lw $[[T2:[0-9]+]], 20($sp)
- ; M2: $[[BB1]]:
- ; M2: move $2, $[[T1]]
- ; M2: jr $ra
- ; M2: move $3, $[[T2]]
-
- ; CMOV-32: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV-32: lw $2, 16($sp)
- ; CMOV-32: movn $2, $6, $[[T0]]
- ; CMOV-32: lw $3, 20($sp)
- ; CMOV-32: movn $3, $7, $[[T0]]
-
- ; SEL-32: andi $[[T0:[0-9]+]], $4, 1
- ; SEL-32: selnez $[[T1:[0-9]+]], $6, $[[T0]]
- ; SEL-32: lw $[[T2:[0-9]+]], 16($sp)
- ; SEL-32: seleqz $[[T3:[0-9]+]], $[[T2]], $[[T0]]
- ; SEL-32: or $2, $[[T1]], $[[T3]]
- ; SEL-32: selnez $[[T4:[0-9]+]], $7, $[[T0]]
- ; SEL-32: lw $[[T5:[0-9]+]], 20($sp)
- ; SEL-32: seleqz $[[T6:[0-9]+]], $[[T5]], $[[T0]]
- ; SEL-32: or $3, $[[T4]], $[[T6]]
-
- ; M3: andi $[[T0:[0-9]+]], $4, 1
- ; M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M3: nop
- ; M3: move $5, $6
- ; M3: $[[BB0]]:
- ; M3: jr $ra
- ; M3: move $2, $5
-
- ; CMOV-64: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV-64: movn $6, $5, $[[T0]]
- ; CMOV-64: move $2, $6
-
- ; SEL-64: andi $[[T0:[0-9]+]], $4, 1
- ; FIXME: This shift is redundant
- ; SEL-64: sll $[[T0]], $[[T0]], 0
- ; SEL-64: seleqz $[[T1:[0-9]+]], $6, $[[T0]]
- ; SEL-64: selnez $[[T0]], $5, $[[T0]]
- ; SEL-64: or $2, $[[T0]], $[[T1]]
- %r = select i1 %s, i64 %x, i64 %y
- ret i64 %r
-}
-
-define float @tst_select_i1_float(i1 signext %s, float %x, float %y) {
-entry:
- ; ALL-LABEL: tst_select_i1_float:
-
- ; M2-M3: andi $[[T0:[0-9]+]], $4, 1
- ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: jr $ra
- ; M2: mtc1 $6, $f0
- ; M3: mov.s $f13, $f14
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2: mtc1 $5, $f0
- ; M3: mov.s $f0, $f13
-
- ; CMOV-32: mtc1 $6, $f0
- ; CMOV-32: mtc1 $5, $f1
- ; CMOV-32: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV-32: movn.s $f0, $f1, $[[T0]]
-
- ; SEL-32: mtc1 $5, $[[F0:f[0-9]+]]
- ; SEL-32: mtc1 $6, $[[F1:f[0-9]+]]
- ; SEL-32: mtc1 $4, $f0
- ; SEL-32: sel.s $f0, $[[F1]], $[[F0]]
-
- ; CMOV-64: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV-64: movn.s $f14, $f13, $[[T0]]
- ; CMOV-64: mov.s $f0, $f14
-
- ; SEL-64: mtc1 $4, $f0
- ; SEL-64: sel.s $f0, $f14, $f13
- %r = select i1 %s, float %x, float %y
- ret float %r
-}
-
-define float @tst_select_i1_float_reordered(float %x, float %y,
- i1 signext %s) {
-entry:
- ; ALL-LABEL: tst_select_i1_float_reordered:
-
- ; M2-M3: andi $[[T0:[0-9]+]], $6, 1
- ; M2-M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.s $f12, $f14
- ; M3: mov.s $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.s $f0, $f12
-
- ; CMOV-32: andi $[[T0:[0-9]+]], $6, 1
- ; CMOV-32: movn.s $f14, $f12, $[[T0]]
- ; CMOV-32: mov.s $f0, $f14
-
- ; SEL-32: mtc1 $6, $f0
- ; SEL-32: sel.s $f0, $f14, $f12
-
- ; CMOV-64: andi $[[T0:[0-9]+]], $6, 1
- ; CMOV-64: movn.s $f13, $f12, $[[T0]]
- ; CMOV-64: mov.s $f0, $f13
-
- ; SEL-64: mtc1 $6, $f0
- ; SEL-64: sel.s $f0, $f13, $f12
- %r = select i1 %s, float %x, float %y
- ret float %r
-}
-
-define double @tst_select_i1_double(i1 signext %s, double %x, double %y) {
-entry:
- ; ALL-LABEL: tst_select_i1_double:
-
- ; M2: andi $[[T0:[0-9]+]], $4, 1
- ; M2: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M2: nop
- ; M2: ldc1 $f0, 16($sp)
- ; M2: jr $ra
- ; M2: nop
- ; M2: $[[BB0]]:
- ; M2: mtc1 $7, $f0
- ; M2: jr $ra
- ; M2: mtc1 $6, $f1
-
- ; CMOV-32: mtc1 $7, $[[F0:f[0-9]+]]
- ; CMOV-32R1: mtc1 $6, $f{{[0-9]+}}
- ; CMOV-32R2-R5: mthc1 $6, $[[F0]]
- ; CMOV-32: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV-32: ldc1 $f0, 16($sp)
- ; CMOV-32: movn.d $f0, $[[F0]], $[[T0]]
-
- ; SEL-32: mtc1 $7, $[[F0:f[0-9]+]]
- ; SEL-32: mthc1 $6, $[[F0]]
- ; SEL-32: ldc1 $[[F1:f[0-9]+]], 16($sp)
- ; SEL-32: mtc1 $4, $f0
- ; SEL-32: sel.d $f0, $[[F1]], $[[F0]]
-
- ; M3: andi $[[T0:[0-9]+]], $4, 1
- ; M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M3: nop
- ; M3: mov.d $f13, $f14
- ; M3: $[[BB0]]:
- ; M3: jr $ra
- ; M3: mov.d $f0, $f13
-
- ; CMOV-64: andi $[[T0:[0-9]+]], $4, 1
- ; CMOV-64: movn.d $f14, $f13, $[[T0]]
- ; CMOV-64: mov.d $f0, $f14
-
- ; SEL-64: mtc1 $4, $f0
- ; SEL-64: sel.d $f0, $f14, $f13
- %r = select i1 %s, double %x, double %y
- ret double %r
-}
-
-define double @tst_select_i1_double_reordered(double %x, double %y,
- i1 signext %s) {
-entry:
- ; ALL-LABEL: tst_select_i1_double_reordered:
-
- ; M2: lw $[[T0:[0-9]+]], 16($sp)
- ; M2: andi $[[T1:[0-9]+]], $[[T0]], 1
- ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]]
- ; M2: nop
- ; M2: mov.d $f12, $f14
- ; M2: $[[BB0]]:
- ; M2: jr $ra
- ; M2: mov.d $f0, $f12
-
- ; CMOV-32: lw $[[T0:[0-9]+]], 16($sp)
- ; CMOV-32: andi $[[T1:[0-9]+]], $[[T0]], 1
- ; CMOV-32: movn.d $f14, $f12, $[[T1]]
- ; CMOV-32: mov.d $f0, $f14
-
- ; SEL-32: lw $[[T0:[0-9]+]], 16($sp)
- ; SEL-32: mtc1 $[[T0]], $f0
- ; SEL-32: sel.d $f0, $f14, $f12
-
- ; M3: andi $[[T0:[0-9]+]], $6, 1
- ; M3: bnez $[[T0]], $[[BB0:BB[0-9_]+]]
- ; M3: nop
- ; M3: mov.d $f12, $f13
- ; M3: $[[BB0]]:
- ; M3: jr $ra
- ; M3: mov.d $f0, $f12
-
- ; CMOV-64: andi $[[T0:[0-9]+]], $6, 1
- ; CMOV-64: movn.d $f13, $f12, $[[T0]]
- ; CMOV-64: mov.d $f0, $f13
-
- ; SEL-64: mtc1 $6, $f0
- ; SEL-64: sel.d $f0, $f13, $f12
- %r = select i1 %s, double %x, double %y
- ret double %r
-}
-
-define float @tst_select_fcmp_olt_float(float %x, float %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_olt_float:
-
- ; M2: c.olt.s $f12, $f14
- ; M3: c.olt.s $f12, $f13
- ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.s $f12, $f14
- ; M3: mov.s $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.s $f0, $f12
-
- ; CMOV-32: c.olt.s $f12, $f14
- ; CMOV-32: movt.s $f14, $f12, $fcc0
- ; CMOV-32: mov.s $f0, $f14
-
- ; SEL-32: cmp.lt.s $f0, $f12, $f14
- ; SEL-32: sel.s $f0, $f14, $f12
-
- ; CMOV-64: c.olt.s $f12, $f13
- ; CMOV-64: movt.s $f13, $f12, $fcc0
- ; CMOV-64: mov.s $f0, $f13
-
- ; SEL-64: cmp.lt.s $f0, $f12, $f13
- ; SEL-64: sel.s $f0, $f13, $f12
- %s = fcmp olt float %x, %y
- %r = select i1 %s, float %x, float %y
- ret float %r
-}
-
-define float @tst_select_fcmp_ole_float(float %x, float %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_ole_float:
-
- ; M2: c.ole.s $f12, $f14
- ; M3: c.ole.s $f12, $f13
- ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.s $f12, $f14
- ; M3: mov.s $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.s $f0, $f12
-
- ; CMOV-32: c.ole.s $f12, $f14
- ; CMOV-32: movt.s $f14, $f12, $fcc0
- ; CMOV-32: mov.s $f0, $f14
-
- ; SEL-32: cmp.le.s $f0, $f12, $f14
- ; SEL-32: sel.s $f0, $f14, $f12
-
- ; CMOV-64: c.ole.s $f12, $f13
- ; CMOV-64: movt.s $f13, $f12, $fcc0
- ; CMOV-64: mov.s $f0, $f13
-
- ; SEL-64: cmp.le.s $f0, $f12, $f13
- ; SEL-64: sel.s $f0, $f13, $f12
- %s = fcmp ole float %x, %y
- %r = select i1 %s, float %x, float %y
- ret float %r
-}
-
-define float @tst_select_fcmp_ogt_float(float %x, float %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_ogt_float:
-
- ; M2: c.ule.s $f12, $f14
- ; M3: c.ule.s $f12, $f13
- ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.s $f12, $f14
- ; M3: mov.s $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.s $f0, $f12
-
- ; CMOV-32: c.ule.s $f12, $f14
- ; CMOV-32: movf.s $f14, $f12, $fcc0
- ; CMOV-32: mov.s $f0, $f14
-
- ; SEL-32: cmp.lt.s $f0, $f14, $f12
- ; SEL-32: sel.s $f0, $f14, $f12
-
- ; CMOV-64: c.ule.s $f12, $f13
- ; CMOV-64: movf.s $f13, $f12, $fcc0
- ; CMOV-64: mov.s $f0, $f13
-
- ; SEL-64: cmp.lt.s $f0, $f13, $f12
- ; SEL-64: sel.s $f0, $f13, $f12
- %s = fcmp ogt float %x, %y
- %r = select i1 %s, float %x, float %y
- ret float %r
-}
-
-define float @tst_select_fcmp_oge_float(float %x, float %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_oge_float:
-
- ; M2: c.ult.s $f12, $f14
- ; M3: c.ult.s $f12, $f13
- ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.s $f12, $f14
- ; M3: mov.s $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.s $f0, $f12
-
- ; CMOV-32: c.ult.s $f12, $f14
- ; CMOV-32: movf.s $f14, $f12, $fcc0
- ; CMOV-32: mov.s $f0, $f14
-
- ; SEL-32: cmp.le.s $f0, $f14, $f12
- ; SEL-32: sel.s $f0, $f14, $f12
-
- ; CMOV-64: c.ult.s $f12, $f13
- ; CMOV-64: movf.s $f13, $f12, $fcc0
- ; CMOV-64: mov.s $f0, $f13
-
- ; SEL-64: cmp.le.s $f0, $f13, $f12
- ; SEL-64: sel.s $f0, $f13, $f12
- %s = fcmp oge float %x, %y
- %r = select i1 %s, float %x, float %y
- ret float %r
-}
-
-define float @tst_select_fcmp_oeq_float(float %x, float %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_oeq_float:
-
- ; M2: c.eq.s $f12, $f14
- ; M3: c.eq.s $f12, $f13
- ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.s $f12, $f14
- ; M3: mov.s $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.s $f0, $f12
-
- ; CMOV-32: c.eq.s $f12, $f14
- ; CMOV-32: movt.s $f14, $f12, $fcc0
- ; CMOV-32: mov.s $f0, $f14
-
- ; SEL-32: cmp.eq.s $f0, $f12, $f14
- ; SEL-32: sel.s $f0, $f14, $f12
-
- ; CMOV-64: c.eq.s $f12, $f13
- ; CMOV-64: movt.s $f13, $f12, $fcc0
- ; CMOV-64: mov.s $f0, $f13
-
- ; SEL-64: cmp.eq.s $f0, $f12, $f13
- ; SEL-64: sel.s $f0, $f13, $f12
- %s = fcmp oeq float %x, %y
- %r = select i1 %s, float %x, float %y
- ret float %r
-}
-
-define float @tst_select_fcmp_one_float(float %x, float %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_one_float:
-
- ; M2: c.ueq.s $f12, $f14
- ; M3: c.ueq.s $f12, $f13
- ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.s $f12, $f14
- ; M3: mov.s $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.s $f0, $f12
-
- ; CMOV-32: c.ueq.s $f12, $f14
- ; CMOV-32: movf.s $f14, $f12, $fcc0
- ; CMOV-32: mov.s $f0, $f14
-
- ; SEL-32: cmp.ueq.s $f0, $f12, $f14
- ; SEL-32: mfc1 $[[T0:[0-9]+]], $f0
- ; SEL-32: not $[[T0]], $[[T0]]
- ; SEL-32: mtc1 $[[T0:[0-9]+]], $f0
- ; SEL-32: sel.s $f0, $f14, $f12
-
- ; CMOV-64: c.ueq.s $f12, $f13
- ; CMOV-64: movf.s $f13, $f12, $fcc0
- ; CMOV-64: mov.s $f0, $f13
-
- ; SEL-64: cmp.ueq.s $f0, $f12, $f13
- ; SEL-64: mfc1 $[[T0:[0-9]+]], $f0
- ; SEL-64: not $[[T0]], $[[T0]]
- ; SEL-64: mtc1 $[[T0:[0-9]+]], $f0
- ; SEL-64: sel.s $f0, $f13, $f12
-
- %s = fcmp one float %x, %y
- %r = select i1 %s, float %x, float %y
- ret float %r
-}
-
-define double @tst_select_fcmp_olt_double(double %x, double %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_olt_double:
-
- ; M2: c.olt.d $f12, $f14
- ; M3: c.olt.d $f12, $f13
- ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.d $f12, $f14
- ; M3: mov.d $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.d $f0, $f12
-
- ; CMOV-32: c.olt.d $f12, $f14
- ; CMOV-32: movt.d $f14, $f12, $fcc0
- ; CMOV-32: mov.d $f0, $f14
-
- ; SEL-32: cmp.lt.d $f0, $f12, $f14
- ; SEL-32: sel.d $f0, $f14, $f12
-
- ; CMOV-64: c.olt.d $f12, $f13
- ; CMOV-64: movt.d $f13, $f12, $fcc0
- ; CMOV-64: mov.d $f0, $f13
-
- ; SEL-64: cmp.lt.d $f0, $f12, $f13
- ; SEL-64: sel.d $f0, $f13, $f12
- %s = fcmp olt double %x, %y
- %r = select i1 %s, double %x, double %y
- ret double %r
-}
-
-define double @tst_select_fcmp_ole_double(double %x, double %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_ole_double:
-
- ; M2: c.ole.d $f12, $f14
- ; M3: c.ole.d $f12, $f13
- ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.d $f12, $f14
- ; M3: mov.d $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.d $f0, $f12
-
- ; CMOV-32: c.ole.d $f12, $f14
- ; CMOV-32: movt.d $f14, $f12, $fcc0
- ; CMOV-32: mov.d $f0, $f14
-
- ; SEL-32: cmp.le.d $f0, $f12, $f14
- ; SEL-32: sel.d $f0, $f14, $f12
-
- ; CMOV-64: c.ole.d $f12, $f13
- ; CMOV-64: movt.d $f13, $f12, $fcc0
- ; CMOV-64: mov.d $f0, $f13
-
- ; SEL-64: cmp.le.d $f0, $f12, $f13
- ; SEL-64: sel.d $f0, $f13, $f12
- %s = fcmp ole double %x, %y
- %r = select i1 %s, double %x, double %y
- ret double %r
-}
-
-define double @tst_select_fcmp_ogt_double(double %x, double %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_ogt_double:
-
- ; M2: c.ule.d $f12, $f14
- ; M3: c.ule.d $f12, $f13
- ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.d $f12, $f14
- ; M3: mov.d $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.d $f0, $f12
-
- ; CMOV-32: c.ule.d $f12, $f14
- ; CMOV-32: movf.d $f14, $f12, $fcc0
- ; CMOV-32: mov.d $f0, $f14
-
- ; SEL-32: cmp.lt.d $f0, $f14, $f12
- ; SEL-32: sel.d $f0, $f14, $f12
-
- ; CMOV-64: c.ule.d $f12, $f13
- ; CMOV-64: movf.d $f13, $f12, $fcc0
- ; CMOV-64: mov.d $f0, $f13
-
- ; SEL-64: cmp.lt.d $f0, $f13, $f12
- ; SEL-64: sel.d $f0, $f13, $f12
- %s = fcmp ogt double %x, %y
- %r = select i1 %s, double %x, double %y
- ret double %r
-}
-
-define double @tst_select_fcmp_oge_double(double %x, double %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_oge_double:
-
- ; M2: c.ult.d $f12, $f14
- ; M3: c.ult.d $f12, $f13
- ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.d $f12, $f14
- ; M3: mov.d $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.d $f0, $f12
-
- ; CMOV-32: c.ult.d $f12, $f14
- ; CMOV-32: movf.d $f14, $f12, $fcc0
- ; CMOV-32: mov.d $f0, $f14
-
- ; SEL-32: cmp.le.d $f0, $f14, $f12
- ; SEL-32: sel.d $f0, $f14, $f12
-
- ; CMOV-64: c.ult.d $f12, $f13
- ; CMOV-64: movf.d $f13, $f12, $fcc0
- ; CMOV-64: mov.d $f0, $f13
-
- ; SEL-64: cmp.le.d $f0, $f13, $f12
- ; SEL-64: sel.d $f0, $f13, $f12
- %s = fcmp oge double %x, %y
- %r = select i1 %s, double %x, double %y
- ret double %r
-}
-
-define double @tst_select_fcmp_oeq_double(double %x, double %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_oeq_double:
-
- ; M2: c.eq.d $f12, $f14
- ; M3: c.eq.d $f12, $f13
- ; M2-M3: bc1t $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.d $f12, $f14
- ; M3: mov.d $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.d $f0, $f12
-
- ; CMOV-32: c.eq.d $f12, $f14
- ; CMOV-32: movt.d $f14, $f12, $fcc0
- ; CMOV-32: mov.d $f0, $f14
-
- ; SEL-32: cmp.eq.d $f0, $f12, $f14
- ; SEL-32: sel.d $f0, $f14, $f12
-
- ; CMOV-64: c.eq.d $f12, $f13
- ; CMOV-64: movt.d $f13, $f12, $fcc0
- ; CMOV-64: mov.d $f0, $f13
-
- ; SEL-64: cmp.eq.d $f0, $f12, $f13
- ; SEL-64: sel.d $f0, $f13, $f12
- %s = fcmp oeq double %x, %y
- %r = select i1 %s, double %x, double %y
- ret double %r
-}
-
-define double @tst_select_fcmp_one_double(double %x, double %y) {
-entry:
- ; ALL-LABEL: tst_select_fcmp_one_double:
-
- ; M2: c.ueq.d $f12, $f14
- ; M3: c.ueq.d $f12, $f13
- ; M2-M3: bc1f $[[BB0:BB[0-9_]+]]
- ; M2-M3: nop
- ; M2: mov.d $f12, $f14
- ; M3: mov.d $f12, $f13
- ; M2-M3: $[[BB0]]:
- ; M2-M3: jr $ra
- ; M2-M3: mov.d $f0, $f12
-
- ; CMOV-32: c.ueq.d $f12, $f14
- ; CMOV-32: movf.d $f14, $f12, $fcc0
- ; CMOV-32: mov.d $f0, $f14
-
- ; SEL-32: cmp.ueq.d $f0, $f12, $f14
- ; SEL-32: mfc1 $[[T0:[0-9]+]], $f0
- ; SEL-32: not $[[T0]], $[[T0]]
- ; SEL-32: mtc1 $[[T0:[0-9]+]], $f0
- ; SEL-32: sel.d $f0, $f14, $f12
-
- ; CMOV-64: c.ueq.d $f12, $f13
- ; CMOV-64: movf.d $f13, $f12, $fcc0
- ; CMOV-64: mov.d $f0, $f13
-
- ; SEL-64: cmp.ueq.d $f0, $f12, $f13
- ; SEL-64: mfc1 $[[T0:[0-9]+]], $f0
- ; SEL-64: not $[[T0]], $[[T0]]
- ; SEL-64: mtc1 $[[T0:[0-9]+]], $f0
- ; SEL-64: sel.d $f0, $f13, $f12
- %s = fcmp one double %x, %y
- %r = select i1 %s, double %x, double %y
- ret double %r
-}
diff --git a/test/CodeGen/Mips/llvm-ir/shl.ll b/test/CodeGen/Mips/llvm-ir/shl.ll
index bba34c47ea82..fa43840a8b7b 100644
--- a/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -1,42 +1,33 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=M2 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 -check-prefix=NOT-R2-R6 \
-; RUN: -check-prefix=32R1-R5
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5 -check-prefix=R2-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5 -check-prefix=R2-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R1-R5 -check-prefix=R2-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32 \
-; RUN: -check-prefix=32R6 -check-prefix=R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=M3 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64 \
-; RUN: -check-prefix=64R6 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,M2,NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,NOT-R2-R6,32R1-R5
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5,R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5,R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R1-R5,R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,32R6,R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,M3,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6,R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6,R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,GP64-NOT-R6,R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64,64R6,R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MMR3
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MMR6
define signext i1 @shl_i1(i1 signext %a, i1 signext %b) {
entry:
@@ -61,6 +52,10 @@ entry:
; R2-R6: sllv $[[T1:[0-9]+]], $4, $[[T0]]
; R2-R6: seb $2, $[[T1]]
+ ; MM: andi16 $[[T0:[0-9]+]], $5, 255
+ ; MM: sllv $[[T1:[0-9]+]], $4, $[[T0]]
+ ; MM: seb $2, $[[T1]]
+
%r = shl i8 %a, %b
ret i8 %r
}
@@ -78,6 +73,10 @@ entry:
; R2-R6: sllv $[[T1:[0-9]+]], $4, $[[T0]]
; R2-R6: seh $2, $[[T1]]
+ ; MM: andi16 $[[T0:[0-9]+]], $5, 65535
+ ; MM: sllv $[[T1:[0-9]+]], $4, $[[T0]]
+ ; MM: seh $2, $[[T1]]
+
%r = shl i16 %a, %b
ret i16 %r
}
@@ -139,6 +138,29 @@ entry:
; GP64: dsllv $2, $4, $5
+ ; MMR3: sllv $[[T0:[0-9]+]], $4, $7
+ ; MMR3: srl16 $[[T1:[0-9]+]], $5, 1
+ ; MMR3: not16 $[[T2:[0-9]+]], $7
+ ; MMR3: srlv $[[T3:[0-9]+]], $[[T1]], $[[T2]]
+ ; MMR3: or16 $[[T4:[0-9]+]], $[[T0]]
+ ; MMR3: sllv $[[T5:[0-9]+]], $5, $7
+ ; MMR3: andi16 $[[T6:[0-9]+]], $7, 32
+ ; MMR3: movn $[[T7:[0-9]+]], $[[T5]], $[[T6]]
+ ; MMR3: lui $[[T8:[0-9]+]], 0
+ ; MMR3: movn $3, $[[T8]], $[[T6]]
+
+ ; MMR6: sllv $[[T0:[0-9]+]], $4, $7
+ ; MMR6: srl16 $[[T1:[0-9]+]], $5, 1
+ ; MMR6: not16 $[[T2:[0-9]+]], $7
+ ; MMR6: srlv $[[T3:[0-9]+]], $[[T1]], $[[T2]]
+ ; MMR6: or16 $[[T4:[0-9]+]], $[[T0]]
+ ; MMR6: andi16 $[[T5:[0-9]+]], $7, 32
+ ; MMR6: seleqz $[[T6:[0-9]+]], $[[T4]], $[[T5]]
+ ; MMR6: sllv $[[T7:[0-9]+]], $5, $7
+ ; MMR6: selnez $[[T8:[0-9]+]], $[[T7]], $[[T5]]
+ ; MMR6: or $2, $[[T8]], $[[T6]]
+ ; MMR6: seleqz $3, $[[T7]], $[[T5]]
+
%r = shl i64 %a, %b
ret i64 %r
}
@@ -194,6 +216,8 @@ entry:
; 64R6: jr $ra
; 64R6: seleqz $3, $[[T9]], $[[T7]]
+ ; MM: lw $25, %call16(__ashlti3)($2)
+
%r = shl i128 %a, %b
ret i128 %r
}
diff --git a/test/CodeGen/Mips/llvm-ir/srem.ll b/test/CodeGen/Mips/llvm-ir/srem.ll
index ceb53ee7033a..3431922b6c58 100644
--- a/test/CodeGen/Mips/llvm-ir/srem.ll
+++ b/test/CodeGen/Mips/llvm-ir/srem.ll
@@ -1,32 +1,37 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=GP32 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=GP32 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=GP32 \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s -check-prefix=GP32 \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s -check-prefix=GP32 \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=GP32 -check-prefix=R6 -check-prefix=R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=64R6 -check-prefix=R6 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,R2-R5,R2-R6,NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,R2-R5,R2-R6,NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,R2-R5,R2-R6,NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,R6,R2-R6
+
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64-NOT-R6,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64-NOT-R6,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64-NOT-R6,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R2-R5,R2-R6,GP64-NOT-R6,NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R2-R5,R2-R6,GP64-NOT-R6,NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R2-R5,R2-R6,GP64-NOT-R6,NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,64R6,R6,R2-R6
+
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR3,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM64
define signext i1 @srem_i1(i1 signext %a, i1 signext %b) {
entry:
@@ -43,6 +48,17 @@ entry:
; R6: sll $[[T3:[0-9]+]], $[[T0]], 31
; R6: sra $2, $[[T3]], 31
+ ; MMR3: div $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mfhi $[[T0:[0-9]+]]
+ ; MMR3: sll $[[T1:[0-9]+]], $[[T0]], 31
+ ; MMR3: sra $2, $[[T1]], 31
+
+ ; MMR6: mod $[[T0:[0-9]+]], $4, $5
+ ; MMR6: teq $5, $zero, 7
+ ; MMR6: sll $[[T1:[0-9]+]], $[[T0]], 31
+ ; MMR6: sra $2, $[[T1]], 31
+
%r = srem i1 %a, %b
ret i1 %r
}
@@ -66,6 +82,15 @@ entry:
; R6: teq $5, $zero, 7
; R6: seb $2, $[[T0]]
+ ; MMR3: div $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mfhi $[[T0:[0-9]+]]
+ ; MMR3: seb $2, $[[T0]]
+
+ ; MMR6: mod $[[T0:[0-9]+]], $4, $5
+ ; MMR6: teq $5, $zero, 7
+ ; MMR6: seb $2, $[[T0]]
+
%r = srem i8 %a, %b
ret i8 %r
}
@@ -83,12 +108,21 @@ entry:
; R2-R5: div $zero, $4, $5
; R2-R5: teq $5, $zero, 7
; R2-R5: mfhi $[[T0:[0-9]+]]
- ; R2-R5: seh $2, $[[T1]]
+ ; R2-R5: seh $2, $[[T0]]
; R6: mod $[[T0:[0-9]+]], $4, $5
; R6: teq $5, $zero, 7
; R6: seh $2, $[[T0]]
+ ; MMR3: div $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mfhi $[[T0:[0-9]+]]
+ ; MMR3: seh $2, $[[T0]]
+
+ ; MMR6: mod $[[T0:[0-9]+]], $4, $5
+ ; MMR6: teq $5, $zero, 7
+ ; MMR6: seh $2, $[[T0]]
+
%r = srem i16 %a, %b
ret i16 %r
}
@@ -104,6 +138,13 @@ entry:
; R6: mod $2, $4, $5
; R6: teq $5, $zero, 7
+ ; MMR3: div $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mfhi $2
+
+ ; MMR6: mod $2, $4, $5
+ ; MMR6: teq $5, $zero, 7
+
%r = srem i32 %a, %b
ret i32 %r
}
@@ -121,6 +162,11 @@ entry:
; 64R6: dmod $2, $4, $5
; 64R6: teq $5, $zero, 7
+ ; MM32: lw $25, %call16(__moddi3)($2)
+
+ ; MM64: dmod $2, $4, $5
+ ; MM64: teq $5, $zero, 7
+
%r = srem i64 %a, %b
ret i64 %r
}
@@ -132,7 +178,11 @@ entry:
; GP32: lw $25, %call16(__modti3)($gp)
; GP64-NOT-R6: ld $25, %call16(__modti3)($gp)
- ; 64-R6: ld $25, %call16(__modti3)($gp)
+ ; 64R6: ld $25, %call16(__modti3)($gp)
+
+ ; MM32: lw $25, %call16(__modti3)($2)
+
+ ; MM64: ld $25, %call16(__modti3)($2)
%r = srem i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/store-atomic.ll b/test/CodeGen/Mips/llvm-ir/store-atomic.ll
index 6b33f2685d17..8624cf6c1c66 100644
--- a/test/CodeGen/Mips/llvm-ir/store-atomic.ll
+++ b/test/CodeGen/Mips/llvm-ir/store-atomic.ll
@@ -1,9 +1,9 @@
; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL
; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL
; RUN: llc -march=mips64 -mcpu=mips64r2 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=M64
+; RUN: FileCheck %s -check-prefixes=ALL,M64
; RUN: llc -march=mips64 -mcpu=mips64r6 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=M64
+; RUN: FileCheck %s -check-prefixes=ALL,M64
define void @store_i8(i8* %ptr, i8 signext %v) {
; ALL-LABEL: store_i8
diff --git a/test/CodeGen/Mips/llvm-ir/sub.ll b/test/CodeGen/Mips/llvm-ir/sub.ll
index 164975844d73..33757657ad91 100644
--- a/test/CodeGen/Mips/llvm-ir/sub.ll
+++ b/test/CodeGen/Mips/llvm-ir/sub.ll
@@ -1,37 +1,47 @@
; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM
; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM
; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=R2-R6,GP32,GP32-NOT-MM,NOT-MM
; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=R2-R6,GP32,GP32-NOT-MM,NOT-MM
; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=R2-R6,GP32,GP32-NOT-MM,NOT-MM
; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP32
+; RUN: -check-prefixes=R2-R6,GP32,GP32-NOT-MM,NOT-MM
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=GP32-MM,GP32,MM
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=GP32-MM,GP32,MM
; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM
; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM
; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=NOT-R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM
; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=R2-R6,GP64,NOT-MM
; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=R2-R6,GP64,NOT-MM
; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=R2-R6,GP64,NOT-MM
; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=R2-R6 -check-prefix=GP64
+; RUN: -check-prefixes=R2-R6,GP64,NOT-MM
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=GP64,MM
define signext i1 @sub_i1(i1 signext %a, i1 signext %b) {
entry:
; ALL-LABEL: sub_i1:
- ; ALL: subu $[[T0:[0-9]+]], $4, $5
- ; ALL: sll $[[T0]], $[[T0]], 31
- ; ALL: sra $2, $[[T0]], 31
+ ; NOT-MM: subu $[[T0:[0-9]+]], $4, $5
+ ; NOT-MM: sll $[[T0]], $[[T0]], 31
+ ; NOT-MM: sra $2, $[[T0]], 31
+
+ ; MM: subu16 $[[T0:[0-9]+]], $4, $5
+ ; MM: sll $[[T1:[0-9]+]], $[[T0]], 31
+ ; MM: sra $[[T0]], $[[T1]], 31
%r = sub i1 %a, %b
ret i1 %r
@@ -48,6 +58,9 @@ entry:
; R2-R6: subu $[[T0:[0-9]+]], $4, $5
; R2-R6: seb $2, $[[T0:[0-9]+]]
+ ; MM: subu16 $[[T0:[0-9]+]], $4, $5
+ ; MM: seb $[[T0]], $[[T0]]
+
%r = sub i8 %a, %b
ret i8 %r
}
@@ -63,6 +76,9 @@ entry:
; R2-R6: subu $[[T0:[0-9]+]], $4, $5
; R2-R6: seh $2, $[[T0:[0-9]+]]
+ ; MM: subu16 $[[T0:[0-9]+]], $4, $5
+ ; MM: seh $[[T0]], $[[T0]]
+
%r = sub i16 %a, %b
ret i16 %r
}
@@ -71,7 +87,9 @@ define signext i32 @sub_i32(i32 signext %a, i32 signext %b) {
entry:
; ALL-LABEL: sub_i32:
- ; ALL: subu $2, $4, $5
+ ; NOT-MM: subu $2, $4, $5
+
+ ; MM: subu16 $2, $4, $5
%r = sub i32 %a, %b
ret i32 %r
@@ -96,26 +114,42 @@ define signext i128 @sub_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: sub_i128:
- ; GP32: lw $[[T0:[0-9]+]], 20($sp)
- ; GP32: sltu $[[T1:[0-9]+]], $5, $[[T0]]
- ; GP32: lw $[[T2:[0-9]+]], 16($sp)
- ; GP32: addu $[[T3:[0-9]+]], $[[T1]], $[[T2]]
- ; GP32: lw $[[T4:[0-9]+]], 24($sp)
- ; GP32: lw $[[T5:[0-9]+]], 28($sp)
- ; GP32: subu $[[T6:[0-9]+]], $7, $[[T5]]
- ; GP32: subu $2, $4, $[[T3]]
- ; GP32: sltu $[[T8:[0-9]+]], $6, $[[T4]]
- ; GP32: addu $[[T9:[0-9]+]], $[[T8]], $[[T0]]
- ; GP32: subu $3, $5, $[[T9]]
- ; GP32: sltu $[[T10:[0-9]+]], $7, $[[T5]]
- ; GP32: addu $[[T11:[0-9]+]], $[[T10]], $[[T4]]
- ; GP32: subu $4, $6, $[[T11]]
- ; GP32: move $5, $[[T6]]
-
- ; GP64: dsubu $3, $5, $7
- ; GP64: sltu $[[T0:[0-9]+]], $5, $7
- ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6
- ; GP64: dsubu $2, $4, $[[T1]]
+ ; GP32-NOT-MM: lw $[[T0:[0-9]+]], 20($sp)
+ ; GP32-NOT-MM: sltu $[[T1:[0-9]+]], $5, $[[T0]]
+ ; GP32-NOT-MM: lw $[[T2:[0-9]+]], 16($sp)
+ ; GP32-NOT-MM: addu $[[T3:[0-9]+]], $[[T1]], $[[T2]]
+ ; GP32-NOT-MM: lw $[[T4:[0-9]+]], 24($sp)
+ ; GP32-NOT-MM: lw $[[T5:[0-9]+]], 28($sp)
+ ; GP32-NOT-MM: subu $[[T6:[0-9]+]], $7, $[[T5]]
+ ; GP32-NOT-MM: subu $2, $4, $[[T3]]
+ ; GP32-NOT-MM: sltu $[[T8:[0-9]+]], $6, $[[T4]]
+ ; GP32-NOT-MM: addu $[[T9:[0-9]+]], $[[T8]], $[[T0]]
+ ; GP32-NOT-MM: subu $3, $5, $[[T9]]
+ ; GP32-NOT-MM: sltu $[[T10:[0-9]+]], $7, $[[T5]]
+ ; GP32-NOT-MM: addu $[[T11:[0-9]+]], $[[T10]], $[[T4]]
+ ; GP32-NOT-MM: subu $4, $6, $[[T11]]
+ ; GP32-NOT-MM: move $5, $[[T6]]
+
+ ; GP32-MM: lw $[[T0:[0-9]+]], 20($sp)
+ ; GP32-MM: sltu $[[T1:[0-9]+]], $[[T2:[0-9]+]], $[[T0]]
+ ; GP32-MM: lw $[[T3:[0-9]+]], 16($sp)
+ ; GP32-MM: addu $[[T3]], $[[T1]], $[[T3]]
+ ; GP32-MM: lw $[[T4:[0-9]+]], 28($sp)
+ ; GP32-MM: subu $[[T1]], $7, $[[T4]]
+ ; GP32-MM: subu $[[T3]], $[[T5:[0-9]+]], $[[T3]]
+ ; GP32-MM: lw $[[T5]], 24($sp)
+ ; GP32-MM: sltu $[[T6:[0-9]+]], $6, $[[T5]]
+ ; GP32-MM: addu $[[T0]], $[[T6]], $[[T0]]
+ ; GP32-MM: subu $[[T0]], $5, $[[T0]]
+ ; GP32-MM: sltu $[[T2]], $7, $[[T4]]
+ ; GP32-MM: addu $[[T5]], $[[T2]], $[[T5]]
+ ; GP32-MM: subu $[[T5]], $6, $[[T5]]
+ ; GP32-MM: move $[[T2]], $[[T1]]
+
+ ; GP64: dsubu $3, $5, $7
+ ; GP64: sltu $[[T0:[0-9]+]], $5, $7
+ ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6
+ ; GP64: dsubu $2, $4, $[[T1]]
%r = sub i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/udiv.ll b/test/CodeGen/Mips/llvm-ir/udiv.ll
index a7cafe52d1ac..6f4dcb5d7bb5 100644
--- a/test/CodeGen/Mips/llvm-ir/udiv.ll
+++ b/test/CodeGen/Mips/llvm-ir/udiv.ll
@@ -1,29 +1,37 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=R6 -check-prefix=GP32
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=NOT-R6 -check-prefix=GP64-NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=R6 -check-prefix=64R6
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R6,GP32
+
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,NOT-R6,GP64-NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R6,64R6
+
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR3,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM64
define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) {
entry:
@@ -36,6 +44,13 @@ entry:
; R6: divu $2, $4, $5
; R6: teq $5, $zero, 7
+ ; MMR3: divu $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mflo $2
+
+ ; MMR6: divu $2, $4, $5
+ ; MMR6: teq $5, $zero, 7
+
%r = udiv i1 %a, %b
ret i1 %r
}
@@ -51,6 +66,13 @@ entry:
; R6: divu $2, $4, $5
; R6: teq $5, $zero, 7
+ ; MMR3: divu $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mflo $2
+
+ ; MMR6: divu $2, $4, $5
+ ; MMR6: teq $5, $zero, 7
+
%r = udiv i8 %a, %b
ret i8 %r
}
@@ -66,6 +88,13 @@ entry:
; R6: divu $2, $4, $5
; R6: teq $5, $zero, 7
+ ; MMR3: divu $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mflo $2
+
+ ; MMR6: divu $2, $4, $5
+ ; MMR6: teq $5, $zero, 7
+
%r = udiv i16 %a, %b
ret i16 %r
}
@@ -81,6 +110,13 @@ entry:
; R6: divu $2, $4, $5
; R6: teq $5, $zero, 7
+ ; MMR3: divu $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mflo $2
+
+ ; MMR6: divu $2, $4, $5
+ ; MMR6: teq $5, $zero, 7
+
%r = udiv i32 %a, %b
ret i32 %r
}
@@ -98,6 +134,11 @@ entry:
; 64R6: ddivu $2, $4, $5
; 64R6: teq $5, $zero, 7
+ ; MM32: lw $25, %call16(__udivdi3)($2)
+
+ ; MM64: ddivu $2, $4, $5
+ ; MM64: teq $5, $zero, 7
+
%r = udiv i64 %a, %b
ret i64 %r
}
@@ -111,6 +152,10 @@ entry:
; GP64-NOT-R6: ld $25, %call16(__udivti3)($gp)
; 64-R6: ld $25, %call16(__udivti3)($gp)
+ ; MM32: lw $25, %call16(__udivti3)($2)
+
+ ; MM64: ld $25, %call16(__udivti3)($2)
+
%r = udiv i128 %a, %b
ret i128 %r
}
diff --git a/test/CodeGen/Mips/llvm-ir/urem.ll b/test/CodeGen/Mips/llvm-ir/urem.ll
index d5a231c8dfca..69b13ba7fee6 100644
--- a/test/CodeGen/Mips/llvm-ir/urem.ll
+++ b/test/CodeGen/Mips/llvm-ir/urem.ll
@@ -1,32 +1,37 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=GP32 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=GP32 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=GP32 \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s -check-prefix=GP32 \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s -check-prefix=GP32 \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=GP32 -check-prefix=R6 -check-prefix=R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6 -check-prefix=NOT-R2-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=R2-R5 -check-prefix=R2-R6 \
-; RUN: -check-prefix=GP64-NOT-R6 -check-prefix=NOT-R6
-; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=64R6 -check-prefix=R6 -check-prefix=R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,R2-R5,R2-R6,NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,R2-R5,R2-R6,NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,R2-R5,R2-R6,NOT-R6
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP32,R6,R2-R6
+
+; RUN: llc < %s -march=mips64 -mcpu=mips3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64-NOT-R6,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64-NOT-R6,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,GP64-NOT-R6,NOT-R6,NOT-R2-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R2-R5,R2-R6,GP64-NOT-R6,NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r3 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R2-R5,R2-R6,GP64-NOT-R6,NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r5 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,R2-R5,R2-R6,GP64-NOT-R6,NOT-R6
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,64R6,R6,R2-R6
+
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR3,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -relocation-model=pic | FileCheck %s \
+; RUN: -check-prefixes=ALL,MMR6,MM64
define signext i1 @urem_i1(i1 signext %a, i1 signext %b) {
entry:
@@ -47,6 +52,21 @@ entry:
; R6: sll $[[T3:[0-9]+]], $[[T2]], 31
; R6: sra $2, $[[T3]], 31
+ ; MMR3: andi16 $[[T0:[0-9]+]], $5, 1
+ ; MMR3: andi16 $[[T1:[0-9]+]], $4, 1
+ ; MMR3: divu $zero, $[[T1]], $[[T0]]
+ ; MMR3: teq $[[T0]], $zero, 7
+ ; MMR3: mfhi $[[T2:[0-9]+]]
+ ; MMR3: sll $[[T3:[0-9]+]], $[[T2]], 31
+ ; MMR3: sra $2, $[[T3]], 31
+
+ ; MMR6: andi16 $[[T0:[0-9]+]], $5, 1
+ ; MMR6: andi16 $[[T1:[0-9]+]], $4, 1
+ ; MMR6: modu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+ ; MMR6: teq $[[T0]], $zero, 7
+ ; MMR6: sll $[[T3:[0-9]+]], $[[T2]], 31
+ ; MMR6: sra $2, $[[T3]], 31
+
%r = urem i1 %a, %b
ret i1 %r
}
@@ -76,6 +96,19 @@ entry:
; R6: teq $[[T0]], $zero, 7
; R6: seb $2, $[[T2]]
+ ; MMR3: andi16 $[[T0:[0-9]+]], $5, 255
+ ; MMR3: andi16 $[[T1:[0-9]+]], $4, 255
+ ; MMR3: divu $zero, $[[T1]], $[[T0]]
+ ; MMR3: teq $[[T0]], $zero, 7
+ ; MMR3: mfhi $[[T2:[0-9]+]]
+ ; MMR3: seb $2, $[[T2]]
+
+ ; MMR6: andi16 $[[T0:[0-9]+]], $5, 255
+ ; MMR6: andi16 $[[T1:[0-9]+]], $4, 255
+ ; MMR6: modu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+ ; MMR6: teq $[[T0]], $zero, 7
+ ; MMR6: seb $2, $[[T2]]
+
%r = urem i8 %a, %b
ret i8 %r
}
@@ -105,6 +138,19 @@ entry:
; R6: teq $[[T0]], $zero, 7
; R6: seh $2, $[[T2]]
+ ; MMR3: andi16 $[[T0:[0-9]+]], $5, 65535
+ ; MMR3: andi16 $[[T1:[0-9]+]], $4, 65535
+ ; MMR3: divu $zero, $[[T1]], $[[T0]]
+ ; MMR3: teq $[[T0]], $zero, 7
+ ; MMR3: mfhi $[[T2:[0-9]+]]
+ ; MMR3: seh $2, $[[T2]]
+
+ ; MMR6: andi16 $[[T0:[0-9]+]], $5, 65535
+ ; MMR6: andi16 $[[T1:[0-9]+]], $4, 65535
+ ; MMR6: modu $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+ ; MMR6: teq $[[T0]], $zero, 7
+ ; MMR6: seh $2, $[[T2]]
+
%r = urem i16 %a, %b
ret i16 %r
}
@@ -120,6 +166,13 @@ entry:
; R6: modu $2, $4, $5
; R6: teq $5, $zero, 7
+ ; MMR3: divu $zero, $4, $5
+ ; MMR3: teq $5, $zero, 7
+ ; MMR3: mfhi $2
+
+ ; MMR6: modu $2, $4, $5
+ ; MMR6: teq $5, $zero, 7
+
%r = urem i32 %a, %b
ret i32 %r
}
@@ -137,6 +190,11 @@ entry:
; 64R6: dmodu $2, $4, $5
; 64R6: teq $5, $zero, 7
+ ; MM32: lw $25, %call16(__umoddi3)($2)
+
+ ; MM64: dmodu $2, $4, $5
+ ; MM64: teq $5, $zero, 7
+
%r = urem i64 %a, %b
ret i64 %r
}
@@ -145,10 +203,14 @@ define signext i128 @urem_i128(i128 signext %a, i128 signext %b) {
entry:
; ALL-LABEL: urem_i128:
- ; GP32: lw $25, %call16(__umodti3)($gp)
+ ; GP32: lw $25, %call16(__umodti3)($gp)
+
+ ; GP64-NOT-R6: ld $25, %call16(__umodti3)($gp)
+ ; 64R6: ld $25, %call16(__umodti3)($gp)
+
+ ; MM32: lw $25, %call16(__umodti3)($2)
- ; GP64-NOT-R6: ld $25, %call16(__umodti3)($gp)
- ; 64-R6: ld $25, %call16(__umodti3)($gp)
+ ; MM64: ld $25, %call16(__umodti3)($2)
%r = urem i128 %a, %b
ret i128 %r
diff --git a/test/CodeGen/Mips/llvm-ir/xor.ll b/test/CodeGen/Mips/llvm-ir/xor.ll
index d3cc57484895..0ba696fbc339 100644
--- a/test/CodeGen/Mips/llvm-ir/xor.ll
+++ b/test/CodeGen/Mips/llvm-ir/xor.ll
@@ -1,35 +1,44 @@
-; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s -check-prefixes=ALL,GP32
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP32
+; RUN: -check-prefixes=ALL,GP32
; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
-; RUN: -check-prefix=ALL -check-prefix=GP64
+; RUN: -check-prefixes=ALL,GP64
+; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM32
+; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips | FileCheck %s \
+; RUN: -check-prefixes=ALL,MM,MM64
define signext i1 @xor_i1(i1 signext %a, i1 signext %b) {
entry:
; ALL-LABEL: xor_i1:
- ; ALL: xor $2, $4, $5
+ ; GP32: xor $2, $4, $5
+
+ ; GP64: xor $2, $4, $5
+
+ ; MM: xor16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = xor i1 %a, %b
ret i1 %r
@@ -39,7 +48,12 @@ define signext i8 @xor_i8(i8 signext %a, i8 signext %b) {
entry:
; ALL-LABEL: xor_i8:
- ; ALL: xor $2, $4, $5
+ ; GP32: xor $2, $4, $5
+
+ ; GP64: xor $2, $4, $5
+
+ ; MM: xor16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = xor i8 %a, %b
ret i8 %r
@@ -49,7 +63,12 @@ define signext i16 @xor_i16(i16 signext %a, i16 signext %b) {
entry:
; ALL-LABEL: xor_i16:
- ; ALL: xor $2, $4, $5
+ ; GP32: xor $2, $4, $5
+
+ ; GP64: xor $2, $4, $5
+
+ ; MM: xor16 $[[T0:[0-9]+]], $5
+ ; MM: move $2, $[[T0]]
%r = xor i16 %a, %b
ret i16 %r
@@ -64,6 +83,12 @@ entry:
; GP64: xor $[[T0:[0-9]+]], $4, $5
; GP64: sll $2, $[[T0]], 0
+ ; MM32: xor16 $[[T0:[0-9]+]], $5
+ ; MM32: move $2, $[[T0]]
+
+ ; MM64: xor $[[T0:[0-9]+]], $4, $5
+ ; MM64: sll $2, $[[T0]], 0
+
%r = xor i32 %a, %b
ret i32 %r
}
@@ -77,6 +102,13 @@ entry:
; GP64: xor $2, $4, $5
+ ; MM32: xor16 $[[T0:[0-9]+]], $6
+ ; MM32: xor16 $[[T1:[0-9]+]], $7
+ ; MM32: move $2, $[[T0]]
+ ; MM32: move $3, $[[T1]]
+
+ ; MM64: xor $2, $4, $5
+
%r = xor i64 %a, %b
ret i64 %r
}
@@ -97,6 +129,102 @@ entry:
; GP64: xor $2, $4, $6
; GP64: xor $3, $5, $7
+ ; MM32: lw $[[T0:[0-9]+]], 20($sp)
+ ; MM32: lw $[[T1:[0-9]+]], 16($sp)
+ ; MM32: xor16 $[[T1]], $4
+ ; MM32: xor16 $[[T0]], $5
+ ; MM32: lw $[[T2:[0-9]+]], 24($sp)
+ ; MM32: xor16 $[[T2]], $6
+ ; MM32: lw $[[T3:[0-9]+]], 28($sp)
+ ; MM32: xor16 $[[T3]], $7
+
+ ; MM64: xor $2, $4, $6
+ ; MM64: xor $3, $5, $7
+
%r = xor i128 %a, %b
ret i128 %r
}
+
+define signext i1 @xor_i1_4(i1 signext %b) {
+entry:
+; ALL-LABEL: xor_i1_4:
+
+ ; ALL: move $2, $4
+
+ %r = xor i1 4, %b
+ ret i1 %r
+}
+
+define signext i8 @xor_i8_4(i8 signext %b) {
+entry:
+; ALL-LABEL: xor_i8_4:
+
+ ; ALL: xori $2, $4, 4
+
+ %r = xor i8 4, %b
+ ret i8 %r
+}
+
+define signext i16 @xor_i16_4(i16 signext %b) {
+entry:
+; ALL-LABEL: xor_i16_4:
+
+ ; ALL: xori $2, $4, 4
+
+ %r = xor i16 4, %b
+ ret i16 %r
+}
+
+define signext i32 @xor_i32_4(i32 signext %b) {
+entry:
+; ALL-LABEL: xor_i32_4:
+
+ ; ALL: xori $2, $4, 4
+
+ %r = xor i32 4, %b
+ ret i32 %r
+}
+
+define signext i64 @xor_i64_4(i64 signext %b) {
+entry:
+; ALL-LABEL: xor_i64_4:
+
+ ; GP32: xori $3, $5, 4
+ ; GP32: move $2, $4
+
+ ; GP64: xori $2, $4, 4
+
+ ; MM32: xori $3, $5, 4
+ ; MM32: move $2, $4
+
+ ; MM64: xori $2, $4, 4
+
+ %r = xor i64 4, %b
+ ret i64 %r
+}
+
+define signext i128 @xor_i128_4(i128 signext %b) {
+entry:
+; ALL-LABEL: xor_i128_4:
+
+ ; GP32: xori $[[T0:[0-9]+]], $7, 4
+ ; GP32: move $2, $4
+ ; GP32: move $3, $5
+ ; GP32: move $4, $6
+ ; GP32: move $5, $[[T0]]
+
+ ; GP64: xori $3, $5, 4
+ ; GP64: move $2, $4
+
+ ; MM32: xori $[[T0:[0-9]+]], $7, 4
+ ; MM32: move $2, $4
+ ; MM32: move $3, $5
+ ; MM32: move $4, $6
+ ; MM32: move $5, $[[T0]]
+
+ ; MM64: xori $3, $5, 4
+ ; MM64: move $2, $4
+
+ %r = xor i128 4, %b
+ ret i128 %r
+}
diff --git a/test/CodeGen/Mips/load-store-left-right.ll b/test/CodeGen/Mips/load-store-left-right.ll
index a01d246ae460..3bd924a81200 100644
--- a/test/CodeGen/Mips/load-store-left-right.ll
+++ b/test/CodeGen/Mips/load-store-left-right.ll
@@ -1,17 +1,17 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EL %s
-; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EB %s
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EL %s
-; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32 -check-prefix=MIPS32-EB %s
-; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32R6 -check-prefix=MIPS32R6-EL %s
-; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32R6 -check-prefix=MIPS32R6-EB %s
-; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
-; RUN: llc -march=mips64 -mcpu=mips4 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
-; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
-; RUN: llc -march=mips64 -mcpu=mips64 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
-; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EL %s
-; RUN: llc -march=mips64 -mcpu=mips64r2 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64 -check-prefix=MIPS64-EB %s
-; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EL %s
-; RUN: llc -march=mips64 -mcpu=mips64r6 -target-abi=n64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64R6 -check-prefix=MIPS64R6-EB %s
+; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS32,MIPS32-EL %s
+; RUN: llc -march=mips -mcpu=mips32 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS32,MIPS32-EB %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS32,MIPS32-EL %s
+; RUN: llc -march=mips -mcpu=mips32r2 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS32,MIPS32-EB %s
+; RUN: llc -march=mipsel -mcpu=mips32r6 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS32R6,MIPS32R6-EL %s
+; RUN: llc -march=mips -mcpu=mips32r6 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS32R6,MIPS32R6-EB %s
+; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EL %s
+; RUN: llc -march=mips64 -mcpu=mips4 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EL %s
+; RUN: llc -march=mips64 -mcpu=mips64 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EL %s
+; RUN: llc -march=mips64 -mcpu=mips64r2 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64R6,MIPS64R6-EL %s
+; RUN: llc -march=mips64 -mcpu=mips64r6 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64R6,MIPS64R6-EB %s
%struct.SLL = type { i64 }
%struct.SI = type { i32 }
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 9f5b7417b859..06eda11e7888 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -1,13 +1,17 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
-; RUN: llc -march=mipsel -force-mips-long-branch -O3 < %s \
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -force-mips-long-branch -O3 -relocation-model=pic < %s \
; RUN: | FileCheck %s -check-prefix=O32
-; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 -force-mips-long-branch -O3 \
+; RUN: llc -march=mipsel -mcpu=mips32r6 -force-mips-long-branch -O3 \
+; RUN: -relocation-model=pic -asm-show-inst < %s | FileCheck %s -check-prefix=O32-R6
+; RUN: llc -march=mips64el -mcpu=mips4 -target-abi=n64 -force-mips-long-branch -O3 -relocation-model=pic \
; RUN: < %s | FileCheck %s -check-prefix=N64
-; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 -force-mips-long-branch -O3 \
+; RUN: llc -march=mips64el -mcpu=mips64 -target-abi=n64 -force-mips-long-branch -O3 -relocation-model=pic \
; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 -force-mips-long-branch -O3 \
+; RUN: -relocation-model=pic -asm-show-inst < %s | FileCheck %s -check-prefix=N64-R6
; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=micromips \
-; RUN: -force-mips-long-branch -O3 < %s | FileCheck %s -check-prefix=MICROMIPS
-; RUN: llc -mtriple=mipsel-none-nacl -force-mips-long-branch -O3 < %s \
+; RUN: -force-mips-long-branch -O3 -relocation-model=pic < %s | FileCheck %s -check-prefix=MICROMIPS
+; RUN: llc -mtriple=mipsel-none-nacl -force-mips-long-branch -O3 -relocation-model=pic < %s \
; RUN: | FileCheck %s -check-prefix=NACL
@@ -72,6 +76,10 @@ end:
; O32: jr $ra
; O32: nop
+; In MIPS32R6, JR is an alias of JALR with $rd=0. As everything else remains the
+; same as with the O32 prefix, we use -asm-show-inst in order to make sure that
+; the opcode of the MachineInstr is a JALR.
+; O32-R6: JALR
; Check the MIPS64 version.
@@ -101,6 +109,11 @@ end:
; N64: jr $ra
; N64: nop
+; In MIPS64R6, JR is an alias of JALR with $rd=0. As everything else remains the
+; same as with the N64 prefix, we use -asm-show-inst in order to make sure that
+; the opcode of the MachineInstr is a JALR.
+; N64-R6: JALR64
+
; Check the microMIPS version.
@@ -155,7 +168,7 @@ end:
; NACL: lw $[[R1:[0-9]+]], %got(x)($[[GP]])
; NACL: addiu $[[R2:[0-9]+]], $zero, 1
; NACL: sw $[[R2]], 0($[[R1]])
-; NACL: .align 4
+; NACL: .p2align 4
; NACL-NEXT: $[[BB2]]:
; NACL: jr $ra
; NACL: nop
diff --git a/test/CodeGen/Mips/lw16-base-reg.ll b/test/CodeGen/Mips/lw16-base-reg.ll
new file mode 100644
index 000000000000..09150421a960
--- /dev/null
+++ b/test/CodeGen/Mips/lw16-base-reg.ll
@@ -0,0 +1,26 @@
+; RUN: llc %s -march=mips -mcpu=mips32r3 -mattr=micromips -filetype=asm \
+; RUN: -relocation-model=pic -O3 -o - | FileCheck %s
+; RUN: llc %s -march=mips64 -mcpu=mips64r6 -mattr=micromips -filetype=asm \
+; RUN: -relocation-model=pic -O3 -o - | FileCheck %s
+
+; The purpose of this test is to check whether CodeGen selects the
+; LW16 instruction with a base register in the range $2-$7, $16, $17.
+
+%struct.T = type { i32 }
+
+$_ZN1TaSERKS_ = comdat any
+
+define linkonce_odr void @_ZN1TaSERKS_(%struct.T* %this, %struct.T* dereferenceable(4) %t) #0 comdat align 2 {
+entry:
+ %this.addr = alloca %struct.T*, align 4
+ %t.addr = alloca %struct.T*, align 4
+ %this1 = load %struct.T*, %struct.T** %this.addr, align 4
+ %0 = load %struct.T*, %struct.T** %t.addr, align 4
+ %V3 = getelementptr inbounds %struct.T, %struct.T* %0, i32 0, i32 0
+ %1 = load i32, i32* %V3, align 4
+ %V4 = getelementptr inbounds %struct.T, %struct.T* %this1, i32 0, i32 0
+ store i32 %1, i32* %V4, align 4
+ ret void
+}
+
+; CHECK: lw16 ${{[0-9]+}}, 0(${{[2-7]|16|17}})
diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll
index 667676de5f33..7baba005a072 100644
--- a/test/CodeGen/Mips/madd-msub.ll
+++ b/test/CodeGen/Mips/madd-msub.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32
-; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32
-; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R6
+; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefixes=ALL,32
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefixes=ALL,32
+; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefixes=ALL,32R6
; RUN: llc -march=mips -mcpu=mips32 -mattr=dsp < %s | FileCheck %s -check-prefix=DSP
-; RUN: llc -march=mips -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64
-; RUN: llc -march=mips -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64
-; RUN: llc -march=mips -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64R6
+; RUN: llc -march=mips -mcpu=mips64 -target-abi n64 < %s | FileCheck %s -check-prefixes=ALL,64
+; RUN: llc -march=mips -mcpu=mips64r2 -target-abi n64 < %s | FileCheck %s -check-prefixes=ALL,64
+; RUN: llc -march=mips -mcpu=mips64r6 -target-abi n64 < %s | FileCheck %s -check-prefixes=ALL,64R6
; FIXME: The MIPS16 test should check its output
; RUN: llc -march=mips -mattr=mips16 < %s
diff --git a/test/CodeGen/Mips/micromips-addiu.ll b/test/CodeGen/Mips/micromips-addiu.ll
index e0743c9c088b..84ebc4349e1f 100644
--- a/test/CodeGen/Mips/micromips-addiu.ll
+++ b/test/CodeGen/Mips/micromips-addiu.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips -verify-machineinstrs \
; RUN: -relocation-model=pic -O3 < %s | FileCheck %s
@x = global i32 65504, align 4
diff --git a/test/CodeGen/Mips/micromips-atomic1.ll b/test/CodeGen/Mips/micromips-atomic1.ll
index 37c3d7682e4f..d7c66c27b6e2 100644
--- a/test/CodeGen/Mips/micromips-atomic1.ll
+++ b/test/CodeGen/Mips/micromips-atomic1.ll
@@ -1,6 +1,5 @@
; RUN: llc -march=mipsel -filetype=obj --disable-machine-licm -mattr=micromips < %s -o - \
-; RUN: | llvm-objdump -no-show-raw-insn -arch mipsel -mcpu=mips32r2 -mattr=micromips -d - \
-; RUN: | FileCheck %s -check-prefix=MICROMIPS
+; RUN: | llvm-objdump -no-show-raw-insn -d - | FileCheck %s -check-prefix=MICROMIPS
; Use llvm-objdump to check whether the encodings of microMIPS atomic instructions are correct.
; While emitting assembly files directly when in microMIPS mode, it is possible to emit a mips32r2
diff --git a/test/CodeGen/Mips/micromips-delay-slot.ll b/test/CodeGen/Mips/micromips-delay-slot.ll
index ef6546232835..5c6aa36a4130 100644
--- a/test/CodeGen/Mips/micromips-delay-slot.ll
+++ b/test/CodeGen/Mips/micromips-delay-slot.ll
@@ -1,5 +1,7 @@
; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
; RUN: -relocation-model=static -O2 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=+micromips \
+; RUN: -relocation-model=static -O2 < %s | FileCheck %s -check-prefix=CHECK-MMR6
; Function Attrs: nounwind
define i32 @foo(i32 signext %a) #0 {
@@ -16,3 +18,5 @@ declare i32 @bar(i32 signext) #1
; CHECK: jals
; CHECK-NEXT: sll16
+; CHECK-MMR6: jal
+; CHECK-MMR6-NOT: sll16
diff --git a/test/CodeGen/Mips/micromips-lwc1-swc1.ll b/test/CodeGen/Mips/micromips-lwc1-swc1.ll
new file mode 100644
index 000000000000..a1a10a5de259
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-lwc1-swc1.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=mips -mcpu=mips32r3 -mattr=+micromips \
+; RUN: -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MM32
+; RUN: llc -march=mips -mcpu=mips32r6 -mattr=+micromips \
+; RUN: -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MM32
+; RUN: llc -march=mips -mcpu=mips64r6 -mattr=+micromips -target-abi n64 \
+; RUN: -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MM64
+
+@gf0 = external global float
+
+define float @test_lwc1() {
+entry:
+; CHECK-LABEL: test_lwc1
+; MM32: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; MM32: addiu $[[R1:[0-9]+]], $[[R0]], %lo(_gp_disp)
+; MM32: addu $[[R2:[0-9]+]], $[[R1]], $25
+; MM32: lw $[[R3:[0-9]+]], %got(gf0)($[[R2]])
+; MM32: lwc1 $f0, 0($[[R3]])
+
+; MM64: lui $[[R0:[0-9]+]], %hi(%neg(%gp_rel(test_lwc1)))
+; MM64: daddu $[[R1:[0-9]+]], $[[R0]], $25
+; MM64: daddiu $[[R2:[0-9]+]], $[[R1]], %lo(%neg(%gp_rel(test_lwc1)))
+; MM64: ld $[[R3:[0-9]+]], %got_disp(gf0)($[[R2]])
+; MM64: lwc1 $f0, 0($[[R3]])
+
+ %0 = load float, float* @gf0, align 4
+ ret float %0
+}
+
+define void @test_swc1(float %a) {
+entry:
+; CHECK-LABEL: test_swc1
+; MM32: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; MM32: addiu $[[R1:[0-9]+]], $[[R0]], %lo(_gp_disp)
+; MM32: addu $[[R2:[0-9]+]], $[[R1]], $25
+; MM32: lw $[[R3:[0-9]+]], %got(gf0)($[[R2]])
+; MM32: swc1 $f12, 0($[[R3]])
+
+; MM64: lui $[[R0:[0-9]+]], %hi(%neg(%gp_rel(test_swc1)))
+; MM64: daddu $[[R1:[0-9]+]], $[[R0]], $25
+; MM64: daddiu $[[R2:[0-9]+]], $[[R1]], %lo(%neg(%gp_rel(test_swc1)))
+; MM64: ld $[[R3:[0-9]+]], %got_disp(gf0)($[[R2]])
+; MM64: swc1 $f12, 0($[[R3]])
+
+ store float %a, float* @gf0, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/micromips-or16.ll b/test/CodeGen/Mips/micromips-or16.ll
index 82ea9c687df4..ae2c53884ef7 100644
--- a/test/CodeGen/Mips/micromips-or16.ll
+++ b/test/CodeGen/Mips/micromips-or16.ll
@@ -1,18 +1,23 @@
; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
; RUN: -relocation-model=pic -O3 < %s | FileCheck %s
+; RUN: llc -O0 -march=mips -mcpu=mips32r2 -mattr=+micromips \
+; RUN: -asm-show-inst < %s | FileCheck %s
-define i32 @main() {
-entry:
- %retval = alloca i32, align 4
- %a = alloca i32, align 4
- %b = alloca i32, align 4
- %c = alloca i32, align 4
- store i32 0, i32* %retval
- %0 = load i32, i32* %b, align 4
- %1 = load i32, i32* %c, align 4
- %or = or i32 %0, %1
- store i32 %or, i32* %a, align 4
- ret i32 0
+; A branch instruction is added so that FastISel::selectOperator
+; selects the OR instruction.
+define i32 @f1(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: f1
+; CHECK-NOT: OR16_MMR6
+ %1 = or i32 %a, %b
+ br label %b1
+b1:
+ ret i32 %1
}
+define i32 @f2(i32 signext %a, i32 signext %b) {
+entry:
+; CHECK-LABEL: f2
; CHECK: or16
+ %0 = or i32 %a, %b
+ ret i32 %0
+}
diff --git a/test/CodeGen/Mips/micromips-shift.ll b/test/CodeGen/Mips/micromips-shift.ll
index ed1bcbbf0831..a4f8ffe9408d 100644
--- a/test/CodeGen/Mips/micromips-shift.ll
+++ b/test/CodeGen/Mips/micromips-shift.ll
@@ -1,5 +1,7 @@
; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
; RUN: -relocation-model=pic -O3 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=+micromips \
+; RUN: -relocation-model=pic -O3 < %s | FileCheck %s
@a = global i32 10, align 4
@b = global i32 0, align 4
diff --git a/test/CodeGen/Mips/micromips-zero-mat-uses.ll b/test/CodeGen/Mips/micromips-zero-mat-uses.ll
deleted file mode 100644
index b38747a2d2c2..000000000000
--- a/test/CodeGen/Mips/micromips-zero-mat-uses.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+micromips,+nooddspreg -O0 < %s | FileCheck %s
-
-; CHECK: addiu $[[R0:[0-9]+]], $zero, 0
-; CHECK: subu16 $2, $[[R0]], ${{[0-9]+}}
-define i32 @foo() {
- %1 = sub i32 0, undef
- ret i32 %1
-}
diff --git a/test/CodeGen/Mips/mips-shf-gprel.s b/test/CodeGen/Mips/mips-shf-gprel.s
new file mode 100644
index 000000000000..9caaf00394a3
--- /dev/null
+++ b/test/CodeGen/Mips/mips-shf-gprel.s
@@ -0,0 +1,27 @@
+# Check that .sdata and .sbss sections have SHF_MIPS_GPREL flags
+# and proper section types.
+
+# RUN: llvm-mc -filetype=obj -triple=mips-unknown-linux %s -o - \
+# RUN: | llvm-readobj -s | FileCheck %s
+
+ .sdata
+ .word 0
+
+ .sbss
+ .zero 4
+
+# CHECK: Name: .sdata
+# CHECK-NEXT: Type: SHT_PROGBITS
+# CHECK-NEXT: Flags [ (0x10000003)
+# CHECK-NEXT: SHF_ALLOC
+# CHECK-NEXT: SHF_MIPS_GPREL
+# CHECK-NEXT: SHF_WRITE
+# CHECK-NEXT: ]
+
+# CHECK: Name: .sbss
+# CHECK-NEXT: Type: SHT_NOBITS
+# CHECK-NEXT: Flags [ (0x10000003)
+# CHECK-NEXT: SHF_ALLOC
+# CHECK-NEXT: SHF_MIPS_GPREL
+# CHECK-NEXT: SHF_WRITE
+# CHECK-NEXT: ]
diff --git a/test/CodeGen/Mips/mips16fpe.ll b/test/CodeGen/Mips/mips16fpe.ll
index 16695e45265a..b8f1d945f356 100644
--- a/test/CodeGen/Mips/mips16fpe.ll
+++ b/test/CodeGen/Mips/mips16fpe.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16hf
+; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic -O3 \
+; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=16hf
@x = global float 5.000000e+00, align 4
@y = global float 1.500000e+01, align 4
diff --git a/test/CodeGen/Mips/mips64-f128.ll b/test/CodeGen/Mips/mips64-f128.ll
index d9c93810438f..2b1c154f095b 100644
--- a/test/CodeGen/Mips/mips64-f128.ll
+++ b/test/CodeGen/Mips/mips64-f128.ll
@@ -1,11 +1,11 @@
; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips4 -mattr=+soft-float -O1 \
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefixes=ALL,C_CC_FMT,PRER6
; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64 -mattr=+soft-float -O1 \
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefixes=ALL,C_CC_FMT,PRER6
; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64r2 -mattr=+soft-float -O1 \
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefixes=ALL,C_CC_FMT,PRER6
; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64r6 -mattr=+soft-float -O1 \
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=CMP_CC_FMT
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefixes=ALL,CMP_CC_FMT,R6
@gld0 = external global fp128
@gld1 = external global fp128
@@ -544,10 +544,11 @@ entry:
}
; ALL-LABEL: load_LD_float:
-; ALL: ld $[[R0:[0-9]+]], %got_disp(gf1)
-; ALL: lw $4, 0($[[R0]])
-; ALL: ld $25, %call16(__extendsftf2)
-; ALL: jalr $25
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gf1)
+; ALL: lw $4, 0($[[R0]])
+; ALL: ld $25, %call16(__extendsftf2)
+; PRER6: jalr $25
+; R6: jalrc $25
define fp128 @load_LD_float() {
entry:
@@ -557,10 +558,11 @@ entry:
}
; ALL-LABEL: load_LD_double:
-; ALL: ld $[[R0:[0-9]+]], %got_disp(gd1)
-; ALL: ld $4, 0($[[R0]])
-; ALL: ld $25, %call16(__extenddftf2)
-; ALL: jalr $25
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gd1)
+; ALL: ld $4, 0($[[R0]])
+; ALL: ld $25, %call16(__extenddftf2)
+; PRER6: jalr $25
+; R6: jalrc $25
define fp128 @load_LD_double() {
entry:
@@ -585,13 +587,14 @@ entry:
}
; ALL-LABEL: store_LD_float:
-; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
-; ALL: ld $4, 0($[[R0]])
-; ALL: ld $5, 8($[[R0]])
-; ALL: ld $25, %call16(__trunctfsf2)
-; ALL: jalr $25
-; ALL: ld $[[R1:[0-9]+]], %got_disp(gf1)
-; ALL: sw $2, 0($[[R1]])
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
+; ALL: ld $4, 0($[[R0]])
+; ALL: ld $5, 8($[[R0]])
+; ALL: ld $25, %call16(__trunctfsf2)
+; PRER6: jalr $25
+; R6: jalrc $25
+; ALL: ld $[[R1:[0-9]+]], %got_disp(gf1)
+; ALL: sw $2, 0($[[R1]])
define void @store_LD_float() {
entry:
@@ -602,13 +605,14 @@ entry:
}
; ALL-LABEL: store_LD_double:
-; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
-; ALL: ld $4, 0($[[R0]])
-; ALL: ld $5, 8($[[R0]])
-; ALL: ld $25, %call16(__trunctfdf2)
-; ALL: jalr $25
-; ALL: ld $[[R1:[0-9]+]], %got_disp(gd1)
-; ALL: sd $2, 0($[[R1]])
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
+; ALL: ld $4, 0($[[R0]])
+; ALL: ld $5, 8($[[R0]])
+; ALL: ld $25, %call16(__trunctfdf2)
+; PRER6: jalr $25
+; R6: jalrc $25
+; ALL: ld $[[R1:[0-9]+]], %got_disp(gd1)
+; ALL: sd $2, 0($[[R1]])
define void @store_LD_double() {
entry:
@@ -648,7 +652,8 @@ entry:
; ALL: move $[[R2:[0-9]+]], $9
; ALL: move $[[R3:[0-9]+]], $8
; ALL: ld $25, %call16(__gttf2)($gp)
-; ALL: jalr $25
+; PRER6: jalr $25
+; R6: jalrc $25
; C_CC_FMT: slti $[[CC:[0-9]+]], $2, 1
; C_CC_FMT: movz $[[R1]], $[[R3]], $[[CC]]
diff --git a/test/CodeGen/Mips/mips64extins.ll b/test/CodeGen/Mips/mips64extins.ll
index 211cd5f8e7fd..7876266fb856 100644
--- a/test/CodeGen/Mips/mips64extins.ll
+++ b/test/CodeGen/Mips/mips64extins.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -target-abi=n64 | FileCheck %s
define i64 @dext(i64 %i) nounwind readnone {
entry:
+; CHECK-LABEL: dext:
; CHECK: dext ${{[0-9]+}}, ${{[0-9]+}}, 5, 10
%shr = lshr i64 %i, 5
%and = and i64 %shr, 1023
@@ -10,7 +11,8 @@ entry:
define i64 @dextm(i64 %i) nounwind readnone {
entry:
-; CHECK: dext ${{[0-9]+}}, ${{[0-9]+}}, 5, 34
+; CHECK-LABEL: dextm:
+; CHECK: dextm ${{[0-9]+}}, ${{[0-9]+}}, 5, 34
%shr = lshr i64 %i, 5
%and = and i64 %shr, 17179869183
ret i64 %and
@@ -18,7 +20,8 @@ entry:
define i64 @dextu(i64 %i) nounwind readnone {
entry:
-; CHECK: dext ${{[0-9]+}}, ${{[0-9]+}}, 34, 6
+; CHECK-LABEL: dextu:
+; CHECK: dextu ${{[0-9]+}}, ${{[0-9]+}}, 34, 6
%shr = lshr i64 %i, 34
%and = and i64 %shr, 63
ret i64 %and
@@ -26,6 +29,7 @@ entry:
define i64 @dins(i64 %i, i64 %j) nounwind readnone {
entry:
+; CHECK-LABEL: dins:
; CHECK: dins ${{[0-9]+}}, ${{[0-9]+}}, 8, 10
%shl2 = shl i64 %j, 8
%and = and i64 %shl2, 261888
@@ -36,6 +40,7 @@ entry:
define i64 @dinsm(i64 %i, i64 %j) nounwind readnone {
entry:
+; CHECK-LABEL: dinsm:
; CHECK: dins ${{[0-9]+}}, ${{[0-9]+}}, 10, 33
%shl4 = shl i64 %j, 10
%and = and i64 %shl4, 8796093021184
@@ -46,6 +51,7 @@ entry:
define i64 @dinsu(i64 %i, i64 %j) nounwind readnone {
entry:
+; CHECK-LABEL: dinsu:
; CHECK: dins ${{[0-9]+}}, ${{[0-9]+}}, 40, 13
%shl4 = shl i64 %j, 40
%and = and i64 %shl4, 9006099743113216
diff --git a/test/CodeGen/Mips/mips64fpldst.ll b/test/CodeGen/Mips/mips64fpldst.ll
index 55d5c775cbba..564ffdd2f691 100644
--- a/test/CodeGen/Mips/mips64fpldst.ll
+++ b/test/CodeGen/Mips/mips64fpldst.ll
@@ -1,7 +1,9 @@
-; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 | FileCheck %s -check-prefix=CHECK-N32
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64
@f0 = common global float 0.000000e+00, align 4
@d0 = common global double 0.000000e+00, align 8
@@ -12,10 +14,10 @@ define float @funcfl1() nounwind readonly {
entry:
; CHECK-N64: funcfl1
; CHECK-N64: ld $[[R0:[0-9]+]], %got_disp(f0)
-; CHECK-N64: lwc1 $f{{[0-9]+}}, 0($[[R0]])
+; CHECK-N64: lwc1 $f{{[0-9]+}}, 0($[[R0]])
; CHECK-N32: funcfl1
; CHECK-N32: lw $[[R0:[0-9]+]], %got_disp(f0)
-; CHECK-N32: lwc1 $f{{[0-9]+}}, 0($[[R0]])
+; CHECK-N32: lwc1 $f{{[0-9]+}}, 0($[[R0]])
%0 = load float, float* @f0, align 4
ret float %0
}
@@ -24,11 +26,11 @@ define double @funcfl2() nounwind readonly {
entry:
; CHECK-N64: funcfl2
; CHECK-N64: ld $[[R0:[0-9]+]], %got_disp(d0)
-; CHECK-N64: ldc1 $f{{[0-9]+}}, 0($[[R0]])
+; CHECK-N64: ldc1 $f{{[0-9]+}}, 0($[[R0]])
; CHECK-N32: funcfl2
; CHECK-N32: lw $[[R0:[0-9]+]], %got_disp(d0)
-; CHECK-N32: ldc1 $f{{[0-9]+}}, 0($[[R0]])
- %0 = load double, double* @d0, align 8
+; CHECK-N32: ldc1 $f{{[0-9]+}}, 0($[[R0]])
+ %0 = load double, double* @d0, align 8
ret double %0
}
@@ -36,12 +38,12 @@ define void @funcfs1() nounwind {
entry:
; CHECK-N64: funcfs1
; CHECK-N64: ld $[[R0:[0-9]+]], %got_disp(f0)
-; CHECK-N64: swc1 $f{{[0-9]+}}, 0($[[R0]])
+; CHECK-N64: swc1 $f{{[0-9]+}}, 0($[[R0]])
; CHECK-N32: funcfs1
; CHECK-N32: lw $[[R0:[0-9]+]], %got_disp(f0)
-; CHECK-N32: swc1 $f{{[0-9]+}}, 0($[[R0]])
- %0 = load float, float* @f1, align 4
- store float %0, float* @f0, align 4
+; CHECK-N32: swc1 $f{{[0-9]+}}, 0($[[R0]])
+ %0 = load float, float* @f1, align 4
+ store float %0, float* @f0, align 4
ret void
}
@@ -49,12 +51,12 @@ define void @funcfs2() nounwind {
entry:
; CHECK-N64: funcfs2
; CHECK-N64: ld $[[R0:[0-9]+]], %got_disp(d0)
-; CHECK-N64: sdc1 $f{{[0-9]+}}, 0($[[R0]])
+; CHECK-N64: sdc1 $f{{[0-9]+}}, 0($[[R0]])
; CHECK-N32: funcfs2
; CHECK-N32: lw $[[R0:[0-9]+]], %got_disp(d0)
-; CHECK-N32: sdc1 $f{{[0-9]+}}, 0($[[R0]])
- %0 = load double, double* @d1, align 8
- store double %0, double* @d0, align 8
+; CHECK-N32: sdc1 $f{{[0-9]+}}, 0($[[R0]])
+ %0 = load double, double* @d1, align 8
+ store double %0, double* @d0, align 8
ret void
}
diff --git a/test/CodeGen/Mips/mips64instrs.ll b/test/CodeGen/Mips/mips64instrs.ll
index d64cdceb6b81..8f124c89db4b 100644
--- a/test/CodeGen/Mips/mips64instrs.ll
+++ b/test/CodeGen/Mips/mips64instrs.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=mips64el -mcpu=mips4 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS4 -check-prefix=ACCMULDIV %s
-; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=ACCMULDIV %s
-; RUN: llc -march=mips64el -mcpu=mips64r2 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=ACCMULDIV %s
-; RUN: llc -march=mips64el -mcpu=mips64r6 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=GPRMULDIV %s
+; RUN: llc -march=mips64el -mcpu=mips4 -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,MIPS4,ACCMULDIV %s
+; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,HAS-DCLO,ACCMULDIV %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,HAS-DCLO,ACCMULDIV %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,HAS-DCLO,GPRMULDIV %s
@gll0 = common global i64 0, align 8
@gll1 = common global i64 0, align 8
diff --git a/test/CodeGen/Mips/mips64intldst.ll b/test/CodeGen/Mips/mips64intldst.ll
index 658ab88481c4..0abe192de117 100644
--- a/test/CodeGen/Mips/mips64intldst.ll
+++ b/test/CodeGen/Mips/mips64intldst.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 | FileCheck %s -check-prefix=CHECK-N32
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 | FileCheck %s -check-prefix=CHECK-N64
-; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32
@c = common global i8 0, align 4
@s = common global i16 0, align 4
diff --git a/test/CodeGen/Mips/mips64muldiv.ll b/test/CodeGen/Mips/mips64muldiv.ll
index 32d05a9da369..d1292be504e6 100644
--- a/test/CodeGen/Mips/mips64muldiv.ll
+++ b/test/CodeGen/Mips/mips64muldiv.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
-; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
-; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefixes=ALL,ACC
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefixes=ALL,ACC
+; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck %s -check-prefixes=ALL,ACC
+; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck %s -check-prefixes=ALL,GPR
; FileCheck prefixes:
; ALL - All targets
diff --git a/test/CodeGen/Mips/mips64r6/compatibility.ll b/test/CodeGen/Mips/mips64r6/compatibility.ll
index 429f68d784bb..174f4ce1771a 100644
--- a/test/CodeGen/Mips/mips64r6/compatibility.ll
+++ b/test/CodeGen/Mips/mips64r6/compatibility.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=mipsel -mcpu=mips64r6 < %s | FileCheck %s
-; RUN: not llc -march=mipsel -mcpu=mips64r6 -mattr=+dsp < %s 2>&1 | FileCheck --check-prefix=DSP %s
+; RUN: llc -march=mipsel -mcpu=mips64r6 -target-abi n64 < %s | FileCheck %s
+; RUN: not llc -march=mipsel -mcpu=mips64r6 -target-abi n64 -mattr=+dsp < %s 2>&1 | FileCheck --check-prefix=DSP %s
; CHECK: foo:
; DSP: MIPS64r6 is not compatible with the DSP ASE
diff --git a/test/CodeGen/Mips/mips64shift.ll b/test/CodeGen/Mips/mips64shift.ll
index 52c6f9066392..e93140f18c9b 100644
--- a/test/CodeGen/Mips/mips64shift.ll
+++ b/test/CodeGen/Mips/mips64shift.ll
@@ -1,64 +1,65 @@
-; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck -check-prefixes=ALL,MIPS %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 -mattr=micromips < %s | FileCheck -check-prefixes=ALL,MICROMIPS %s
define i64 @f0(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: dsllv
+; ALL: dsllv
%shl = shl i64 %a0, %a1
ret i64 %shl
}
define i64 @f1(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: dsrav
+; ALL: dsrav
%shr = ashr i64 %a0, %a1
ret i64 %shr
}
define i64 @f2(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: dsrlv
+; ALL: dsrlv
%shr = lshr i64 %a0, %a1
ret i64 %shr
}
define i64 @f3(i64 %a0) nounwind readnone {
entry:
-; CHECK: dsll ${{[0-9]+}}, ${{[0-9]+}}, 10
+; ALL: dsll ${{[0-9]+}}, ${{[0-9]+}}, 10
%shl = shl i64 %a0, 10
ret i64 %shl
}
define i64 @f4(i64 %a0) nounwind readnone {
entry:
-; CHECK: dsra ${{[0-9]+}}, ${{[0-9]+}}, 10
+; ALL: dsra ${{[0-9]+}}, ${{[0-9]+}}, 10
%shr = ashr i64 %a0, 10
ret i64 %shr
}
define i64 @f5(i64 %a0) nounwind readnone {
entry:
-; CHECK: dsrl ${{[0-9]+}}, ${{[0-9]+}}, 10
+; ALL: dsrl ${{[0-9]+}}, ${{[0-9]+}}, 10
%shr = lshr i64 %a0, 10
ret i64 %shr
}
define i64 @f6(i64 %a0) nounwind readnone {
entry:
-; CHECK: dsll ${{[0-9]+}}, ${{[0-9]+}}, 40
+; ALL: dsll ${{[0-9]+}}, ${{[0-9]+}}, 40
%shl = shl i64 %a0, 40
ret i64 %shl
}
define i64 @f7(i64 %a0) nounwind readnone {
entry:
-; CHECK: dsra ${{[0-9]+}}, ${{[0-9]+}}, 40
+; ALL: dsra ${{[0-9]+}}, ${{[0-9]+}}, 40
%shr = ashr i64 %a0, 40
ret i64 %shr
}
define i64 @f8(i64 %a0) nounwind readnone {
entry:
-; CHECK: dsrl ${{[0-9]+}}, ${{[0-9]+}}, 40
+; ALL: dsrl ${{[0-9]+}}, ${{[0-9]+}}, 40
%shr = lshr i64 %a0, 40
ret i64 %shr
}
@@ -66,7 +67,7 @@ entry:
define i64 @f9(i64 %a0, i64 %a1) nounwind readnone {
entry:
; CHECK-NOT: sll
-; CHECK: drotrv
+; ALL: drotrv
%shr = lshr i64 %a0, %a1
%sub = sub i64 64, %a1
%shl = shl i64 %a0, %sub
@@ -77,7 +78,7 @@ entry:
define i64 @f10(i64 %a0, i64 %a1) nounwind readnone {
entry:
; CHECK-NOT: sll
-; CHECK: drotrv
+; ALL: drotrv
%shl = shl i64 %a0, %a1
%sub = sub i64 64, %a1
%shr = lshr i64 %a0, %sub
@@ -87,7 +88,7 @@ entry:
define i64 @f11(i64 %a0) nounwind readnone {
entry:
-; CHECK: drotr ${{[0-9]+}}, ${{[0-9]+}}, 10
+; ALL: drotr ${{[0-9]+}}, ${{[0-9]+}}, 10
%shr = lshr i64 %a0, 10
%shl = shl i64 %a0, 54
%or = or i64 %shr, %shl
@@ -96,7 +97,7 @@ entry:
define i64 @f12(i64 %a0) nounwind readnone {
entry:
-; CHECK: drotr ${{[0-9]+}}, ${{[0-9]+}}, 54
+; ALL: drotr ${{[0-9]+}}, ${{[0-9]+}}, 54
%shl = shl i64 %a0, 10
%shr = lshr i64 %a0, 54
%or = or i64 %shl, %shr
diff --git a/test/CodeGen/Mips/mno-ldc1-sdc1.ll b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
index c7eda3320bc6..9663138d4c81 100644
--- a/test/CodeGen/Mips/mno-ldc1-sdc1.ll
+++ b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
@@ -1,52 +1,63 @@
; Check that [sl]dc1 are normally emitted. MIPS32r2 should have [sl]dxc1 too.
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1-LDC1
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2-LDXC1
-; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6-LDC1
+; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R1-LDC1
+; RUN: llc -march=mipsel -mcpu=mips32r2 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R2-LDXC1
+; RUN: llc -march=mipsel -mcpu=mips32r6 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,32R6-LDC1
+; RUN: llc -march=mipsel -mcpu=mips32r3 -mattr=+micromips \
+; RUN: -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MM
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=+micromips \
+; RUN: -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MM
; Check that -mno-ldc1-sdc1 disables [sl]dc1
; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
-; RUN: -check-prefix=32R1-LE -check-prefix=32R1-LE-PIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R1,32R1-LE,32R1-LE-PIC
; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32r2 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
-; RUN: -check-prefix=32R2-LE -check-prefix=32R2-LE-PIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R2,32R2-LE,32R2-LE-PIC
; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32r6 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
-; RUN: -check-prefix=32R6-LE -check-prefix=32R6-LE-PIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R6,32R6-LE,32R6-LE-PIC
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 -mcpu=mips32r3 \
+; RUN: -mattr=+micromips < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MM-MNO-PIC,MM-MNO-LE-PIC
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 -mcpu=mips32r6 \
+; RUN: -mattr=+micromips < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MM-MNO-PIC,MM-MNO-LE-PIC
; Check again for big-endian
; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
-; RUN: -check-prefix=32R1-BE -check-prefix=32R1-BE-PIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R1,32R1-BE,32R1-BE-PIC
; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32r2 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
-; RUN: -check-prefix=32R2-BE -check-prefix=32R2-BE-PIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R2,32R2-BE,32R2-BE-PIC
; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32r6 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
-; RUN: -check-prefix=32R6-BE -check-prefix=32R6-BE-PIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R6,32R6-BE,32R6-BE-PIC
+; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 -mcpu=mips32r3 \
+; RUN: -mattr=+micromips < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MM-MNO-PIC,MM-MNO-BE-PIC
+; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 -mcpu=mips32r6 \
+; RUN: -mattr=+micromips < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,MM-MNO-PIC,MM-MNO-BE-PIC
; Check again for the static relocation model
; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
-; RUN: -check-prefix=32R1-LE -check-prefix=32R1-LE-STATIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R1,32R1-LE,32R1-LE-STATIC
; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32r2 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
-; RUN: -check-prefix=32R2-LE -check-prefix=32R2-LE-STATIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R2,32R2-LE,32R2-LE-STATIC
; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
; RUN: -mcpu=mips32r6 < %s | \
-; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
-; RUN: -check-prefix=32R6-LE -check-prefix=32R6-LE-STATIC
+; RUN: FileCheck %s -check-prefixes=ALL,32R6,32R6-LE,32R6-LE-STATIC
+; RUN: llc -march=mipsel -relocation-model=static -mcpu=mips32r3 \
+; RUN: -mattr=+micromips < %s | FileCheck %s -check-prefixes=ALL,MM-STATIC-PIC
+; RUN: llc -march=mipsel -relocation-model=static -mcpu=mips32r6 \
+; RUN: -mattr=+micromips < %s | FileCheck %s -check-prefixes=ALL,MM-STATIC-PIC
@g0 = common global double 0.000000e+00, align 8
@@ -109,6 +120,26 @@
; 32R6-LDC1: ldc1 $f0, 0(${{[0-9]+}})
+; MM: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; MM: addiu $[[R1:[0-9]+]], $[[R0]], %lo(_gp_disp)
+; MM: addu $[[R2:[0-9]+]], $[[R1]], $25
+; MM: lw $[[R3:[0-9]+]], %got(g0)($[[R2]])
+; MM: ldc1 $f0, 0($[[R3]])
+
+; MM-MNO-PIC: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; MM-MNO-PIC: addiu $[[R1:[0-9]+]], $[[R0]], %lo(_gp_disp)
+; MM-MNO-PIC: addu $[[R2:[0-9]+]], $[[R1]], $25
+; MM-MNO-PIC: lw $[[R3:[0-9]+]], %got(g0)($[[R2]])
+; MM-MNO-PIC: lw16 $[[R4:[0-9]+]], 0($[[R3]])
+; MM-MNO-PIC: lw16 $[[R5:[0-9]+]], 4($[[R3]])
+; MM-MNO-LE-PIC: mtc1 $[[R4]], $f0
+; MM-MNO-LE-PIC: mthc1 $[[R5]], $f0
+; MM-MNO-BE-PIC: mtc1 $[[R5]], $f0
+; MM-MNO-BE-PIC: mthc1 $[[R4]], $f0
+
+; MM-STATIC-PIC: lui $[[R0:[0-9]+]], %hi(g0)
+; MM-STATIC-PIC: ldc1 $f0, %lo(g0)($[[R0]])
+
define double @test_ldc1() {
entry:
%0 = load double, double* @g0, align 8
@@ -174,6 +205,26 @@ entry:
; 32R6-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
+; MM: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; MM: addiu $[[R1:[0-9]+]], $[[R0]], %lo(_gp_disp)
+; MM: addu $[[R2:[0-9]+]], $[[R1]], $25
+; MM: lw $[[R3:[0-9]+]], %got(g0)($[[R2]])
+; MM: sdc1 $f12, 0($[[R3]])
+
+; MM-MNO-PIC: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; MM-MNO-PIC: addiu $[[R1:[0-9]+]], $[[R0]], %lo(_gp_disp)
+; MM-MNO-PIC: addu $[[R2:[0-9]+]], $[[R1]], $25
+; MM-MNO-LE-PIC: mfc1 $[[R3:[0-9]+]], $f12
+; MM-MNO-BE-PIC: mfhc1 $[[R3:[0-9]+]], $f12
+; MM-MNO-PIC: lw $[[R4:[0-9]+]], %got(g0)($[[R2]])
+; MM-MNO-PIC: sw16 $[[R3]], 0($[[R4]])
+; MM-MNO-LE-PIC: mfhc1 $[[R5:[0-9]+]], $f12
+; MM-MNO-BE-PIC: mfc1 $[[R5:[0-9]+]], $f12
+; MM-MNO-PIC: sw16 $[[R5]], 4($[[R4]])
+
+; MM-STATIC-PIC: lui $[[R0:[0-9]+]], %hi(g0)
+; MM-STATIC-PIC: sdc1 $f12, %lo(g0)($[[R0]])
+
define void @test_sdc1(double %a) {
entry:
store double %a, double* @g0, align 8
@@ -210,6 +261,23 @@ entry:
; 32R6-LDC1: ldc1 $f0, 0(${{[0-9]+}})
+; MM: sll16 $[[R0:[0-9]+]], $5, 3
+; MM: addu16 $[[R1:[0-9]+]], $4, $[[R0]]
+; MM: ldc1 $f0, 0($[[R1]])
+
+; MM-MNO-PIC: sll16 $[[R0:[0-9]+]], $5, 3
+; MM-MNO-PIC: addu16 $[[R1:[0-9]+]], $4, $[[R0]]
+; MM-MNO-PIC: lw16 $[[R2:[0-9]+]], 0($[[R1]])
+; MM-MNO-PIC: lw16 $[[R3:[0-9]+]], 4($[[R1]])
+; MM-MNO-LE-PIC: mtc1 $[[R2]], $f0
+; MM-MNO-LE-PIC: mthc1 $[[R3]], $f0
+; MM-MNO-BE-PIC: mtc1 $[[R3]], $f0
+; MM-MNO-BE-PIC: mthc1 $[[R2]], $f0
+
+; MM-STATIC-PIC: sll16 $[[R0:[0-9]+]], $5, 3
+; MM-STATIC-PIC: addu16 $[[R1:[0-9]+]], $4, $[[R0]]
+; MM-STATIC-PIC: ldc1 $f0, 0($[[R1]])
+
define double @test_ldxc1(double* nocapture readonly %a, i32 %i) {
entry:
%arrayidx = getelementptr inbounds double, double* %a, i32 %i
@@ -241,6 +309,23 @@ entry:
; 32R6-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
+; MM: sll16 $[[R0:[0-9]+]], $7, 3
+; MM: addu16 $[[R1:[0-9]+]], $6, $[[R0]]
+; MM: sdc1 $f12, 0($[[R1]])
+
+; MM-MNO-PIC: sll16 $[[R0:[0-9]+]], $7, 3
+; MM-MNO-PIC: addu16 $[[R1:[0-9]+]], $6, $[[R0]]
+; MM-MNO-LE-PIC: mfc1 $[[R2:[0-9]+]], $f12
+; MM-MNO-BE-PIC: mfhc1 $[[R2:[0-9]+]], $f12
+; MM-MNO-PIC: sw16 $[[R2]], 0($[[R1]])
+; MM-MNO-LE-PIC: mfhc1 $[[R3:[0-9]+]], $f12
+; MM-MNO-BE-PIC: mfc1 $[[R3:[0-9]+]], $f12
+; MM-MNO-PIC: sw16 $[[R3]], 4($[[R1]])
+
+; MM-STATIC-PIC: sll16 $[[R0:[0-9]+]], $7, 3
+; MM-STATIC-PIC: addu16 $[[R1:[0-9]+]], $6, $[[R0]]
+; MM-STATIC-PIC: sdc1 $f12, 0($[[R1]])
+
define void @test_sdxc1(double %b, double* nocapture %a, i32 %i) {
entry:
%arrayidx = getelementptr inbounds double, double* %a, i32 %i
diff --git a/test/CodeGen/Mips/msa/2r.ll b/test/CodeGen/Mips/msa/2r.ll
index 501936c76e73..0abd73f180a0 100644
--- a/test/CodeGen/Mips/msa/2r.ll
+++ b/test/CodeGen/Mips/msa/2r.ll
@@ -1,7 +1,7 @@
; Test the MSA intrinsics that are encoded with the 2R instruction format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
@llvm_mips_nloc_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_nloc_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
diff --git a/test/CodeGen/Mips/msa/2r_vector_scalar.ll b/test/CodeGen/Mips/msa/2r_vector_scalar.ll
index ddcd3cf757d9..db0eb5ad5a43 100644
--- a/test/CodeGen/Mips/msa/2r_vector_scalar.ll
+++ b/test/CodeGen/Mips/msa/2r_vector_scalar.ll
@@ -1,14 +1,14 @@
; Test the MSA intrinsics that are encoded with the 2R instruction format and
; convert scalars to vectors.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
-; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS32
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS32
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS64
@llvm_mips_fill_b_ARG1 = global i32 23, align 16
@llvm_mips_fill_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
@@ -97,4 +97,4 @@ declare <2 x i64> @llvm.mips.fill.d(i64) nounwind
; MIPS64-DAG: ld [[RD:\$[0-9]+]], %got_disp(llvm_mips_fill_d_RES)
; MIPS64-DAG: st.d [[WD]], 0([[RD]])
; MIPS-ANY: .size llvm_mips_fill_d_test
-; \ No newline at end of file
+;
diff --git a/test/CodeGen/Mips/msa/2rf.ll b/test/CodeGen/Mips/msa/2rf.ll
index 1dbfbda1b612..885e2c64bcfc 100644
--- a/test/CodeGen/Mips/msa/2rf.ll
+++ b/test/CodeGen/Mips/msa/2rf.ll
@@ -1,7 +1,7 @@
; Test the MSA intrinsics that are encoded with the 2RF instruction format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
@llvm_mips_flog2_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
@llvm_mips_flog2_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
diff --git a/test/CodeGen/Mips/msa/2rf_float_int.ll b/test/CodeGen/Mips/msa/2rf_float_int.ll
index 369015814b0e..44861caf4650 100644
--- a/test/CodeGen/Mips/msa/2rf_float_int.ll
+++ b/test/CodeGen/Mips/msa/2rf_float_int.ll
@@ -1,8 +1,8 @@
; Test the MSA integer to floating point conversion intrinsics that are encoded
; with the 2RF instruction format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
@llvm_mips_ffint_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@llvm_mips_ffint_s_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
diff --git a/test/CodeGen/Mips/msa/2rf_int_float.ll b/test/CodeGen/Mips/msa/2rf_int_float.ll
index 77d1404f9cfa..f845f3ddd740 100644
--- a/test/CodeGen/Mips/msa/2rf_int_float.ll
+++ b/test/CodeGen/Mips/msa/2rf_int_float.ll
@@ -2,8 +2,8 @@
; 2RF instruction format. This includes conversions but other instructions such
; as fclass are also here.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
@llvm_mips_fclass_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
@llvm_mips_fclass_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
diff --git a/test/CodeGen/Mips/msa/3r-a.ll b/test/CodeGen/Mips/msa/3r-a.ll
index db772f918614..8feb928a8add 100644
--- a/test/CodeGen/Mips/msa/3r-a.ll
+++ b/test/CodeGen/Mips/msa/3r-a.ll
@@ -1,8 +1,8 @@
; Test the MSA intrinsics that are encoded with the 3R instruction format.
; There are lots of these so this covers those beginning with 'a'
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
; It should fail to compile without fp64.
; RUN: not llc -march=mips -mattr=+msa < %s 2>&1 | \
diff --git a/test/CodeGen/Mips/msa/3r-b.ll b/test/CodeGen/Mips/msa/3r-b.ll
index 2ecdc4290067..1c4f14814a2a 100644
--- a/test/CodeGen/Mips/msa/3r-b.ll
+++ b/test/CodeGen/Mips/msa/3r-b.ll
@@ -1,8 +1,8 @@
; Test the MSA intrinsics that are encoded with the 3R instruction format.
; There are lots of these so this covers those beginning with 'b'
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
@llvm_mips_bclr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_bclr_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
diff --git a/test/CodeGen/Mips/msa/3r-s.ll b/test/CodeGen/Mips/msa/3r-s.ll
index d04c5ff165f2..fe4365110366 100644
--- a/test/CodeGen/Mips/msa/3r-s.ll
+++ b/test/CodeGen/Mips/msa/3r-s.ll
@@ -1,8 +1,8 @@
; Test the MSA intrinsics that are encoded with the 3R instruction format.
; There are lots of these so this covers those beginning with 's'
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
@llvm_mips_sld_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_sld_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
diff --git a/test/CodeGen/Mips/msa/3r_splat.ll b/test/CodeGen/Mips/msa/3r_splat.ll
index 56d26b030de9..78c485f4f868 100644
--- a/test/CodeGen/Mips/msa/3r_splat.ll
+++ b/test/CodeGen/Mips/msa/3r_splat.ll
@@ -1,9 +1,9 @@
; Test the MSA splat intrinsics that are encoded with the 3R instruction
; format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | \
; RUN: FileCheck -check-prefix=MIPS32 %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | \
; RUN: FileCheck -check-prefix=MIPS32 %s
@llvm_mips_splat_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
diff --git a/test/CodeGen/Mips/msa/basic_operations.ll b/test/CodeGen/Mips/msa/basic_operations.ll
index 2efec2911935..5d253d7af253 100644
--- a/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/test/CodeGen/Mips/msa/basic_operations.ll
@@ -1,9 +1,21 @@
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=O32 -check-prefix=MIPS32 -check-prefix=ALL-BE %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=O32 -check-prefix=MIPS32 -check-prefix=ALL-LE %s
-; RUN: llc -march=mips64 -target-abi n32 -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=N32 -check-prefix=MIPS64 -check-prefix=ALL-BE %s
-; RUN: llc -march=mips64el -target-abi n32 -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=N32 -check-prefix=MIPS64 -check-prefix=ALL-LE %s
-; RUN: llc -march=mips64 -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=N64 -check-prefix=MIPS64 -check-prefix=ALL-BE %s
-; RUN: llc -march=mips64el -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=N64 -check-prefix=MIPS64 -check-prefix=ALL-LE %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic \
+; RUN: -verify-machineinstrs < %s | \
+; RUN: FileCheck -check-prefixes=ALL,O32,MIPS32,ALL-BE %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic \
+; RUN: -verify-machineinstrs < %s | \
+; RUN: FileCheck -check-prefixes=ALL,O32,MIPS32,ALL-LE %s
+; RUN: llc -march=mips64 -target-abi n32 -mattr=+msa,+fp64 \
+; RUN: -relocation-model=pic -verify-machineinstrs < %s | \
+; RUN: FileCheck -check-prefixes=ALL,N32,MIPS64,ALL-BE %s
+; RUN: llc -march=mips64el -target-abi n32 -mattr=+msa,+fp64 \
+; RUN: -relocation-model=pic -verify-machineinstrs < %s | \
+; RUN: FileCheck -check-prefixes=ALL,N32,MIPS64,ALL-LE %s
+; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic \
+; RUN: -verify-machineinstrs < %s | \
+; RUN: FileCheck -check-prefixes=ALL,N64,MIPS64,ALL-BE %s
+; RUN: llc -march=mips64el -mattr=+msa,+fp64 -relocation-model=pic \
+; RUN: -verify-machineinstrs < %s | \
+; RUN: FileCheck -check-prefixes=ALL,N64,MIPS64,ALL-LE %s
@v4i8 = global <4 x i8> <i8 0, i8 0, i8 0, i8 0>
@v16i8 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll
index f19cb9b7c2e5..d714b3eec1f2 100644
--- a/test/CodeGen/Mips/msa/basic_operations_float.ll
+++ b/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=O32 %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=O32 %s
-; RUN: llc -march=mips64 -target-abi=n32 -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=N32 %s
-; RUN: llc -march=mips64el -target-abi=n32 -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=N32 %s
-; RUN: llc -march=mips64 -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=N64 %s
-; RUN: llc -march=mips64el -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=N64 %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,O32 %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,O32 %s
+; RUN: llc -march=mips64 -target-abi=n32 -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64el -target-abi=n32 -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,N32 %s
+; RUN: llc -march=mips64 -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,N64 %s
+; RUN: llc -march=mips64el -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,N64 %s
@v4f32 = global <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
@v2f64 = global <2 x double> <double 0.0, double 0.0>
diff --git a/test/CodeGen/Mips/msa/elm_copy.ll b/test/CodeGen/Mips/msa/elm_copy.ll
index 251b535fd76c..8befecbdd65a 100644
--- a/test/CodeGen/Mips/msa/elm_copy.ll
+++ b/test/CodeGen/Mips/msa/elm_copy.ll
@@ -1,14 +1,14 @@
; Test the MSA intrinsics that are encoded with the ELM instruction format and
; are element extraction operations.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
-; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS32
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS32
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS64
@llvm_mips_copy_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_copy_s_b_RES = global i32 0, align 16
diff --git a/test/CodeGen/Mips/msa/elm_cxcmsa.ll b/test/CodeGen/Mips/msa/elm_cxcmsa.ll
index 8d6b0ee20ab8..b96499c15235 100644
--- a/test/CodeGen/Mips/msa/elm_cxcmsa.ll
+++ b/test/CodeGen/Mips/msa/elm_cxcmsa.ll
@@ -1,8 +1,8 @@
; Test the MSA ctcmsa and cfcmsa intrinsics (which are encoded with the ELM
; instruction format).
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -verify-machineinstrs < %s | FileCheck %s
define i32 @msa_ir_cfcmsa_test() nounwind {
entry:
diff --git a/test/CodeGen/Mips/msa/elm_insv.ll b/test/CodeGen/Mips/msa/elm_insv.ll
index 46e6289189df..6458b6dc23db 100644
--- a/test/CodeGen/Mips/msa/elm_insv.ll
+++ b/test/CodeGen/Mips/msa/elm_insv.ll
@@ -1,14 +1,14 @@
; Test the MSA element insertion intrinsics that are encoded with the ELM
; instruction format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS32
-; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
-; RUN: FileCheck %s -check-prefix=MIPS-ANY -check-prefix=MIPS64
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS32
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS32
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS64
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+msa,+fp64 -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=MIPS-ANY,MIPS64
@llvm_mips_insert_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_insert_b_ARG3 = global i32 27, align 16
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
index afd28ae184dd..f903381f9ef0 100644
--- a/test/CodeGen/Mips/msa/frameindex.ll
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-BE %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-LE %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefixes=MIPS32-AE,MIPS32-BE %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefixes=MIPS32-AE,MIPS32-LE %s
define void @loadstore_v16i8_near() nounwind {
; MIPS32-AE: loadstore_v16i8_near:
diff --git a/test/CodeGen/Mips/msa/i5-b.ll b/test/CodeGen/Mips/msa/i5-b.ll
index 40ab095f6809..c588c8b2407e 100644
--- a/test/CodeGen/Mips/msa/i5-b.ll
+++ b/test/CodeGen/Mips/msa/i5-b.ll
@@ -1,8 +1,8 @@
; Test the MSA intrinsics that are encoded with the I5 instruction format.
; There are lots of these so this covers those beginning with 'b'
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
@llvm_mips_bclri_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_bclri_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
diff --git a/test/CodeGen/Mips/msa/i8.ll b/test/CodeGen/Mips/msa/i8.ll
index 4af9c588fdef..5f8b4663c387 100644
--- a/test/CodeGen/Mips/msa/i8.ll
+++ b/test/CodeGen/Mips/msa/i8.ll
@@ -1,7 +1,7 @@
; Test the MSA intrinsics that are encoded with the I8 instruction format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
@llvm_mips_andi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_andi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
diff --git a/test/CodeGen/Mips/msa/vec.ll b/test/CodeGen/Mips/msa/vec.ll
index 8790923ce727..f206975990af 100644
--- a/test/CodeGen/Mips/msa/vec.ll
+++ b/test/CodeGen/Mips/msa/vec.ll
@@ -1,7 +1,7 @@
; Test the MSA intrinsics that are encoded with the VEC instruction format.
-; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ANYENDIAN %s
-; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ANYENDIAN %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefix=ANYENDIAN %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck -check-prefix=ANYENDIAN %s
@llvm_mips_and_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
@llvm_mips_and_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
diff --git a/test/CodeGen/Mips/nacl-align.ll b/test/CodeGen/Mips/nacl-align.ll
index 8191c7dec6f2..74b6286648dd 100644
--- a/test/CodeGen/Mips/nacl-align.ll
+++ b/test/CodeGen/Mips/nacl-align.ll
@@ -7,8 +7,8 @@
define void @test0() {
ret void
-; CHECK: .align 4
-; CHECK-NOT: .align
+; CHECK: .p2align 4
+; CHECK-NOT: .p2align
; CHECK-LABEL: test0:
}
@@ -40,18 +40,18 @@ default:
; CHECK-LABEL: test1:
-; CHECK: .align 4
+; CHECK: .p2align 4
; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
; CHECK-NEXT: jr $ra
; CHECK-NEXT: addiu $2, $zero, 111
; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
; CHECK-NEXT: jr $ra
; CHECK-NEXT: addiu $2, $zero, 555
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
; CHECK-NEXT: jr $ra
; CHECK-NEXT: addiu $2, $zero, 222
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
; CHECK-NEXT: jr $ra
; CHECK-NEXT: addiu $2, $zero, 333
@@ -81,12 +81,12 @@ bb2:
; Note that there are two consecutive labels - one temporary and one for
; the basic block.
-; CHECK: .align 4
+; CHECK: .p2align 4
; CHECK-NEXT: ${{[a-zA-Z0-9]+}}:
; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
; CHECK-NEXT: jr $ra
; CHECK-NEXT: addiu $2, $zero, 111
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: ${{[a-zA-Z0-9]+}}:
; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}:
; CHECK-NEXT: jr $ra
diff --git a/test/CodeGen/Mips/no-odd-spreg-msa.ll b/test/CodeGen/Mips/no-odd-spreg-msa.ll
index 7213044a2300..603a99e267f6 100644
--- a/test/CodeGen/Mips/no-odd-spreg-msa.ll
+++ b/test/CodeGen/Mips/no-odd-spreg-msa.ll
@@ -1,9 +1,9 @@
; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+fp64,+msa,-nooddspreg \
-; RUN: -no-integrated-as < %s | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=ODDSPREG
+; RUN: -no-integrated-as -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,ODDSPREG
; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+fp64,+msa,+nooddspreg \
-; RUN: -no-integrated-as < %s | FileCheck %s -check-prefix=ALL \
-; RUN: -check-prefix=NOODDSPREG
+; RUN: -no-integrated-as -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=ALL,NOODDSPREG
@v4f32 = global <4 x float> zeroinitializer
diff --git a/test/CodeGen/Mips/no-odd-spreg.ll b/test/CodeGen/Mips/no-odd-spreg.ll
index 572e940bc467..411441a7bd94 100644
--- a/test/CodeGen/Mips/no-odd-spreg.ll
+++ b/test/CodeGen/Mips/no-odd-spreg.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-NO-EMIT
-; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
-; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-NO-EMIT
-; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64,+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
-; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fpxx,-nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-EMIT
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefixes=ALL,ODDSPREG,ODDSPREG-NO-EMIT
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+nooddspreg < %s | FileCheck %s -check-prefixes=ALL,NOODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64 < %s | FileCheck %s -check-prefixes=ALL,ODDSPREG,ODDSPREG-NO-EMIT
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64,+nooddspreg < %s | FileCheck %s -check-prefixes=ALL,NOODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fpxx,-nooddspreg < %s | FileCheck %s -check-prefixes=ALL,ODDSPREG,ODDSPREG-EMIT
; We don't emit a directive unless we need to. This is to support versions of
; GAS which do not support the directive.
diff --git a/test/CodeGen/Mips/o32_cc.ll b/test/CodeGen/Mips/o32_cc.ll
index c28f9abcadcd..cef1290a75bd 100644
--- a/test/CodeGen/Mips/o32_cc.ll
+++ b/test/CodeGen/Mips/o32_cc.ll
@@ -1,8 +1,7 @@
; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=ALL %s
-; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck -check-prefix=ALL %s
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=NO-MFHC1 %s
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-MFHC1 %s
-; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-MFHC1 %s
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefixes=ALL,NO-MFHC1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefixes=ALL,HAS-MFHC1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck -check-prefixes=ALL,HAS-MFHC1 %s
; $f12, $f14
; ALL-LABEL: testlowercall0:
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index 108c663ab1cd..33431dba43c4 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
%0 = type { i8, i16, i32, i64, double, i32, [4 x i8] }
%struct.S1 = type { i8, i16, i32, i64, double, i32 }
diff --git a/test/CodeGen/Mips/octeon.ll b/test/CodeGen/Mips/octeon.ll
index 499ce3c1ddbf..b441274cec6b 100644
--- a/test/CodeGen/Mips/octeon.ll
+++ b/test/CodeGen/Mips/octeon.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O1 < %s -march=mips64 -mcpu=octeon | FileCheck %s -check-prefix=ALL -check-prefix=OCTEON
-; RUN: llc -O1 < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64
+; RUN: llc -O1 < %s -march=mips64 -mcpu=octeon | FileCheck %s -check-prefixes=ALL,OCTEON
+; RUN: llc -O1 < %s -march=mips64 -mcpu=mips64 | FileCheck %s -check-prefixes=ALL,MIPS64
define i64 @addi64(i64 %a, i64 %b) nounwind {
entry:
diff --git a/test/CodeGen/Mips/octeon_popcnt.ll b/test/CodeGen/Mips/octeon_popcnt.ll
index 3432b3992984..13488ed59504 100644
--- a/test/CodeGen/Mips/octeon_popcnt.ll
+++ b/test/CodeGen/Mips/octeon_popcnt.ll
@@ -21,7 +21,7 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; MIPS64-NOT: pop
}
-define i32 @cnt32(i32 zeroext %x) nounwind readnone {
+define i32 @cnt32(i32 signext %x) nounwind readnone {
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
ret i32 %cnt
; OCTEON-LABEL: cnt32:
diff --git a/test/CodeGen/Mips/optimize-pic-o0.ll b/test/CodeGen/Mips/optimize-pic-o0.ll
index 454bc851484d..8790b8e92b74 100644
--- a/test/CodeGen/Mips/optimize-pic-o0.ll
+++ b/test/CodeGen/Mips/optimize-pic-o0.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel -O0 -relocation-model=pic < %s | FileCheck %s
; Function Attrs: nounwind
define i32 @main() {
diff --git a/test/CodeGen/Mips/prevent-hoisting.ll b/test/CodeGen/Mips/prevent-hoisting.ll
index 81b14d7441b3..200848ac5485 100644
--- a/test/CodeGen/Mips/prevent-hoisting.ll
+++ b/test/CodeGen/Mips/prevent-hoisting.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -O3 < %s | FileCheck %s
+; RUN: llc -march=mipsel -O3 -relocation-model=pic < %s | FileCheck %s
; MIPS direct branches implicitly define register $at. This test makes sure that
@@ -11,12 +11,12 @@
; CHECK-LABEL: readLumaCoeff8x8_CABAC
; The check for the first "addiu" instruction is added so that we can match the correct "b" instruction.
-; CHECK: addiu ${{[0-9]+}}, $zero, -1
+; CHECK: andi
; CHECK: b $[[BB0:BB[0-9_]+]]
-; CHECK-NEXT: addiu ${{[0-9]+}}, $zero, 0
+; CHECK-NEXT: sll
; Check that at the start of a fallthrough block there is an instruction that writes to $1.
-; CHECK-NEXT: {{BB[0-9_#]+}}:
+; CHECK-NEXT: {{BB[0-9_#]+}}:
; CHECK-NEXT: lw $[[R1:[0-9]+]], %got(assignSE2partition)($[[R2:[0-9]+]])
; CHECK-NEXT: sll $1, $[[R0:[0-9]+]], 4
diff --git a/test/CodeGen/Mips/private-addr.ll b/test/CodeGen/Mips/private-addr.ll
new file mode 100644
index 000000000000..37dd6fe53c40
--- /dev/null
+++ b/test/CodeGen/Mips/private-addr.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=mips-pc-linux -relocation-model=pic < %s | FileCheck %s
+
+define private void @bar() {
+ ret void
+}
+
+define void()* @foo() {
+; CHECK: foo:
+; CHECK: lw $[[REG:.*]], %got($bar)($1)
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: addiu $2, $[[REG]], %lo($bar)
+
+ ret void()* @bar
+}
diff --git a/test/CodeGen/Mips/private.ll b/test/CodeGen/Mips/private.ll
index 5907dbd644ae..07affbf30c38 100644
--- a/test/CodeGen/Mips/private.ll
+++ b/test/CodeGen/Mips/private.ll
@@ -1,6 +1,6 @@
; Test to make sure that the 'private' linkage is used correctly.
;
-; RUN: llc -march=mips < %s | FileCheck %s
+; RUN: llc -march=mips -relocation-model=pic < %s | FileCheck %s
define private void @foo() {
; CHECK-LABEL: foo:
diff --git a/test/CodeGen/Mips/return-vector.ll b/test/CodeGen/Mips/return-vector.ll
index 3870fe092156..08eddf370096 100644
--- a/test/CodeGen/Mips/return-vector.ll
+++ b/test/CodeGen/Mips/return-vector.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
; Check that function accesses vector return value from stack in cases when
diff --git a/test/CodeGen/Mips/rotate.ll b/test/CodeGen/Mips/rotate.ll
index 70eff6e224d0..77936b7bef9b 100644
--- a/test/CodeGen/Mips/rotate.ll
+++ b/test/CodeGen/Mips/rotate.ll
@@ -1,8 +1,15 @@
; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s
; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 < %s | FileCheck %s -check-prefix=mips16
+; RUN: llc -march=mips -mcpu=mips32r3 -mattr=+micromips < %s | FileCheck %s \
+; RUN: -check-prefix=MM32
+; RUN: llc -march=mips -mcpu=mips32r6 -mattr=+micromips < %s | FileCheck %s \
+; RUN: -check-prefix=MM32
; CHECK: rotrv $2, $4
; mips16: .ent rot0
+; MM32: li16 $2, 32
+; MM32: subu16 $2, $2, $5
+; MM32: rotrv $2, $4, $2
define i32 @rot0(i32 %a, i32 %b) nounwind readnone {
entry:
%shl = shl i32 %a, %b
@@ -14,6 +21,7 @@ entry:
; CHECK: rotr $2, $4, 22
; mips16: .ent rot1
+; MM32: rotr $2, $4, 22
define i32 @rot1(i32 %a) nounwind readnone {
entry:
%shl = shl i32 %a, 10
@@ -24,6 +32,7 @@ entry:
; CHECK: rotrv $2, $4, $5
; mips16: .ent rot2
+; MM32: rotrv $2, $4, $5
define i32 @rot2(i32 %a, i32 %b) nounwind readnone {
entry:
%shr = lshr i32 %a, %b
@@ -35,6 +44,7 @@ entry:
; CHECK: rotr $2, $4, 10
; mips16: .ent rot3
+; MM32: rotr $2, $4, 10
define i32 @rot3(i32 %a) nounwind readnone {
entry:
%shr = lshr i32 %a, 10
diff --git a/test/CodeGen/Mips/select.ll b/test/CodeGen/Mips/select.ll
index 96bd3782c058..0ef8f36333f2 100644
--- a/test/CodeGen/Mips/select.ll
+++ b/test/CodeGen/Mips/select.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32R2
-; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32R6
-; RUN: llc < %s -march=mips64el -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=64
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=64R2
-; RUN: llc < %s -march=mips64el -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=64R6
+; RUN: llc < %s -march=mipsel -mcpu=mips32 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,32
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,32R2
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,32R6
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,64
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,64R2
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,64R6
@d2 = external global double
@d3 = external global double
diff --git a/test/CodeGen/Mips/selectcc.ll b/test/CodeGen/Mips/selectcc.ll
index 9790a0a3e411..865e4b38acad 100644
--- a/test/CodeGen/Mips/selectcc.ll
+++ b/test/CodeGen/Mips/selectcc.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s
-; RUN: llc -march=mipsel -mcpu=mips32 -pre-RA-sched=source < %s | FileCheck %s --check-prefix=SOURCE-SCHED
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s
-; RUN: llc -march=mipsel -mcpu=mips32r2 -pre-RA-sched=source < %s | FileCheck %s --check-prefix=SOURCE-SCHED
+; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=pic < %s
+; RUN: llc -march=mipsel -mcpu=mips32 -pre-RA-sched=source -relocation-model=pic < %s | FileCheck %s --check-prefix=SOURCE-SCHED
+; RUN: llc -march=mipsel -mcpu=mips32r2 -relocation-model=pic < %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 -pre-RA-sched=source -relocation-model=pic < %s | FileCheck %s --check-prefix=SOURCE-SCHED
@gf0 = external global float
@gf1 = external global float
diff --git a/test/CodeGen/Mips/selectiondag-optlevel.ll b/test/CodeGen/Mips/selectiondag-optlevel.ll
new file mode 100644
index 000000000000..999361131745
--- /dev/null
+++ b/test/CodeGen/Mips/selectiondag-optlevel.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=mips -fast-isel=false -O0 < %s 2>&1 | FileCheck %s -check-prefix=O0
+; RUN: llc -march=mips -fast-isel=false -O2 < %s 2>&1 | FileCheck %s -check-prefix=O2
+
+; At -O0, DAGCombine won't try to merge these consecutive loads but it will at
+; -O2.
+
+define void @foo() nounwind {
+entry:
+ %0 = alloca [2 x i8], align 32
+ %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
+ store i8 1, i8* %1
+ %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
+ store i8 1, i8* %2
+ ret void
+}
+
+; O0: addiu $[[REG:[0-9]+]], $zero, 1
+; O0-DAG: sb $[[REG]], 0($sp)
+; O0-DAG: sb $[[REG]], 1($sp)
+
+; O2: addiu $[[REG:[0-9]+]], $zero, 257
+; O2: sh $[[REG]], 0($sp)
diff --git a/test/CodeGen/Mips/stackcoloring.ll b/test/CodeGen/Mips/stackcoloring.ll
index 5516b5a3c023..817caee2f275 100644
--- a/test/CodeGen/Mips/stackcoloring.ll
+++ b/test/CodeGen/Mips/stackcoloring.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
@g1 = external global i32*
diff --git a/test/CodeGen/Mips/start-asm-file.ll b/test/CodeGen/Mips/start-asm-file.ll
index 60c047a4e8cc..6d0a5425230a 100644
--- a/test/CodeGen/Mips/start-asm-file.ll
+++ b/test/CodeGen/Mips/start-asm-file.ll
@@ -3,53 +3,53 @@
; ### O32 ABI ###
; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
; RUN: -relocation-model=static %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-STATIC-O32 -check-prefix=CHECK-STATIC-O32-NLEGACY %s
+; RUN: FileCheck -check-prefixes=CHECK-STATIC-O32,CHECK-STATIC-O32-NLEGACY %s
; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
; RUN: -relocation-model=pic %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-PIC-O32 -check-prefix=CHECK-PIC-O32-NLEGACY %s
+; RUN: FileCheck -check-prefixes=CHECK-PIC-O32,CHECK-PIC-O32-NLEGACY %s
; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
; RUN: -relocation-model=static -mattr=+nan2008 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-STATIC-O32 -check-prefix=CHECK-STATIC-O32-N2008 %s
+; RUN: FileCheck -check-prefixes=CHECK-STATIC-O32,CHECK-STATIC-O32-N2008 %s
; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
; RUN: -relocation-model=pic -mattr=+nan2008 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-PIC-O32 -check-prefix=CHECK-PIC-O32-N2008 %s
+; RUN: FileCheck -check-prefixes=CHECK-PIC-O32,CHECK-PIC-O32-N2008 %s
; ### N32 ABI ###
; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
; RUN: -relocation-model=static -target-abi n32 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-STATIC-N32 -check-prefix=CHECK-STATIC-N32-NLEGACY %s
+; RUN: FileCheck -check-prefixes=CHECK-STATIC-N32,CHECK-STATIC-N32-NLEGACY %s
; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
; RUN: -relocation-model=pic -target-abi n32 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-PIC-N32 -check-prefix=CHECK-PIC-N32-NLEGACY %s
+; RUN: FileCheck -check-prefixes=CHECK-PIC-N32,CHECK-PIC-N32-NLEGACY %s
; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
; RUN: -relocation-model=static -target-abi n32 -mattr=+nan2008 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-STATIC-N32 -check-prefix=CHECK-STATIC-N32-N2008 %s
+; RUN: FileCheck -check-prefixes=CHECK-STATIC-N32,CHECK-STATIC-N32-N2008 %s
; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
; RUN: -relocation-model=pic -target-abi n32 -mattr=+nan2008 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-PIC-N32 -check-prefix=CHECK-PIC-N32-N2008 %s
+; RUN: FileCheck -check-prefixes=CHECK-PIC-N32,CHECK-PIC-N32-N2008 %s
; ### N64 ABI ###
; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
; RUN: -relocation-model=static -target-abi n64 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-STATIC-N64 -check-prefix=CHECK-STATIC-N64-NLEGACY %s
+; RUN: FileCheck -check-prefixes=CHECK-STATIC-N64,CHECK-STATIC-N64-NLEGACY %s
; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
; RUN: -relocation-model=pic -target-abi n64 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-PIC-N64 -check-prefix=CHECK-PIC-N64-NLEGACY %s
+; RUN: FileCheck -check-prefixes=CHECK-PIC-N64,CHECK-PIC-N64-NLEGACY %s
; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
; RUN: -relocation-model=static -target-abi n64 -mattr=+nan2008 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-STATIC-N64 -check-prefix=CHECK-STATIC-N64-N2008 %s
+; RUN: FileCheck -check-prefixes=CHECK-STATIC-N64,CHECK-STATIC-N64-N2008 %s
; RUN: llc -filetype=asm -mtriple mips64-unknown-linux -mcpu=mips64 \
; RUN: -relocation-model=pic -target-abi n64 -mattr=+nan2008 %s -o - | \
-; RUN: FileCheck -check-prefix=CHECK-PIC-N64 -check-prefix=CHECK-PIC-N64-N2008 %s
+; RUN: FileCheck -check-prefixes=CHECK-PIC-N64,CHECK-PIC-N64-N2008 %s
; CHECK-STATIC-O32: .abicalls
; CHECK-STATIC-O32: .option pic0
diff --git a/test/CodeGen/Mips/stchar.ll b/test/CodeGen/Mips/stchar.ll
index 4b3c8fb79315..34493e9ae338 100644
--- a/test/CodeGen/Mips/stchar.ll
+++ b/test/CodeGen/Mips/stchar.ll
@@ -5,32 +5,8 @@
@sp = common global i16* null, align 4
@cp = common global i8* null, align 4
-define void @p1(i16 signext %s, i8 signext %c) nounwind {
-entry:
- %conv = sext i16 %s to i32
- %conv1 = sext i8 %c to i32
- %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv, i32 %conv1) nounwind
- ret void
-}
-
declare i32 @printf(i8* nocapture, ...) nounwind
-define void @p2() nounwind {
-entry:
- %0 = load i16*, i16** @sp, align 4
- %1 = load i16, i16* %0, align 2
- %2 = load i8*, i8** @cp, align 4
- %3 = load i8, i8* %2, align 1
- %conv.i = sext i16 %1 to i32
- %conv1.i = sext i8 %3 to i32
- %call.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
- %4 = load i16*, i16** @sp, align 4
- store i16 32, i16* %4, align 2
- %5 = load i8*, i8** @cp, align 4
- store i8 97, i8* %5, align 1
- ret void
-}
-
define void @test() nounwind {
entry:
%s = alloca i16, align 4
@@ -58,32 +34,6 @@ entry:
; 16_h: lh ${{[0-9]+}}, [[offset2]](${{[0-9]+}})
}
-define i32 @main() nounwind {
-entry:
- %s.i = alloca i16, align 4
- %c.i = alloca i8, align 4
- %0 = bitcast i16* %s.i to i8*
- call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
- call void @llvm.lifetime.start(i64 -1, i8* %c.i) nounwind
- store i16 16, i16* %s.i, align 4
- store i8 99, i8* %c.i, align 4
- store i16* %s.i, i16** @sp, align 4
- store i8* %c.i, i8** @cp, align 4
- %call.i.i.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
- %1 = load i16*, i16** @sp, align 4
- store i16 32, i16* %1, align 2
- %2 = load i8*, i8** @cp, align 4
- store i8 97, i8* %2, align 1
- %3 = load i16, i16* %s.i, align 4
- %4 = load i8, i8* %c.i, align 4
- %conv.i.i = sext i16 %3 to i32
- %conv1.i.i = sext i8 %4 to i32
- %call.i.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), i32 %conv.i.i, i32 %conv1.i.i) nounwind
- call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
- call void @llvm.lifetime.end(i64 -1, i8* %c.i) nounwind
- ret i32 0
-}
-
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/Mips/stldst.ll b/test/CodeGen/Mips/stldst.ll
index 8aecca4aed67..62d5f1f92b45 100644
--- a/test/CodeGen/Mips/stldst.ll
+++ b/test/CodeGen/Mips/stldst.ll
@@ -33,9 +33,9 @@ entry:
%call7 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str, i32 0, i32 0), i32 %0, i32 %1, i32 %add, i32 %add1, i32 %sub, i32 %add2, i32 %add3, i32 %sub4, i32 %sub5, i32 %add6) nounwind
ret i32 0
}
-; 16: sw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Spill
-; 16: lw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Reload
-; 16: sw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Spill
-; 16: lw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Reload
+; 16: sw ${{[0-9]+}}, {{[0-9]+}}($sp) # 4-byte Folded Spill
+; 16: lw ${{[0-9]+}}, {{[0-9]+}}($sp) # 4-byte Folded Reload
+; 16: sw ${{[0-9]+}}, {{[0-9]+}}($sp) # 4-byte Folded Spill
+; 16: lw ${{[0-9]+}}, {{[0-9]+}}($sp) # 4-byte Folded Reload
declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll
index b0868255053a..61f8e508f40d 100644
--- a/test/CodeGen/Mips/tailcall.ll
+++ b/test/CodeGen/Mips/tailcall.ll
@@ -1,11 +1,12 @@
-; RUN: llc -march=mipsel -relocation-model=pic -enable-mips-tail-calls < %s | \
-; RUN: FileCheck %s -check-prefix=PIC32
-; RUN: llc -march=mipsel -relocation-model=static \
-; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=STATIC32
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -enable-mips-tail-calls \
-; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mipsel -relocation-model=pic -enable-mips-tail-calls \
+; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=PIC32
+; RUN: llc -march=mipsel -relocation-model=static -enable-mips-tail-calls \
+; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=STATIC32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -enable-mips-tail-calls \
+; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=N64
; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic \
-; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=PIC16
+; RUN: -enable-mips-tail-calls -verify-machineinstrs < %s | \
+; RUN: FileCheck %s -check-prefix=PIC16
@g0 = common global i32 0, align 4
@g1 = common global i32 0, align 4
diff --git a/test/CodeGen/Mips/thread-pointer.ll b/test/CodeGen/Mips/thread-pointer.ll
new file mode 100644
index 000000000000..60bee3d03031
--- /dev/null
+++ b/test/CodeGen/Mips/thread-pointer.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mips < %s | FileCheck %s
+; RUN: llc -march=mips64 < %s | FileCheck %s
+; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mips64el < %s | FileCheck %s
+
+declare i8* @llvm.thread.pointer() nounwind readnone
+
+define i8* @thread_pointer() {
+; CHECK: rdhwr $3, $29
+ %1 = tail call i8* @llvm.thread.pointer()
+ ret i8* %1
+}
diff --git a/test/CodeGen/Mips/tls-models.ll b/test/CodeGen/Mips/tls-models.ll
index 1a958dceaa28..ca3c7fde19d4 100644
--- a/test/CodeGen/Mips/tls-models.ll
+++ b/test/CodeGen/Mips/tls-models.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=CHECK-PIC %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck -check-prefix=CHECK-PIC %s
; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck -check-prefix=CHECK-NONPIC %s
@external_gd = external thread_local global i32
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
index 97e270fc59a6..8968567cb423 100644
--- a/test/CodeGen/Mips/tls.ll
+++ b/test/CodeGen/Mips/tls.ll
@@ -1,10 +1,10 @@
-; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \
-; RUN: FileCheck %s -check-prefix=PIC -check-prefix=CHECK
+; RUN: llc -march=mipsel -disable-mips-delay-filler -relocation-model=pic < %s | \
+; RUN: FileCheck %s -check-prefixes=PIC,CHECK
; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler < \
-; RUN: %s | FileCheck %s -check-prefix=STATIC -check-prefix=CHECK
+; RUN: %s | FileCheck %s -check-prefixes=STATIC,CHECK
; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler \
; RUN: -mips-fix-global-base-reg=false < %s | \
-; RUN: FileCheck %s -check-prefix=STATICGP -check-prefix=CHECK
+; RUN: FileCheck %s -check-prefixes=STATICGP,CHECK
@t1 = thread_local global i32 0, align 4
diff --git a/test/CodeGen/Mips/unalignedload.ll b/test/CodeGen/Mips/unalignedload.ll
index 9e453a6e794b..ba476b6c4554 100644
--- a/test/CodeGen/Mips/unalignedload.ll
+++ b/test/CodeGen/Mips/unalignedload.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EL -check-prefix=MIPS32-EL
-; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EB -check-prefix=MIPS32-EB
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EL -check-prefix=MIPS32-EL
-; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EB -check-prefix=MIPS32-EB
-; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EL -check-prefix=MIPS32R6-EL
-; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=ALL-EB -check-prefix=MIPS32R6-EB
+; RUN: llc < %s -march=mipsel -mcpu=mips32 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,ALL-EL,MIPS32-EL
+; RUN: llc < %s -march=mips -mcpu=mips32 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,ALL-EB,MIPS32-EB
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,ALL-EL,MIPS32-EL
+; RUN: llc < %s -march=mips -mcpu=mips32r2 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,ALL-EB,MIPS32-EB
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,ALL-EL,MIPS32R6-EL
+; RUN: llc < %s -march=mips -mcpu=mips32r6 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,ALL-EB,MIPS32R6-EB
%struct.S2 = type { %struct.S1, %struct.S1 }
%struct.S1 = type { i8, i8 }
%struct.S4 = type { [7 x i8] }
diff --git a/test/CodeGen/Mips/zeroreg.ll b/test/CodeGen/Mips/zeroreg.ll
index 6baf9d4fbff2..c024d04a39b0 100644
--- a/test/CodeGen/Mips/zeroreg.ll
+++ b/test/CodeGen/Mips/zeroreg.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
-; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32R6
-; RUN: llc < %s -march=mipsel -mcpu=mips4 | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
-; RUN: llc < %s -march=mipsel -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
-; RUN: llc < %s -march=mipsel -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
-; RUN: llc < %s -march=mipsel -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=64R6
+; RUN: llc < %s -march=mipsel -mcpu=mips32 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,32-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,32-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,32R6
+; RUN: llc < %s -march=mipsel -mcpu=mips4 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,64-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips64 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,64-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips64r2 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,64-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefixes=ALL,64R6
@g1 = external global i32
diff --git a/test/CodeGen/NVPTX/MachineSink-call.ll b/test/CodeGen/NVPTX/MachineSink-call.ll
new file mode 100644
index 000000000000..3a6d43b76aeb
--- /dev/null
+++ b/test/CodeGen/NVPTX/MachineSink-call.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @foo()
+
+; Load a value, then call a function. Branch, and use the loaded value only on
+; one side of the branch. The load shouldn't be sunk beneath the call, because
+; the call may modify memory.
+define i32 @f(i32 %x, i32* %ptr, i1 %cond) {
+Start:
+ ; CHECK: ld.u32
+ %ptr_val = load i32, i32* %ptr
+ ; CHECK: call.uni
+ call void @foo()
+ br i1 %cond, label %L1, label %L2
+L1:
+ %ptr_val2 = add i32 %ptr_val, 100
+ br label %L2
+L2:
+ %v4 = phi i32 [ %x, %Start ], [ %ptr_val2, %L1 ]
+ %v5 = add i32 %v4, 1000
+ ret i32 %v5
+}
diff --git a/test/CodeGen/NVPTX/MachineSink-convergent.ll b/test/CodeGen/NVPTX/MachineSink-convergent.ll
new file mode 100644
index 000000000000..91c80182e2f8
--- /dev/null
+++ b/test/CodeGen/NVPTX/MachineSink-convergent.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.barrier0()
+
+; Load a value, then syncthreads. Branch, and use the loaded value only on one
+; side of the branch. The load shouldn't be sunk beneath the call, because
+; syncthreads is modeled as maystore.
+define i32 @f(i32 %x, i32* %ptr, i1 %cond) {
+Start:
+ ; CHECK: ld.u32
+ %ptr_val = load i32, i32* %ptr
+ ; CHECK: bar.sync
+ call void @llvm.nvvm.barrier0()
+ br i1 %cond, label %L1, label %L2
+L1:
+ %ptr_val2 = add i32 %ptr_val, 100
+ br label %L2
+L2:
+ %v4 = phi i32 [ %x, %Start ], [ %ptr_val2, %L1 ]
+ %v5 = add i32 %v4, 1000
+ ret i32 %v5
+}
diff --git a/test/CodeGen/NVPTX/TailDuplication-convergent.ll b/test/CodeGen/NVPTX/TailDuplication-convergent.ll
new file mode 100644
index 000000000000..fc6867eca417
--- /dev/null
+++ b/test/CodeGen/NVPTX/TailDuplication-convergent.ll
@@ -0,0 +1,45 @@
+; RUN: llc -O2 -tail-dup-size=100 -enable-tail-merge=0 < %s | FileCheck %s
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @foo()
+declare void @llvm.nvvm.barrier0()
+
+; syncthreads shouldn't be duplicated.
+; CHECK: .func call_syncthreads
+; CHECK: bar.sync
+; CHECK-NOT: bar.sync
+define void @call_syncthreads(i32* %a, i32* %b, i1 %cond, i1 %cond2) nounwind {
+ br i1 %cond, label %L1, label %L2
+ br i1 %cond2, label %Ret, label %L1
+Ret:
+ ret void
+L1:
+ store i32 0, i32* %a
+ br label %L42
+L2:
+ store i32 1, i32* %a
+ br label %L42
+L42:
+ call void @llvm.nvvm.barrier0()
+ br label %Ret
+}
+
+; Check that call_syncthreads really does trigger tail duplication.
+; CHECK: .func call_foo
+; CHECK: call
+; CHECK: call
+define void @call_foo(i32* %a, i32* %b, i1 %cond, i1 %cond2) nounwind {
+ br i1 %cond, label %L1, label %L2
+ br i1 %cond2, label %Ret, label %L1
+Ret:
+ ret void
+L1:
+ store i32 0, i32* %a
+ br label %L42
+L2:
+ store i32 1, i32* %a
+ br label %L42
+L42:
+ call void @foo()
+ br label %Ret
+}
diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll
index c1327274a9cf..3cd5a9225087 100644
--- a/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/test/CodeGen/NVPTX/access-non-generic.ll
@@ -1,9 +1,18 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix PTX
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-use-infer-addrspace | FileCheck %s --check-prefix PTX
; RUN: opt < %s -S -nvptx-favor-non-generic -dce | FileCheck %s --check-prefix IR
+; RUN: opt < %s -S -nvptx-infer-addrspace | FileCheck %s --check-prefix IR --check-prefix IR-WITH-LOOP
@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
@scalar = internal addrspace(3) global float 0.000000e+00, align 4
+@generic_scalar = internal global float 0.000000e+00, align 4
+
+define float @ld_from_shared() {
+ %1 = addrspacecast float* @generic_scalar to float addrspace(3)*
+ %2 = load float, float addrspace(3)* %1
+ ret float %2
+}
; Verifies nvptx-favor-non-generic correctly optimizes generic address space
; usage to non-generic address space usage for the patterns we claim to handle:
@@ -13,65 +22,66 @@
; 4. store gep cast
; gep and cast can be an instruction or a constant expression. This function
; tries all possible combinations.
-define float @ld_st_shared_f32(i32 %i, float %v) {
+define void @ld_st_shared_f32(i32 %i, float %v) {
; IR-LABEL: @ld_st_shared_f32
; IR-NOT: addrspacecast
; PTX-LABEL: ld_st_shared_f32(
; load cast
%1 = load float, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+ call void @use(float %1)
; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
; store cast
store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
; use syncthreads to disable optimizations across components
- call void @llvm.cuda.syncthreads()
+ call void @llvm.nvvm.barrier0()
; PTX: bar.sync 0;
; cast; load
%2 = addrspacecast float addrspace(3)* @scalar to float*
%3 = load float, float* %2, align 4
+ call void @use(float %3)
; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
; cast; store
store float %v, float* %2, align 4
; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
- call void @llvm.cuda.syncthreads()
+ call void @llvm.nvvm.barrier0()
; PTX: bar.sync 0;
; load gep cast
%4 = load float, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+ call void @use(float %4)
; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
; store gep cast
store float %v, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
- call void @llvm.cuda.syncthreads()
+ call void @llvm.nvvm.barrier0()
; PTX: bar.sync 0;
; gep cast; load
%5 = getelementptr inbounds [10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
%6 = load float, float* %5, align 4
+ call void @use(float %6)
; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
; gep cast; store
store float %v, float* %5, align 4
; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
- call void @llvm.cuda.syncthreads()
+ call void @llvm.nvvm.barrier0()
; PTX: bar.sync 0;
; cast; gep; load
%7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
%8 = getelementptr inbounds [10 x float], [10 x float]* %7, i32 0, i32 %i
%9 = load float, float* %8, align 4
+ call void @use(float %9)
; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
; cast; gep; store
store float %v, float* %8, align 4
; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
- call void @llvm.cuda.syncthreads()
+ call void @llvm.nvvm.barrier0()
; PTX: bar.sync 0;
- %sum2 = fadd float %1, %3
- %sum3 = fadd float %sum2, %4
- %sum4 = fadd float %sum3, %6
- %sum5 = fadd float %sum4, %9
- ret float %sum5
+ ret void
}
; When hoisting an addrspacecast between different pointer types, replace the
@@ -117,13 +127,62 @@ define void @rauw(float addrspace(1)* %input) {
store float %v, float* %addr
ret void
; IR-LABEL: @rauw(
-; IR-NEXT: %1 = getelementptr float, float addrspace(1)* %input, i64 10
-; IR-NEXT: %v = load float, float addrspace(1)* %1
-; IR-NEXT: store float %v, float addrspace(1)* %1
+; IR-NEXT: %addr = getelementptr float, float addrspace(1)* %input, i64 10
+; IR-NEXT: %v = load float, float addrspace(1)* %addr
+; IR-NEXT: store float %v, float addrspace(1)* %addr
; IR-NEXT: ret void
}
-declare void @llvm.cuda.syncthreads() #3
+define void @loop() {
+; IR-WITH-LOOP-LABEL: @loop(
+entry:
+ %p = addrspacecast [10 x float] addrspace(3)* @array to float*
+ %end = getelementptr float, float* %p, i64 10
+ br label %loop
-attributes #3 = { noduplicate nounwind }
+loop:
+ %i = phi float* [ %p, %entry ], [ %i2, %loop ]
+; IR-WITH-LOOP: phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ]
+ %v = load float, float* %i
+; IR-WITH-LOOP: %v = load float, float addrspace(3)* %i
+ call void @use(float %v)
+ %i2 = getelementptr float, float* %i, i64 1
+; IR-WITH-LOOP: %i2 = getelementptr float, float addrspace(3)* %i, i64 1
+ %exit_cond = icmp eq float* %i2, %end
+ br i1 %exit_cond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+@generic_end = external global float*
+define void @loop_with_generic_bound() {
+; IR-WITH-LOOP-LABEL: @loop_with_generic_bound(
+entry:
+ %p = addrspacecast [10 x float] addrspace(3)* @array to float*
+ %end = load float*, float** @generic_end
+ br label %loop
+
+loop:
+ %i = phi float* [ %p, %entry ], [ %i2, %loop ]
+; IR-WITH-LOOP: phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ]
+ %v = load float, float* %i
+; IR-WITH-LOOP: %v = load float, float addrspace(3)* %i
+ call void @use(float %v)
+ %i2 = getelementptr float, float* %i, i64 1
+; IR-WITH-LOOP: %i2 = getelementptr float, float addrspace(3)* %i, i64 1
+ %exit_cond = icmp eq float* %i2, %end
+; IR-WITH-LOOP: addrspacecast float addrspace(3)* %i2 to float*
+; IR-WITH-LOOP: icmp eq float* %{{[0-9]+}}, %end
+ br i1 %exit_cond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+declare void @llvm.nvvm.barrier0() #3
+
+declare void @use(float)
+
+attributes #3 = { noduplicate nounwind }
diff --git a/test/CodeGen/NVPTX/alias.ll b/test/CodeGen/NVPTX/alias.ll
new file mode 100644
index 000000000000..a27851927696
--- /dev/null
+++ b/test/CodeGen/NVPTX/alias.ll
@@ -0,0 +1,7 @@
+; RUN: not llc < %s -march=nvptx -mcpu=sm_20 2>&1 | FileCheck %s
+
+; Check that llc dies gracefully when given an alias.
+
+define i32 @a() { ret i32 0 }
+; CHECK: ERROR: Module has aliases
+@b = internal alias i32 (), i32 ()* @a
diff --git a/test/CodeGen/NVPTX/arithmetic-int.ll b/test/CodeGen/NVPTX/arithmetic-int.ll
index b5a2872299b7..e7c968c4c0bf 100644
--- a/test/CodeGen/NVPTX/arithmetic-int.ll
+++ b/test/CodeGen/NVPTX/arithmetic-int.ll
@@ -29,6 +29,30 @@ define i64 @mul_i64(i64 %a, i64 %b) {
ret i64 %ret
}
+define i64 @umul_lohi_i64(i64 %a) {
+; CHECK-LABEL: umul_lohi_i64(
+entry:
+ %0 = zext i64 %a to i128
+ %1 = mul i128 %0, 288
+; CHECK: mul.lo.{{u|s}}64
+; CHECK: mul.hi.{{u|s}}64
+ %2 = lshr i128 %1, 1
+ %3 = trunc i128 %2 to i64
+ ret i64 %3
+}
+
+define i64 @smul_lohi_i64(i64 %a) {
+; CHECK-LABEL: smul_lohi_i64(
+entry:
+ %0 = sext i64 %a to i128
+ %1 = mul i128 %0, 288
+; CHECK: mul.lo.{{u|s}}64
+; CHECK: mul.hi.{{u|s}}64
+ %2 = ashr i128 %1, 1
+ %3 = trunc i128 %2 to i64
+ ret i64 %3
+}
+
define i64 @sdiv_i64(i64 %a, i64 %b) {
; CHECK: div.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
diff --git a/test/CodeGen/NVPTX/bug22322.ll b/test/CodeGen/NVPTX/bug22322.ll
index 97863b9ea546..e073e361445b 100644
--- a/test/CodeGen/NVPTX/bug22322.ll
+++ b/test/CodeGen/NVPTX/bug22322.ll
@@ -10,10 +10,10 @@ target triple = "nvptx64-nvidia-cuda"
define void @some_kernel(%class.float3* nocapture %dst) #0 {
_ZL11compute_vecRK6float3jb.exit:
%ret_vec.sroa.8.i = alloca float, align 4
- %0 = tail call i32 @llvm.ptx.read.ctaid.x()
- %1 = tail call i32 @llvm.ptx.read.ntid.x()
+ %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+ %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%2 = mul nsw i32 %1, %0
- %3 = tail call i32 @llvm.ptx.read.tid.x()
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%4 = add nsw i32 %2, %3
%5 = zext i32 %4 to i64
%6 = bitcast float* %ret_vec.sroa.8.i to i8*
@@ -37,13 +37,13 @@ _ZL11compute_vecRK6float3jb.exit:
}
; Function Attrs: nounwind readnone
-declare i32 @llvm.ptx.read.ctaid.x() #1
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.ptx.read.ntid.x() #1
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
; Function Attrs: nounwind readnone
-declare i32 @llvm.ptx.read.tid.x() #1
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
; Function Attrs: nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture) #2
diff --git a/test/CodeGen/NVPTX/bug26185-2.ll b/test/CodeGen/NVPTX/bug26185-2.ll
new file mode 100644
index 000000000000..55e9dad96c01
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug26185-2.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+
+; Verify that we correctly emit code for extending ldg/ldu. We do not expose
+; extending variants in the backend, but the ldg/ldu selection code may pick
+; extending loads as candidates. We do want to support this, so make sure we
+; emit the necessary cvt.* instructions to implement the extension and let ptxas
+; emit the real extending loads.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: spam
+define ptx_kernel void @spam(i8 addrspace(1)* noalias nocapture readonly %arg, i8 addrspace(1)* noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
+bb:
+ %tmp = bitcast i8 addrspace(1)* %arg to i16 addrspace(1)*
+ %tmp4 = bitcast i8 addrspace(1)* %arg1 to i64 addrspace(1)*
+ %tmp5 = add nsw i64 %arg3, 8
+ %tmp6 = getelementptr i16, i16 addrspace(1)* %tmp, i64 %tmp5
+; CHECK: ld.global.nc.u16
+ %tmp7 = load i16, i16 addrspace(1)* %tmp6, align 2
+; CHECK: cvt.s32.s16
+ %tmp8 = sext i16 %tmp7 to i64
+ %tmp9 = mul nsw i64 %tmp8, %tmp8
+ %tmp10 = load i64, i64 addrspace(1)* %tmp4, align 8
+ %tmp11 = add nsw i64 %tmp9, %tmp10
+ store i64 %tmp11, i64 addrspace(1)* %tmp4, align 8
+ ret void
+}
+
+attributes #0 = { norecurse nounwind "polly.skip.fn" }
+
+!nvvm.annotations = !{!0}
+
+!0 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i64, i64)* @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}
diff --git a/test/CodeGen/NVPTX/bug26185.ll b/test/CodeGen/NVPTX/bug26185.ll
new file mode 100644
index 000000000000..30313481deb0
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug26185.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+
+; Verify that we correctly emit code for i8 ldg/ldu. We do not expose 8-bit
+; registers in the backend, so these loads need special handling.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; CHECK-LABEL: ex_zext
+define void @ex_zext(i8* noalias readonly %data, i32* %res) {
+entry:
+; CHECK: ld.global.nc.u8
+ %val = load i8, i8* %data
+; CHECK: cvt.u32.u8
+ %valext = zext i8 %val to i32
+ store i32 %valext, i32* %res
+ ret void
+}
+
+; CHECK-LABEL: ex_sext
+define void @ex_sext(i8* noalias readonly %data, i32* %res) {
+entry:
+; CHECK: ld.global.nc.u8
+ %val = load i8, i8* %data
+; CHECK: cvt.s32.s8
+ %valext = sext i8 %val to i32
+ store i32 %valext, i32* %res
+ ret void
+}
+
+; CHECK-LABEL: ex_zext_v2
+define void @ex_zext_v2(<2 x i8>* noalias readonly %data, <2 x i32>* %res) {
+entry:
+; CHECK: ld.global.nc.v2.u8
+ %val = load <2 x i8>, <2 x i8>* %data
+; CHECK: cvt.u32.u16
+ %valext = zext <2 x i8> %val to <2 x i32>
+ store <2 x i32> %valext, <2 x i32>* %res
+ ret void
+}
+
+; CHECK-LABEL: ex_sext_v2
+define void @ex_sext_v2(<2 x i8>* noalias readonly %data, <2 x i32>* %res) {
+entry:
+; CHECK: ld.global.nc.v2.u8
+ %val = load <2 x i8>, <2 x i8>* %data
+; CHECK: cvt.s32.s8
+ %valext = sext <2 x i8> %val to <2 x i32>
+ store <2 x i32> %valext, <2 x i32>* %res
+ ret void
+}
+
+!nvvm.annotations = !{!0,!1,!2,!3}
+!0 = !{void (i8*, i32*)* @ex_zext, !"kernel", i32 1}
+!1 = !{void (i8*, i32*)* @ex_sext, !"kernel", i32 1}
+!2 = !{void (<2 x i8>*, <2 x i32>*)* @ex_zext_v2, !"kernel", i32 1}
+!3 = !{void (<2 x i8>*, <2 x i32>*)* @ex_sext_v2, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/convergent-mir-call.ll b/test/CodeGen/NVPTX/convergent-mir-call.ll
new file mode 100644
index 000000000000..18142450490c
--- /dev/null
+++ b/test/CodeGen/NVPTX/convergent-mir-call.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple nvptx64-nvidia-cuda -stop-after machine-cp -o - < %s 2>&1 | FileCheck %s
+
+; Check that convergent calls are emitted using convergent MIR instructions,
+; while non-convergent calls are not.
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @conv() convergent
+declare void @not_conv()
+
+define void @test(void ()* %f) {
+ ; CHECK: ConvergentCallUniPrintCall
+ ; CHECK-NEXT: @conv
+ call void @conv()
+
+ ; CHECK: CallUniPrintCall
+ ; CHECK-NEXT: @not_conv
+ call void @not_conv()
+
+ ; CHECK: ConvergentCallPrintCall
+ call void %f() convergent
+
+ ; CHECK: CallPrintCall
+ call void %f()
+
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/debug-file-loc.ll b/test/CodeGen/NVPTX/debug-file-loc.ll
new file mode 100644
index 000000000000..008e9ce54583
--- /dev/null
+++ b/test/CodeGen/NVPTX/debug-file-loc.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+
+; // Bitcode in this test case is a reduced version of the compiled code below:
+;extern "C" {
+;#line 1 "/source/dir/foo.h"
+;__device__ void foo() {}
+;#line 2 "/source/dir/bar.cu"
+;__device__ void bar() {}
+;}
+
+; CHECK: .file 1 "/source/dir{{/|\\\\}}bar.cu"
+; CHECK: .file 2 "/source/dir{{/|\\\\}}foo.h"
+
+; CHECK-LABEL: @foo
+define void @foo() !dbg !4 {
+bb:
+ ret void, !dbg !10
+}
+; CHECK: .loc 2 1
+; CHECK: ret
+
+; CHECK-LABEL: @bar
+define void @bar() !dbg !7 {
+bb:
+ ret void, !dbg !11
+}
+; CHECK: .loc 1 2
+; CHECK: ret
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "bar.cu", directory: "/source/dir")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!5 = !DIFile(filename: "foo.h", directory: "/source/dir")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !DILocation(line: 1, column: 31, scope: !4)
+!11 = !DILocation(line: 2, column: 31, scope: !7)
diff --git a/test/CodeGen/NVPTX/disable-opt.ll b/test/CodeGen/NVPTX/disable-opt.ll
new file mode 100644
index 000000000000..15e4913c1695
--- /dev/null
+++ b/test/CodeGen/NVPTX/disable-opt.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s
+
+define void @foo(i32* %output) {
+; CHECK-LABEL: .visible .func foo(
+entry:
+ %local = alloca i32
+; CHECK: __local_depot
+ store i32 1, i32* %local
+ %0 = load i32, i32* %local
+ store i32 %0, i32* %output
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/global-ctor-empty.ll b/test/CodeGen/NVPTX/global-ctor-empty.ll
new file mode 100644
index 000000000000..10ca0168b30c
--- /dev/null
+++ b/test/CodeGen/NVPTX/global-ctor-empty.ll
@@ -0,0 +1,5 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 2>&1
+
+; Check that llc doesn't die when given an empty global ctor / dtor.
+@llvm.global_ctors = appending global [0 x { i32, void ()*, i8* }] []
+@llvm.global_dtors = appending global [0 x { i32, void ()*, i8* }] []
diff --git a/test/CodeGen/NVPTX/global-ctor.ll b/test/CodeGen/NVPTX/global-ctor.ll
new file mode 100644
index 000000000000..89155db08ea5
--- /dev/null
+++ b/test/CodeGen/NVPTX/global-ctor.ll
@@ -0,0 +1,9 @@
+; RUN: not llc < %s -march=nvptx -mcpu=sm_20 2>&1 | FileCheck %s
+
+; Check that llc dies when given a nonempty global ctor.
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @foo, i8* null }]
+
+; CHECK: ERROR: Module has a nontrivial global ctor
+define internal void @foo() {
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/global-dtor.ll b/test/CodeGen/NVPTX/global-dtor.ll
new file mode 100644
index 000000000000..9d01f9bd387c
--- /dev/null
+++ b/test/CodeGen/NVPTX/global-dtor.ll
@@ -0,0 +1,9 @@
+; RUN: not llc < %s -march=nvptx -mcpu=sm_20 2>&1 | FileCheck %s
+
+; Check that llc dies when given a nonempty global dtor.
+@llvm.global_dtors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @foo, i8* null }]
+
+; CHECK: ERROR: Module has a nontrivial global dtor
+define internal void @foo() {
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/global-visibility.ll b/test/CodeGen/NVPTX/global-visibility.ll
new file mode 100644
index 000000000000..90af2950fb40
--- /dev/null
+++ b/test/CodeGen/NVPTX/global-visibility.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; PTX does not support .hidden or .protected.
+; Make sure we do not emit them.
+
+define hidden void @f_hidden() {
+ ret void
+}
+; CHECK-NOT: .hidden
+; CHECK: .visible .func f_hidden
+
+define protected void @f_protected() {
+ ret void
+}
+; CHECK-NOT: .protected
+; CHECK: .visible .func f_protected
diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll
index 3c51776c0ec9..daf83a870075 100644
--- a/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -1,282 +1,323 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -nvvm-intr-range \
+; RUN: | FileCheck --check-prefix=RANGE --check-prefix=RANGE_20 %s
+; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \
+; RUN: -nvvm-intr-range -nvvm-intr-range-sm=30 \
+; RUN: | FileCheck --check-prefix=RANGE --check-prefix=RANGE_30 %s
define ptx_device i32 @test_tid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[BLK_IDX_XY:[0-9]+]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.tid.x()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %x
}
define ptx_device i32 @test_tid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.y(), !range ![[BLK_IDX_XY]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.tid.y()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %x
}
define ptx_device i32 @test_tid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.z(), !range ![[BLK_IDX_Z:[0-9]+]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.tid.z()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
ret i32 %x
}
define ptx_device i32 @test_tid_w() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.w;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.tid.w()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.w()
ret i32 %x
}
define ptx_device i32 @test_ntid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range ![[BLK_SIZE_XY:[0-9]+]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.ntid.x()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
ret i32 %x
}
define ptx_device i32 @test_ntid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y(), !range ![[BLK_SIZE_XY]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.ntid.y()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
ret i32 %x
}
define ptx_device i32 @test_ntid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z(), !range ![[BLK_SIZE_Z:[0-9]+]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.ntid.z()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
ret i32 %x
}
define ptx_device i32 @test_ntid_w() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.w;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.ntid.w()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
ret i32 %x
}
define ptx_device i32 @test_laneid() {
; CHECK: mov.u32 %r{{[0-9]+}}, %laneid;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.laneid(), !range ![[LANEID:[0-9]+]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.laneid()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
ret i32 %x
}
-define ptx_device i32 @test_warpid() {
-; CHECK: mov.u32 %r{{[0-9]+}}, %warpid;
+define ptx_device i32 @test_warpsize() {
+; CHECK: mov.u32 %r{{[0-9]+}}, WARP_SZ;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range ![[WARPSIZE:[0-9]+]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.warpid()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
ret i32 %x
}
-define ptx_device i32 @test_nwarpid() {
-; CHECK: mov.u32 %r{{[0-9]+}}, %nwarpid;
+define ptx_device i32 @test_warpid() {
+; CHECK: mov.u32 %r{{[0-9]+}}, %warpid;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.nwarpid()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.warpid()
ret i32 %x
}
-define ptx_device i32 @test_ctaid_x() {
-; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
+define ptx_device i32 @test_nwarpid() {
+; CHECK: mov.u32 %r{{[0-9]+}}, %nwarpid;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.ctaid.x()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
ret i32 %x
}
define ptx_device i32 @test_ctaid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !range ![[GRID_IDX_YZ:[0-9]+]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.ctaid.y()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %x
}
define ptx_device i32 @test_ctaid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !range ![[GRID_IDX_YZ]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.ctaid.z()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
ret i32 %x
}
-define ptx_device i32 @test_ctaid_w() {
-; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.w;
+define ptx_device i32 @test_ctaid_x() {
+; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
+; RANGE_30: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[GRID_IDX_X:[0-9]+]]
+; RANGE_20: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[GRID_IDX_YZ]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.ctaid.w()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %x
}
-define ptx_device i32 @test_nctaid_x() {
-; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
+define ptx_device i32 @test_ctaid_w() {
+; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.w;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.nctaid.x()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
ret i32 %x
}
define ptx_device i32 @test_nctaid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !range ![[GRID_SIZE_YZ:[0-9]+]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.nctaid.y()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
ret i32 %x
}
define ptx_device i32 @test_nctaid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z;
+; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z(), !range ![[GRID_SIZE_YZ]]
+; CHECK: ret;
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+ ret i32 %x
+}
+
+define ptx_device i32 @test_nctaid_x() {
+; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
+; RANGE_30: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[GRID_SIZE_X:[0-9]+]]
+; RANGE_20: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[GRID_SIZE_YZ]]
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.nctaid.z()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
ret i32 %x
}
+
define ptx_device i32 @test_nctaid_w() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.w;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.nctaid.w()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
ret i32 %x
}
define ptx_device i32 @test_smid() {
; CHECK: mov.u32 %r{{[0-9]+}}, %smid;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.smid()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.smid()
ret i32 %x
}
define ptx_device i32 @test_nsmid() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nsmid;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.nsmid()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
ret i32 %x
}
define ptx_device i32 @test_gridid() {
; CHECK: mov.u32 %r{{[0-9]+}}, %gridid;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.gridid()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.gridid()
ret i32 %x
}
define ptx_device i32 @test_lanemask_eq() {
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_eq;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.lanemask.eq()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
ret i32 %x
}
define ptx_device i32 @test_lanemask_le() {
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_le;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.lanemask.le()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
ret i32 %x
}
define ptx_device i32 @test_lanemask_lt() {
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_lt;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.lanemask.lt()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
ret i32 %x
}
define ptx_device i32 @test_lanemask_ge() {
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_ge;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.lanemask.ge()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
ret i32 %x
}
define ptx_device i32 @test_lanemask_gt() {
; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_gt;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.lanemask.gt()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()
ret i32 %x
}
define ptx_device i32 @test_clock() {
; CHECK: mov.u32 %r{{[0-9]+}}, %clock;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.clock()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.clock()
ret i32 %x
}
define ptx_device i64 @test_clock64() {
; CHECK: mov.u64 %rd{{[0-9]+}}, %clock64;
; CHECK: ret;
- %x = call i64 @llvm.ptx.read.clock64()
+ %x = call i64 @llvm.nvvm.read.ptx.sreg.clock64()
ret i64 %x
}
define ptx_device i32 @test_pm0() {
; CHECK: mov.u32 %r{{[0-9]+}}, %pm0;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.pm0()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.pm0()
ret i32 %x
}
define ptx_device i32 @test_pm1() {
; CHECK: mov.u32 %r{{[0-9]+}}, %pm1;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.pm1()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.pm1()
ret i32 %x
}
define ptx_device i32 @test_pm2() {
; CHECK: mov.u32 %r{{[0-9]+}}, %pm2;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.pm2()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.pm2()
ret i32 %x
}
define ptx_device i32 @test_pm3() {
; CHECK: mov.u32 %r{{[0-9]+}}, %pm3;
; CHECK: ret;
- %x = call i32 @llvm.ptx.read.pm3()
+ %x = call i32 @llvm.nvvm.read.ptx.sreg.pm3()
ret i32 %x
}
define ptx_device void @test_bar_sync() {
; CHECK: bar.sync 0
; CHECK: ret;
- call void @llvm.ptx.bar.sync(i32 0)
+ call void @llvm.nvvm.bar.sync(i32 0)
ret void
}
-declare i32 @llvm.ptx.read.tid.x()
-declare i32 @llvm.ptx.read.tid.y()
-declare i32 @llvm.ptx.read.tid.z()
-declare i32 @llvm.ptx.read.tid.w()
-declare i32 @llvm.ptx.read.ntid.x()
-declare i32 @llvm.ptx.read.ntid.y()
-declare i32 @llvm.ptx.read.ntid.z()
-declare i32 @llvm.ptx.read.ntid.w()
-
-declare i32 @llvm.ptx.read.laneid()
-declare i32 @llvm.ptx.read.warpid()
-declare i32 @llvm.ptx.read.nwarpid()
-
-declare i32 @llvm.ptx.read.ctaid.x()
-declare i32 @llvm.ptx.read.ctaid.y()
-declare i32 @llvm.ptx.read.ctaid.z()
-declare i32 @llvm.ptx.read.ctaid.w()
-declare i32 @llvm.ptx.read.nctaid.x()
-declare i32 @llvm.ptx.read.nctaid.y()
-declare i32 @llvm.ptx.read.nctaid.z()
-declare i32 @llvm.ptx.read.nctaid.w()
-
-declare i32 @llvm.ptx.read.smid()
-declare i32 @llvm.ptx.read.nsmid()
-declare i32 @llvm.ptx.read.gridid()
-
-declare i32 @llvm.ptx.read.lanemask.eq()
-declare i32 @llvm.ptx.read.lanemask.le()
-declare i32 @llvm.ptx.read.lanemask.lt()
-declare i32 @llvm.ptx.read.lanemask.ge()
-declare i32 @llvm.ptx.read.lanemask.gt()
-
-declare i32 @llvm.ptx.read.clock()
-declare i64 @llvm.ptx.read.clock64()
-
-declare i32 @llvm.ptx.read.pm0()
-declare i32 @llvm.ptx.read.pm1()
-declare i32 @llvm.ptx.read.pm2()
-declare i32 @llvm.ptx.read.pm3()
-
-declare void @llvm.ptx.bar.sync(i32 %i)
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.w()
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+declare i32 @llvm.nvvm.read.ptx.sreg.warpid()
+declare i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.smid()
+declare i32 @llvm.nvvm.read.ptx.sreg.nsmid()
+declare i32 @llvm.nvvm.read.ptx.sreg.gridid()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
+declare i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
+declare i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
+declare i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
+declare i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.clock()
+declare i64 @llvm.nvvm.read.ptx.sreg.clock64()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.pm0()
+declare i32 @llvm.nvvm.read.ptx.sreg.pm1()
+declare i32 @llvm.nvvm.read.ptx.sreg.pm2()
+declare i32 @llvm.nvvm.read.ptx.sreg.pm3()
+
+declare void @llvm.nvvm.bar.sync(i32 %i)
+
+; RANGE-DAG: ![[BLK_IDX_XY]] = !{i32 0, i32 1024}
+; RANGE-DAG: ![[BLK_IDX_Z]] = !{i32 0, i32 64}
+; RANGE-DAG: ![[BLK_SIZE_XY]] = !{i32 1, i32 1025}
+; RANGE-DAG: ![[BLK_SIZE_Z]] = !{i32 1, i32 65}
+; RANGE-DAG: ![[LANEID]] = !{i32 0, i32 32}
+; RANGE-DAG: ![[WARPSIZE]] = !{i32 32, i32 33}
+; RANGE_30-DAG: ![[GRID_IDX_X]] = !{i32 0, i32 2147483647}
+; RANGE-DAG: ![[GRID_IDX_YZ]] = !{i32 0, i32 65535}
+; RANGE_30-DAG: ![[GRID_SIZE_X]] = !{i32 1, i32 -2147483648}
+; RANGE-DAG: ![[GRID_SIZE_YZ]] = !{i32 1, i32 65536}
diff --git a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 2fec31b3791d..ca7fb6eddfe8 100644
--- a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -3,8 +3,8 @@
; Make sure the call to syncthreads is not duplicated here by the LLVM
; optimizations, because it has the noduplicate attribute set.
-; CHECK: call void @llvm.cuda.syncthreads
-; CHECK-NOT: call void @llvm.cuda.syncthreads
+; CHECK: call void @llvm.nvvm.barrier0
+; CHECK-NOT: call void @llvm.nvvm.barrier0
; Function Attrs: nounwind
define void @foo(float* %output) #1 {
@@ -37,7 +37,7 @@ if.else: ; preds = %entry
br label %if.end
if.end: ; preds = %if.else, %if.then
- call void @llvm.cuda.syncthreads()
+ call void @llvm.nvvm.barrier0()
%6 = load float*, float** %output.addr, align 8
%arrayidx6 = getelementptr inbounds float, float* %6, i64 0
%7 = load float, float* %arrayidx6, align 4
@@ -68,7 +68,7 @@ if.end17: ; preds = %if.else13, %if.then
}
; Function Attrs: noduplicate nounwind
-declare void @llvm.cuda.syncthreads() #2
+declare void @llvm.nvvm.barrier0() #2
!0 = !{void (float*)* @foo, !"kernel", i32 1}
!1 = !{null, !"align", i32 8}
diff --git a/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll b/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll
new file mode 100644
index 000000000000..4fdab5c087de
--- /dev/null
+++ b/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll
@@ -0,0 +1,13 @@
+; RUN: opt < %s -S -nvvm-reflect | FileCheck %s
+
+declare i32 @__nvvm_reflect(i8*)
+@str = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
+
+define i32 @foo() {
+ %call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
+ ; CHECK: ret i32 42
+ ret i32 %call
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nvvm-reflect-ftz", i32 42}
diff --git a/test/CodeGen/NVPTX/shfl.ll b/test/CodeGen/NVPTX/shfl.ll
new file mode 100644
index 000000000000..e4899f66fb65
--- /dev/null
+++ b/test/CodeGen/NVPTX/shfl.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -disable-nvptx-favor-non-generic | FileCheck %s
+
+declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
+declare float @llvm.nvvm.shfl.down.f32(float, i32, i32)
+declare i32 @llvm.nvvm.shfl.up.i32(i32, i32, i32)
+declare float @llvm.nvvm.shfl.up.f32(float, i32, i32)
+declare i32 @llvm.nvvm.shfl.bfly.i32(i32, i32, i32)
+declare float @llvm.nvvm.shfl.bfly.f32(float, i32, i32)
+declare i32 @llvm.nvvm.shfl.idx.i32(i32, i32, i32)
+declare float @llvm.nvvm.shfl.idx.f32(float, i32, i32)
+
+; Try all four permutations of register and immediate parameters with
+; shfl.down.
+
+; CHECK-LABEL: .func{{.*}}shfl.down1
+define i32 @shfl.down1(i32 %in) {
+ ; CHECK: ld.param.u32 [[IN:%r[0-9]+]]
+ ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]], [[IN]], 1, 2;
+ ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+ %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 1, i32 2)
+ ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.down2
+define i32 @shfl.down2(i32 %in, i32 %width) {
+ ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
+ ; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]
+ ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], [[IN2]], 3;
+ %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 %width, i32 3)
+ ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.down3
+define i32 @shfl.down3(i32 %in, i32 %mask) {
+ ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
+ ; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]
+ ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], 4, [[IN2]];
+ %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 4, i32 %mask)
+ ret i32 %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.down4
+define i32 @shfl.down4(i32 %in, i32 %width, i32 %mask) {
+ ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
+ ; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]
+ ; CHECK: ld.param.u32 [[IN3:%r[0-9]+]]
+ ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], [[IN2]], [[IN3]];
+ %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 %width, i32 %mask)
+ ret i32 %val
+}
+
+; Try shfl.down with floating-point params.
+; CHECK-LABEL: .func{{.*}}shfl.down.float
+define float @shfl.down.float(float %in) {
+ ; CHECK: ld.param.f32 [[IN:%f[0-9]+]]
+ ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]], [[IN]], 5, 6;
+ ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+ %out = call float @llvm.nvvm.shfl.down.f32(float %in, i32 5, i32 6)
+ ret float %out
+}
+
+; Try the rest of the shfl modes. Hopefully they're declared in such a way
+; that if shfl.down works correctly, they also work correctly.
+define void @shfl.rest(i32 %in_i32, float %in_float, i32* %out_i32, float* %out_float) {
+ ; CHECK: shfl.up.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 1, 2;
+ %up_i32 = call i32 @llvm.nvvm.shfl.up.i32(i32 %in_i32, i32 1, i32 2)
+ store i32 %up_i32, i32* %out_i32
+
+ ; CHECK: shfl.up.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 3, 4;
+ %up_float = call float @llvm.nvvm.shfl.up.f32(float %in_float, i32 3, i32 4)
+ store float %up_float, float* %out_float
+
+ ; CHECK: shfl.bfly.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 5, 6;
+ %bfly_i32 = call i32 @llvm.nvvm.shfl.bfly.i32(i32 %in_i32, i32 5, i32 6)
+ store i32 %bfly_i32, i32* %out_i32
+
+ ; CHECK: shfl.bfly.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 7, 8;
+ %bfly_float = call float @llvm.nvvm.shfl.bfly.f32(float %in_float, i32 7, i32 8)
+ store float %bfly_float, float* %out_float
+
+ ; CHECK: shfl.idx.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 9, 10;
+ %idx_i32 = call i32 @llvm.nvvm.shfl.idx.i32(i32 %in_i32, i32 9, i32 10)
+ store i32 %idx_i32, i32* %out_i32
+
+ ; CHECK: shfl.idx.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 11, 12;
+ %idx_float = call float @llvm.nvvm.shfl.idx.f32(float %in_float, i32 11, i32 12)
+ store float %idx_float, float* %out_float
+
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/sm-version-60.ll b/test/CodeGen/NVPTX/sm-version-60.ll
new file mode 100644
index 000000000000..4f6b508a70b5
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-60.ll
@@ -0,0 +1,5 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_60 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
+
+; CHECK: .version 5.0
+; CHECK: .target sm_60
diff --git a/test/CodeGen/NVPTX/sm-version-61.ll b/test/CodeGen/NVPTX/sm-version-61.ll
new file mode 100644
index 000000000000..535ef066d0c3
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-61.ll
@@ -0,0 +1,5 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_61 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_61 | FileCheck %s
+
+; CHECK: .version 5.0
+; CHECK: .target sm_61
diff --git a/test/CodeGen/NVPTX/sm-version-62.ll b/test/CodeGen/NVPTX/sm-version-62.ll
new file mode 100644
index 000000000000..7d425b6d12e9
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-62.ll
@@ -0,0 +1,5 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_62 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_62 | FileCheck %s
+
+; CHECK: .version 5.0
+; CHECK: .target sm_62
diff --git a/test/CodeGen/NVPTX/speculative-execution-divergent-target.ll b/test/CodeGen/NVPTX/speculative-execution-divergent-target.ll
new file mode 100644
index 000000000000..128e9e9ba2a3
--- /dev/null
+++ b/test/CodeGen/NVPTX/speculative-execution-divergent-target.ll
@@ -0,0 +1,24 @@
+; Checks that speculative-execution only runs on divergent targets if you pass
+; -spec-exec-only-if-divergent-target.
+
+; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -speculative-execution | \
+; RUN: FileCheck --check-prefix=ON %s
+; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -speculative-execution \
+; RUN: -spec-exec-only-if-divergent-target | \
+; RUN: FileCheck --check-prefix=ON %s
+; RUN: opt < %s -S -speculative-execution -spec-exec-only-if-divergent-target | \
+; RUN: FileCheck --check-prefix=OFF %s
+
+; Hoist in if-then pattern.
+define void @f() {
+; ON: %x = add i32 2, 3
+; ON: br i1 true
+; OFF: br i1 true
+; OFF: %x = add i32 2, 3
+ br i1 true, label %a, label %b
+a:
+ %x = add i32 2, 3
+ br label %b
+b:
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/zeroext-32bit.ll b/test/CodeGen/NVPTX/zeroext-32bit.ll
new file mode 100644
index 000000000000..c2f0ec4b1447
--- /dev/null
+++ b/test/CodeGen/NVPTX/zeroext-32bit.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+
+; The zeroext attribute below should be silently ignored because
+; we can pass a 32-bit integer across a function call without
+; needing to extend it.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-cuda"
+
+; CHECK-LABEL: .visible .func zeroext_test
+; CHECK-NOT: cvt.u32.u16
+define void @zeroext_test() {
+ tail call void @call1(i32 zeroext 0)
+ ret void
+}
+
+declare void @call1(i32 zeroext)
+
+; CHECK-LABEL: .visible .func signext_test
+; CHECK-NOT: cvt.s32.s16
+define void @signext_test() {
+ tail call void @call2(i32 zeroext 0)
+ ret void
+}
+
+declare void @call2(i32 zeroext)
diff --git a/test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll b/test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll
index aae914ecc435..3be596f9a531 100644
--- a/test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll
+++ b/test/CodeGen/PowerPC/2007-09-07-LoadStoreIdxForms.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=ppc64 | FileCheck %s
+; RUN: llc < %s -march=ppc64 -O1 | FileCheck %s
+; RUN: llc < %s -march=ppc64 | FileCheck --check-prefix=CHECK-OPT %s
%struct.__db_region = type { %struct.__mutex_t, [4 x i8], %struct.anon, i32, [1 x i32] }
%struct.__mutex_t = type { i32 }
@@ -15,6 +16,24 @@ entry:
; CHECK: @foo
; CHECK: lwzx
; CHECK: blr
+; CHECK-OPT: @foo
+; CHECK-OPT: lwz
+; CHECK-OPT: blr
}
+define signext i32 @test(i32* noalias nocapture readonly %b, i32 signext %n) {
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul = mul nsw i32 %0, 7
+ ret i32 %mul
+
+; CHECK-OPT: @test
+; CHECK-OPT: lwzx
+; CHECK-OPT: blr
+
+}
+
+
declare i32 @bork(...)
diff --git a/test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll b/test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll
index 01c83cb4bcbe..e1d19a7b246a 100644
--- a/test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll
+++ b/test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll
@@ -12,7 +12,7 @@
%struct.objc_selector = type opaque
%struct.pthread_mutex_t = type { i32, [40 x i8] }
%struct.pthread_rwlock_t = type { i32, [124 x i8] }
-external constant %struct.__builtin_CFString ; <%struct.__builtin_CFString*>:0 [#uses=1]
+@0 = external constant %struct.__builtin_CFString ; <%struct.__builtin_CFString*>:0 [#uses=1]
define void @"-[PFTPersistentSymbols saveSymbolWithName:address:path:lineNumber:flags:owner:]"(%struct.PFTPersistentSymbols* %self, %struct.objc_selector* %_cmd, %struct.NSArray* %name, i64 %address, %struct.NSArray* %path, i32 %lineNumber, i64 %flags, %struct..0objc_object* %owner) nounwind {
entry:
diff --git a/test/CodeGen/PowerPC/2008-07-15-Bswap.ll b/test/CodeGen/PowerPC/2008-07-15-Bswap.ll
index b271048fd045..ab1973fa429c 100644
--- a/test/CodeGen/PowerPC/2008-07-15-Bswap.ll
+++ b/test/CodeGen/PowerPC/2008-07-15-Bswap.ll
@@ -68,8 +68,6 @@ declare void @_Z33LoopFilter_Internal_FilterChromaHPhiiiiii(i8*, i32, i32, i32,
declare void @_Z42LoopFilter_Internal_filter_macroblock_lumaPK14LoopFilterInfoPhS2_iiiPK30PerMacroblockBoundaryStrengthsjj(%struct.LoopFilterInfo*, i8*, i8*, i32, i32, i32, %struct.PerMacroblockBoundaryStrengths*, i32, i32) nounwind
-declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
-
declare i32 @_Z40LoopFilter_Internal_FilterLumaPlaneMBAFFPK14LoopFilterInfojjj(%struct.LoopFilterInfo*, i32, i32, i32) nounwind
declare void @_Z18LoopFilter_DestroyP14LoopFilterInfo(%struct.LoopFilterInfo*)
@@ -86,8 +84,6 @@ declare void @jvtDisposePTRMemAligned(i8*)
declare void @_Z31LoopFilter_Internal_ResetTablesP14LoopFilterInfo(%struct.LoopFilterInfo*) nounwind
-declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
-
define i32 @_Z60LoopFilter_Internal_CalculateBoundaryStrengths_MbaffFramePicPK14LoopFilterInfoP22FrameMotionVectorCachejj(%struct.LoopFilterInfo* %lfiPtr, %struct.FrameMotionVectorCache* %frameMotionVectorCachePtr, i32 %mbY_min, i32 %mbY_maxPlus1) nounwind {
entry:
icmp ult i32 %mbY_min, %mbY_maxPlus1 ; <i1>:0 [#uses=1]
@@ -383,4 +379,3 @@ declare i32 @_Z22LoopFilter_FilterFrameP14LoopFilterInfoP11FramePixelsP22FrameMo
declare void @_Z34LF_Threading2_ProcessTasks_WrapperPv(i8*)
-declare void @llvm.memset.i64(i8*, i8, i64, i32) nounwind
diff --git a/test/CodeGen/PowerPC/2010-02-04-EmptyGlobal.ll b/test/CodeGen/PowerPC/2010-02-04-EmptyGlobal.ll
index 1ba11d3fd036..160b26e9078d 100644
--- a/test/CodeGen/PowerPC/2010-02-04-EmptyGlobal.ll
+++ b/test/CodeGen/PowerPC/2010-02-04-EmptyGlobal.ll
@@ -6,7 +6,7 @@
@_cmd = constant %cmd.type zeroinitializer
; CHECK: .globl __cmd
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: __cmd:
; CHECK-NEXT: .byte 0
diff --git a/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll b/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll
index 35e3fdd26e72..dbc521fb8f1f 100644
--- a/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll
+++ b/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll
@@ -8,10 +8,10 @@ declare void @init() nounwind
declare void @clock() nounwind
-; CHECK: %entry
+; CHECK: mflr 0
; CHECK: fmr 31, 1
; CHECK: bl init
-define void @s332(double %t) nounwind {
+define double @s332(double %t) nounwind {
entry:
tail call void @init()
tail call void @clock() nounwind
@@ -29,5 +29,7 @@ for.body4: ; preds = %for.cond2
L20: ; preds = %for.body4, %for.cond2
%index.0 = phi i32 [ -2, %for.cond2 ], [ %i.0, %for.body4 ]
- unreachable
+ %index.d = sitofp i32 %index.0 to double
+ %retval = fadd double %t, %index.d
+ ret double %retval
}
diff --git a/test/CodeGen/PowerPC/2016-04-16-ADD8TLS.ll b/test/CodeGen/PowerPC/2016-04-16-ADD8TLS.ll
new file mode 100644
index 000000000000..c0b48d30bd48
--- /dev/null
+++ b/test/CodeGen/PowerPC/2016-04-16-ADD8TLS.ll
@@ -0,0 +1,43 @@
+; RUN: llc <%s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@x = external thread_local global i32, align 4
+
+; Function Attrs: nounwind
+; CHECK-NOT: add [[REG1:[0-9]+]], 0, x@tls
+define void @f() {
+ %1 = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "nop", "={r3},={r4},={r5},={r6},={r7},={r8},={r9},={r10},={r11},={r12},={r14},={r15},={r16},={r17},={r18},={r19},={r20},={r21},={r22},={r23},={r24},={r25},={r26},={r27},={r28},={r29},={r30},~{memory}"()
+ %2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 1
+ %4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 2
+ %5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 3
+ %6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 4
+ %7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 5
+ %8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 6
+ %9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 7
+ %10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 8
+ %11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 9
+ %12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 10
+ %13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 11
+ %14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 12
+ %15 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 13
+ %16 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 14
+ %17 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 15
+ %18 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 16
+ %19 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 17
+ %20 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 18
+ %21 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 19
+ %22 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 20
+ %23 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 21
+ %24 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 22
+ %25 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 23
+ %26 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 24
+ %27 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 25
+ %28 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %1, 26
+ %29 = load i32, i32* @x, align 4
+ %30 = add nsw i32 %29, 1
+ store i32 %30, i32* @x, align 4
+ tail call void asm sideeffect "nop", "{r3},{r4},{r5},{r6},{r7},{r8},{r9},{r10},{r11},{r12},{r14},{r15},{r16},{r17},{r18},{r19},{r20},{r21},{r22},{r23},{r24},{r25},{r26},{r27},{r28},{r29},{r30},~{memory}"(i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %17, i32 %18, i32 %19, i32 %20, i32 %21, i32 %22, i32 %23, i32 %24, i32 %25, i32 %26, i32 %27, i32 %28)
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/2016-04-17-combine.ll b/test/CodeGen/PowerPC/2016-04-17-combine.ll
new file mode 100644
index 000000000000..7ad943cf35fe
--- /dev/null
+++ b/test/CodeGen/PowerPC/2016-04-17-combine.ll
@@ -0,0 +1,26 @@
+; RUN: llc <%s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; PR27390 crasher
+
+%typ = type { i32, i32 }
+
+; On release builds it doesn't crash; it spews nonsense instead.
+; To make sure it works, check that the 'and' is still alive.
+; CHECK: and
+; Also, in release, it emits a COPY from a 32-bit register to
+; a 64-bit register, which happens to be emitted as cror [!]
+; by the confused CodeGen. Just to be sure, check there isn't one.
+; CHECK-NOT: cror
+; Function Attrs: uwtable
+define signext i32 @_Z8access_pP1Tc(%typ* %p, i8 zeroext %type) {
+ %b = getelementptr inbounds %typ, %typ* %p, i64 0, i32 1
+ %1 = load i32, i32* %b, align 4
+ %2 = ptrtoint i32* %b to i64
+ %3 = and i64 %2, -35184372088833
+ %4 = inttoptr i64 %3 to i32*
+ %_msld = load i32, i32* %4, align 4
+ %zzz = add i32 %1, %_msld
+ ret i32 %zzz
+}
diff --git a/test/CodeGen/PowerPC/2016-04-28-setjmp.ll b/test/CodeGen/PowerPC/2016-04-28-setjmp.ll
new file mode 100644
index 000000000000..09c0fa7ba972
--- /dev/null
+++ b/test/CodeGen/PowerPC/2016-04-28-setjmp.ll
@@ -0,0 +1,48 @@
+; RUN: llc -filetype=obj <%s | llvm-objdump --disassemble - | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@ptr = common global i8* null, align 8
+
+; Verify there's no junk between these two instructions from a misemitted
+; EH_SjLj_Setup.
+
+; CHECK: li 3, 1
+; CHECK-NEXT: b .+4
+
+define void @h() nounwind {
+ %1 = load i8**, i8*** bitcast (i8** @ptr to i8***), align 8
+ %2 = tail call i8* @llvm.frameaddress(i32 0)
+ store i8* %2, i8** %1, align 8
+ %3 = tail call i8* @llvm.stacksave()
+ %4 = getelementptr inbounds i8*, i8** %1, i64 2
+ store i8* %3, i8** %4, align 8
+ %5 = bitcast i8** %1 to i8*
+ %6 = tail call i32 @llvm.eh.sjlj.setjmp(i8* %5)
+ %7 = icmp eq i32 %6, 0
+ br i1 %7, label %9, label %8
+
+; <label>:8: ; preds = %0
+ tail call void @g()
+ br label %10
+
+; <label>:9: ; preds = %0
+ tail call void @f()
+ br label %10
+
+; <label>:10: ; preds = %9, %8
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.frameaddress(i32)
+
+; Function Attrs: nounwind
+declare i8* @llvm.stacksave()
+
+; Function Attrs: nounwind
+declare i32 @llvm.eh.sjlj.setjmp(i8*)
+
+declare void @g()
+
+declare void @f()
diff --git a/test/CodeGen/PowerPC/BreakableToken-reduced.ll b/test/CodeGen/PowerPC/BreakableToken-reduced.ll
index 2077dbb820f7..992d2aac4a58 100644
--- a/test/CodeGen/PowerPC/BreakableToken-reduced.ll
+++ b/test/CodeGen/PowerPC/BreakableToken-reduced.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-shrink-wrap=true %s -o - | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-shrink-wrap=true %s -o - | FileCheck %s
;
; Test the use of a non-R0 register to save/restore the LR in function
; prologue/epilogue.
diff --git a/test/CodeGen/PowerPC/aantidep-def-ec.mir b/test/CodeGen/PowerPC/aantidep-def-ec.mir
index d1cb6782f038..809d3693af60 100644
--- a/test/CodeGen/PowerPC/aantidep-def-ec.mir
+++ b/test/CodeGen/PowerPC/aantidep-def-ec.mir
@@ -45,6 +45,7 @@ name: mm_update_next_owner
alignment: 4
exposesReturnsTwice: false
hasInlineAsm: true
+allVRegsAllocated: true
isSSA: false
tracksRegLiveness: true
tracksSubRegLiveness: false
diff --git a/test/CodeGen/PowerPC/addisdtprelha-nonr3.mir b/test/CodeGen/PowerPC/addisdtprelha-nonr3.mir
index e4aaaf30f90f..6f52aa21a775 100644
--- a/test/CodeGen/PowerPC/addisdtprelha-nonr3.mir
+++ b/test/CodeGen/PowerPC/addisdtprelha-nonr3.mir
@@ -27,6 +27,7 @@ name: test1
alignment: 4
exposesReturnsTwice: false
hasInlineAsm: false
+allVRegsAllocated: true
isSSA: false
tracksRegLiveness: true
tracksSubRegLiveness: false
diff --git a/test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll b/test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll
new file mode 100644
index 000000000000..c57588668d8b
--- /dev/null
+++ b/test/CodeGen/PowerPC/aggressive-anti-dep-breaker-subreg.ll
@@ -0,0 +1,24 @@
+; RUN: llc %s -mtriple=powerpc64-unknown-linux-gnu -O2 -o - -optimize-regalloc=false -regalloc=fast | FileCheck %s
+
+declare void @func(i8*, i64, i64)
+
+define void @test(i8* %context, i32** %elementArrayPtr, i32 %value) {
+entry:
+ %cmp = icmp eq i32 %value, 0
+ br i1 %cmp, label %lreturn, label %lnext
+
+lnext:
+ %elementArray = load i32*, i32** %elementArrayPtr, align 8
+; CHECK: lwz [[LDREG:[0-9]+]], 124(1) # 4-byte Folded Reload
+; CHECK: # implicit-def: %X[[TEMPREG:[0-9]+]]
+ %element = load i32, i32* %elementArray, align 4
+; CHECK: mr [[TEMPREG]], [[LDREG]]
+; CHECK: clrldi 4, [[TEMPREG]], 32
+ %element.ext = zext i32 %element to i64
+ %value.ext = zext i32 %value to i64
+ call void @func(i8* %context, i64 %value.ext, i64 %element.ext)
+ br label %lreturn
+
+lreturn:
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/align.ll b/test/CodeGen/PowerPC/align.ll
index 0797ca8d0be8..52ac2c12e03a 100644
--- a/test/CodeGen/PowerPC/align.ll
+++ b/test/CodeGen/PowerPC/align.ll
@@ -9,33 +9,33 @@
; no alignment
@c = global i16 2
-;ELF: .align 1
+;ELF: .p2align 1
;ELF: c:
-;DARWIN: .align 1
+;DARWIN: .p2align 1
;DARWIN: _c:
@d = global i32 3
-;ELF: .align 2
+;ELF: .p2align 2
;ELF: d:
-;DARWIN: .align 2
+;DARWIN: .p2align 2
;DARWIN: _d:
@e = global i64 4
-;ELF: .align 3
+;ELF: .p2align 3
;ELF: e
-;DARWIN: .align 3
+;DARWIN: .p2align 3
;DARWIN: _e:
@f = global float 5.0
-;ELF: .align 2
+;ELF: .p2align 2
;ELF: f:
-;DARWIN: .align 2
+;DARWIN: .p2align 2
;DARWIN: _f:
@g = global double 6.0
-;ELF: .align 3
+;ELF: .p2align 3
;ELF: g:
-;DARWIN: .align 3
+;DARWIN: .p2align 3
;DARWIN: _g:
@bar = common global [75 x i8] zeroinitializer, align 128
diff --git a/test/CodeGen/PowerPC/andc.ll b/test/CodeGen/PowerPC/andc.ll
new file mode 100644
index 000000000000..8b8eca6bad5b
--- /dev/null
+++ b/test/CodeGen/PowerPC/andc.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=powerpc64-apple-darwin | FileCheck %s
+
+; TODO: These could use 'andc'.
+
+define i1 @and_cmp1(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp1:
+; CHECK: ; BB#0:
+; CHECK-NEXT: and r2, r3, r4
+; CHECK-NEXT: li r3, 1
+; CHECK-NEXT: cmpw cr0, r2, r4
+; CHECK-NEXT: bclr 12, 2, 0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: li r3, 0
+; CHECK-NEXT: blr
+;
+ %and = and i32 %x, %y
+ %cmp = icmp eq i32 %and, %y
+ ret i1 %cmp
+}
+
+define i1 @and_cmp_const(i32 %x) {
+; CHECK-LABEL: and_cmp_const:
+; CHECK: ; BB#0:
+; CHECK-NEXT: andi. r2, r3, 43
+; CHECK-NEXT: li r3, 1
+; CHECK-NEXT: cmpwi r2, 43
+; CHECK-NEXT: bclr 12, 2, 0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: li r3, 0
+; CHECK-NEXT: blr
+;
+ %and = and i32 %x, 43
+ %cmp = icmp eq i32 %and, 43
+ ret i1 %cmp
+}
+
diff --git a/test/CodeGen/PowerPC/asm-constraints.ll b/test/CodeGen/PowerPC/asm-constraints.ll
index 2d9b0eb591d3..e7b6366bf995 100644
--- a/test/CodeGen/PowerPC/asm-constraints.ll
+++ b/test/CodeGen/PowerPC/asm-constraints.ll
@@ -1,5 +1,10 @@
; RUN: llc < %s -mcpu=pwr8 | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+; Check that we accept 'U' and 'X' constraints.
; Generated from following C code:
;
; void foo (int result, char *addr) {
@@ -12,34 +17,57 @@
; : "m"(*addr) : "memory", "cr0");
; }
-target datalayout = "e-m:e-i64:64-n32:64"
-target triple = "powerpc64le-unknown-linux-gnu"
-
-; Function Attrs: nounwind
-; Check that we accept 'U' and 'X' constraints.
define void @foo(i32 signext %result, i8* %addr) #0 {
+
+; CHECK-LABEL: @foo
+; CHECK: ld [[REG:[0-9]+]], 0(4)
+; CHECK: cmpw [[REG]], [[REG]]
+; CHECK: bne- 0, .Ltmp[[TMP:[0-9]+]]
+; CHECK: .Ltmp[[TMP]]:
+; CHECK: isync
+
entry:
%result.addr = alloca i32, align 4
%addr.addr = alloca i8*, align 8
store i32 %result, i32* %result.addr, align 4
store i8* %addr, i8** %addr.addr, align 8
%0 = load i8*, i8** %addr.addr, align 8
- %1 = call i32 asm sideeffect "ld${1:U}${1:X} $0,$1\0Acmpw $0,$0\0Abne- 1f\0A1: isync\0A", "=r,*m,~{memory},~{cr0}"(i8* %0) #1, !srcloc !1
+ %1 = call i32 asm sideeffect "ld${1:U}${1:X} $0,$1\0Acmpw $0,$0\0Abne- 1f\0A1: isync\0A", "=r,*m,~{memory},~{cr0}"(i8* %0) #1, !srcloc !0
store i32 %1, i32* %result.addr, align 4
ret void
}
-; CHECK-LABEL: @foo
-; CHECK: ld [[REG:[0-9]+]], 0(4)
-; CHECK: cmpw [[REG]], [[REG]]
-; CHECK: bne- 0, .Ltmp[[TMP:[0-9]+]]
-; CHECK: .Ltmp[[TMP]]:
-; CHECK: isync
+; Function Attrs: nounwind
+; Check that we accept the 'd' constraint.
+; Generated from the following C code:
+; int foo(double x) {
+; int64_t result;
+; __asm__ __volatile__("fctid %0, %1"
+; : "=d"(result)
+; : "d"(x)
+; : /* No clobbers */);
+; return result;
+; }
+define signext i32 @bar(double %x) #0 {
+
+; CHECK-LABEL: @bar
+; CHECK: fctid 0, 1
+entry:
+ %x.addr = alloca double, align 8
+ %result = alloca i64, align 8
+ store double %x, double* %x.addr, align 8
+ %0 = load double, double* %x.addr, align 8
+ %1 = call i64 asm sideeffect "fctid $0, $1", "=d,d"(double %0) #1, !srcloc !1
+ store i64 %1, i64* %result, align 8
+ %2 = load i64, i64* %result, align 8
+ %conv = trunc i64 %2 to i32
+ ret i32 %conv
+}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-!llvm.ident = !{!0}
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+attributes #1 = { nounwind }
-!0 = !{!"clang version 3.6.0 (trunk 217557)"}
-!1 = !{i32 67, i32 91, i32 110, i32 126}
+!0 = !{i32 67, i32 91, i32 110, i32 126}
+!1 = !{i32 84}
diff --git a/test/CodeGen/PowerPC/asm-printer-topological-order.ll b/test/CodeGen/PowerPC/asm-printer-topological-order.ll
new file mode 100644
index 000000000000..e1e62da4dbc2
--- /dev/null
+++ b/test/CodeGen/PowerPC/asm-printer-topological-order.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+@TestA = alias void (), void ()* @TestC
+@TestB = alias void (), void ()* @TestC
+@TestC = alias void (), void ()* @TestD
+
+define void @TestD() {
+entry:
+ ret void
+}
+
+; CHECK-LABEL: TestD:
+; CHECK: TestC = TestD
+; CHECK-DAG: TestB = TestC
+; CHECK-DAG: TestA = TestC
diff --git a/test/CodeGen/PowerPC/atomics-fences.ll b/test/CodeGen/PowerPC/atomics-fences.ll
index c015fa6eefb0..7682f8a7ef46 100644
--- a/test/CodeGen/PowerPC/atomics-fences.ll
+++ b/test/CodeGen/PowerPC/atomics-fences.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
-; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=440 | FileCheck %s --check-prefix=PPC440
; Fences
diff --git a/test/CodeGen/PowerPC/available-externally.ll b/test/CodeGen/PowerPC/available-externally.ll
index 53c435995485..6169e3fba995 100644
--- a/test/CodeGen/PowerPC/available-externally.ll
+++ b/test/CodeGen/PowerPC/available-externally.ll
@@ -1,12 +1,12 @@
; RUN: llc < %s -relocation-model=static | FileCheck %s -check-prefix=STATIC
-; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-apple-darwin9 | FileCheck %s -check-prefix=PIC
; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-unknown-linux | FileCheck %s -check-prefix=PICELF
-; RUN: llc < %s -relocation-model=pic -mtriple=powerpc64-apple-darwin8 | FileCheck %s -check-prefix=PIC64
-; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=DYNAMIC
-; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc64-apple-darwin8 | FileCheck %s -check-prefix=DYNAMIC64
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc64-apple-darwin9 | FileCheck %s -check-prefix=PIC64
+; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc-apple-darwin9 | FileCheck %s -check-prefix=DYNAMIC
+; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc64-apple-darwin9 | FileCheck %s -check-prefix=DYNAMIC64
; PR4482
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "powerpc-apple-darwin8"
+target triple = "powerpc-apple-darwin9"
define i32 @foo(i64 %x) nounwind {
entry:
@@ -16,7 +16,7 @@ entry:
; STATIC: .subsections_via_symbols
; PIC: _foo:
-; PIC: bl L_exact_log2$stub
+; PIC: bl _exact_log2
; PIC: blr
; PICELF: foo:
@@ -24,15 +24,15 @@ entry:
; PICELF: blr
; PIC64: _foo:
-; PIC64: bl L_exact_log2$stub
+; PIC64: bl _exact_log2
; PIC64: blr
; DYNAMIC: _foo:
-; DYNAMIC: bl L_exact_log2$stub
+; DYNAMIC: bl _exact_log2
; DYNAMIC: blr
; DYNAMIC64: _foo:
-; DYNAMIC64: bl L_exact_log2$stub
+; DYNAMIC64: bl _exact_log2
; DYNAMIC64: blr
%A = call i32 @exact_log2(i64 %x) nounwind
@@ -45,70 +45,7 @@ entry:
}
-; PIC: .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
-; PIC: L_exact_log2$stub:
-; PIC: .indirect_symbol _exact_log2
-; PIC: mflr r0
-; PIC: bcl 20, 31, L_exact_log2$stub$tmp
-
-; PIC: L_exact_log2$stub$tmp:
-; PIC: mflr r11
-; PIC: addis r11, r11, ha16(L_exact_log2$lazy_ptr-L_exact_log2$stub$tmp)
-; PIC: mtlr r0
-; PIC: lwzu r12, lo16(L_exact_log2$lazy_ptr-L_exact_log2$stub$tmp)(r11)
-; PIC: mtctr r12
-; PIC: bctr
-
-; PIC: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
-; PIC: L_exact_log2$lazy_ptr:
-; PIC: .indirect_symbol _exact_log2
-; PIC: .long dyld_stub_binding_helper
-
; PIC: .subsections_via_symbols
-; PIC64: .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
-; PIC64: L_exact_log2$stub:
-; PIC64: .indirect_symbol _exact_log2
-; PIC64: mflr r0
-; PIC64: bcl 20, 31, L_exact_log2$stub$tmp
-
-; PIC64: L_exact_log2$stub$tmp:
-; PIC64: mflr r11
-; PIC64: addis r11, r11, ha16(L_exact_log2$lazy_ptr-L_exact_log2$stub$tmp)
-; PIC64: mtlr r0
-; PIC64: ldu r12, lo16(L_exact_log2$lazy_ptr-L_exact_log2$stub$tmp)(r11)
-; PIC64: mtctr r12
-; PIC64: bctr
-
-; PIC64: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
-; PIC64: L_exact_log2$lazy_ptr:
-; PIC64: .indirect_symbol _exact_log2
-; PIC64: .quad dyld_stub_binding_helper
; PIC64: .subsections_via_symbols
-
-; DYNAMIC: .section __TEXT,__symbol_stub1,symbol_stubs,pure_instructions,16
-; DYNAMIC: L_exact_log2$stub:
-; DYNAMIC: .indirect_symbol _exact_log2
-; DYNAMIC: lis r11, ha16(L_exact_log2$lazy_ptr)
-; DYNAMIC: lwzu r12, lo16(L_exact_log2$lazy_ptr)(r11)
-; DYNAMIC: mtctr r12
-; DYNAMIC: bctr
-
-; DYNAMIC: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
-; DYNAMIC: L_exact_log2$lazy_ptr:
-; DYNAMIC: .indirect_symbol _exact_log2
-; DYNAMIC: .long dyld_stub_binding_helper
-
-; DYNAMIC64: .section __TEXT,__symbol_stub1,symbol_stubs,pure_instructions,16
-; DYNAMIC64: L_exact_log2$stub:
-; DYNAMIC64: .indirect_symbol _exact_log2
-; DYNAMIC64: lis r11, ha16(L_exact_log2$lazy_ptr)
-; DYNAMIC64: ldu r12, lo16(L_exact_log2$lazy_ptr)(r11)
-; DYNAMIC64: mtctr r12
-; DYNAMIC64: bctr
-
-; DYNAMIC64: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
-; DYNAMIC64: L_exact_log2$lazy_ptr:
-; DYNAMIC64: .indirect_symbol _exact_log2
-; DYNAMIC64: .quad dyld_stub_binding_helper
diff --git a/test/CodeGen/PowerPC/bdzlr.ll b/test/CodeGen/PowerPC/bdzlr.ll
index d6506044868f..a7c5fa4faa00 100644
--- a/test/CodeGen/PowerPC/bdzlr.ll
+++ b/test/CodeGen/PowerPC/bdzlr.ll
@@ -37,7 +37,8 @@ for.body: ; preds = %for.body.for.body_c
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body.for.body_crit_edge ]
%tt = getelementptr inbounds %struct.lua_TValue.17.692, %struct.lua_TValue.17.692* %0, i64 %indvars.iv, i32 1
%1 = load i32, i32* %tt, align 4
- store i32 %1, i32* undef, align 4
+ %2 = add i32 %1, %1
+ store i32 %2, i32* %tt, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
diff --git a/test/CodeGen/PowerPC/builtins-ppc-p8vector.ll b/test/CodeGen/PowerPC/builtins-ppc-p8vector.ll
index 37111ef0d89b..7afb1a659b6d 100644
--- a/test/CodeGen/PowerPC/builtins-ppc-p8vector.ll
+++ b/test/CodeGen/PowerPC/builtins-ppc-p8vector.ll
@@ -4,7 +4,9 @@
; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-VSX
@vsc = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, align 16
+@vsc2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, align 16
@vuc = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, align 16
+@vuc2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, align 16
@res_vll = common global <2 x i64> zeroinitializer, align 16
@res_vull = common global <2 x i64> zeroinitializer, align 16
@res_vsc = common global <16 x i8> zeroinitializer, align 16
@@ -13,54 +15,39 @@
; Function Attrs: nounwind
define void @test1() {
entry:
- %__a.addr.i = alloca <16 x i8>, align 16
- %__b.addr.i = alloca <16 x i8>, align 16
%0 = load <16 x i8>, <16 x i8>* @vsc, align 16
- %1 = load <16 x i8>, <16 x i8>* @vsc, align 16
- store <16 x i8> %0, <16 x i8>* %__a.addr.i, align 16
- store <16 x i8> %1, <16 x i8>* %__b.addr.i, align 16
- %2 = load <16 x i8>, <16 x i8>* %__a.addr.i, align 16
- %3 = load <16 x i8>, <16 x i8>* %__b.addr.i, align 16
- %4 = call <2 x i64> @llvm.ppc.altivec.vbpermq(<16 x i8> %2, <16 x i8> %3)
- store <2 x i64> %4, <2 x i64>* @res_vll, align 16
+ %1 = load <16 x i8>, <16 x i8>* @vsc2, align 16
+ %2 = call <2 x i64> @llvm.ppc.altivec.vbpermq(<16 x i8> %0, <16 x i8> %1)
+ store <2 x i64> %2, <2 x i64>* @res_vll, align 16
ret void
; CHECK-LABEL: @test1
-; CHECK: lvx [[REG1:[0-9]+]],
-; CHECK: lvx [[REG2:[0-9]+]],
-; CHECK: vbpermq {{[0-9]+}}, [[REG2]], [[REG1]]
+; CHECK: lvx [[REG1:[0-9]+]], 0, 3
+; CHECK: lvx [[REG2:[0-9]+]], 0, 4
+; CHECK: vbpermq {{[0-9]+}}, [[REG1]], [[REG2]]
; CHECK-VSX: vbpermq {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
}
; Function Attrs: nounwind
define void @test2() {
entry:
- %__a.addr.i = alloca <16 x i8>, align 16
- %__b.addr.i = alloca <16 x i8>, align 16
%0 = load <16 x i8>, <16 x i8>* @vuc, align 16
- %1 = load <16 x i8>, <16 x i8>* @vuc, align 16
- store <16 x i8> %0, <16 x i8>* %__a.addr.i, align 16
- store <16 x i8> %1, <16 x i8>* %__b.addr.i, align 16
- %2 = load <16 x i8>, <16 x i8>* %__a.addr.i, align 16
- %3 = load <16 x i8>, <16 x i8>* %__b.addr.i, align 16
- %4 = call <2 x i64> @llvm.ppc.altivec.vbpermq(<16 x i8> %2, <16 x i8> %3)
- store <2 x i64> %4, <2 x i64>* @res_vull, align 16
+ %1 = load <16 x i8>, <16 x i8>* @vuc2, align 16
+ %2 = call <2 x i64> @llvm.ppc.altivec.vbpermq(<16 x i8> %0, <16 x i8> %1)
+ store <2 x i64> %2, <2 x i64>* @res_vull, align 16
ret void
; CHECK-LABEL: @test2
-; CHECK: lvx [[REG1:[0-9]+]],
-; CHECK: lvx [[REG2:[0-9]+]],
-; CHECK: vbpermq {{[0-9]+}}, [[REG2]], [[REG1]]
+; CHECK: lvx [[REG1:[0-9]+]], 0, 3
+; CHECK: lvx [[REG2:[0-9]+]], 0, 4
+; CHECK: vbpermq {{[0-9]+}}, [[REG1]], [[REG2]]
; CHECK-VSX: vbpermq {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
}
; Function Attrs: nounwind
define void @test3() {
entry:
- %__a.addr.i = alloca <16 x i8>, align 16
%0 = load <16 x i8>, <16 x i8>* @vsc, align 16
- store <16 x i8> %0, <16 x i8>* %__a.addr.i, align 16
- %1 = load <16 x i8>, <16 x i8>* %__a.addr.i, align 16
- %2 = call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> %1)
- store <16 x i8> %2, <16 x i8>* @res_vsc, align 16
+ %1 = call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> %0)
+ store <16 x i8> %1, <16 x i8>* @res_vsc, align 16
ret void
; CHECK-LABEL: @test3
; CHECK: lvx [[REG1:[0-9]+]],
@@ -71,12 +58,9 @@ entry:
; Function Attrs: nounwind
define void @test4() {
entry:
- %__a.addr.i = alloca <16 x i8>, align 16
%0 = load <16 x i8>, <16 x i8>* @vuc, align 16
- store <16 x i8> %0, <16 x i8>* %__a.addr.i, align 16
- %1 = load <16 x i8>, <16 x i8>* %__a.addr.i, align 16
- %2 = call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> %1)
- store <16 x i8> %2, <16 x i8>* @res_vuc, align 16
+ %1 = call <16 x i8> @llvm.ppc.altivec.vgbbd(<16 x i8> %0)
+ store <16 x i8> %1, <16 x i8>* @res_vuc, align 16
ret void
; CHECK-LABEL: @test4
; CHECK: lvx [[REG1:[0-9]+]],
diff --git a/test/CodeGen/PowerPC/cannonicalize-vector-shifts.ll b/test/CodeGen/PowerPC/cannonicalize-vector-shifts.ll
new file mode 100644
index 000000000000..68d4530e81ec
--- /dev/null
+++ b/test/CodeGen/PowerPC/cannonicalize-vector-shifts.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+define <4 x i32> @test1(<4 x i32> %a) {
+entry:
+; CHECK-LABEL: test1
+; CHECK: xxswapd 34, 34
+ %vecins6 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ ret <4 x i32> %vecins6
+}
+
+define <8 x i16> @test2(<8 x i16> %a) #0 {
+entry:
+; CHECK-LABEL: test2
+; CHECK: xxswapd 34, 34
+ %vecins14 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i16> %vecins14
+}
+
+define <16 x i8> @test3(<16 x i8> %a) #0 {
+entry:
+; CHECK-LABEL: test3
+; CHECK: xxswapd 34, 34
+ %vecins30 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i8> %vecins30
+}
diff --git a/test/CodeGen/PowerPC/code-align.ll b/test/CodeGen/PowerPC/code-align.ll
index 19d1b236ce0d..a6fd37421064 100644
--- a/test/CodeGen/PowerPC/code-align.ll
+++ b/test/CodeGen/PowerPC/code-align.ll
@@ -22,9 +22,9 @@ entry:
; GENERIC-LABEL: .globl foo
; BASIC-LABEL: .globl foo
; PWR-LABEL: .globl foo
-; GENERIC: .align 2
-; BASIC: .align 4
-; PWR: .align 4
+; GENERIC: .p2align 2
+; BASIC: .p2align 4
+; PWR: .p2align 4
; GENERIC: @foo
; BASIC: @foo
; PWR: @foo
@@ -41,9 +41,9 @@ entry:
; GENERIC: mtctr
; BASIC: mtctr
; PWR: mtctr
-; GENERIC-NOT: .align
-; BASIC: .align 4
-; PWR: .align 4
+; GENERIC-NOT: .p2align
+; BASIC: .p2align 4
+; PWR: .p2align 4
; GENERIC: lwzu
; BASIC: lwzu
; PWR: lwzu
@@ -83,9 +83,9 @@ entry:
; GENERIC: mtctr
; BASIC: mtctr
; PWR: mtctr
-; GENERIC-NOT: .align
-; BASIC: .align 4
-; PWR: .align 5
+; GENERIC-NOT: .p2align
+; BASIC: .p2align 4
+; PWR: .p2align 5
; GENERIC: bdnz
; BASIC: bdnz
; PWR: bdnz
@@ -105,6 +105,48 @@ for.end: ; preds = %for.body
ret void
}
+; Function Attrs: nounwind
+define void @test_minsize(i32 signext %x, i32* nocapture %a) #2 {
+entry:
+ br label %vector.body
+
+; GENERIC-LABEL: @test_minsize
+; BASIC-LABEL: @test_minsize
+; PWR-LABEL: @test_minsize
+; GENERIC: mtctr
+; BASIC: mtctr
+; PWR: mtctr
+; GENERIC-NOT: .p2align
+; BASIC-NOT: .p2align
+; PWR-NOT: .p2align
+; GENERIC: lwzu
+; BASIC: lwzu
+; PWR: lwzu
+; GENERIC: bdnz
+; BASIC: bdnz
+; PWR: bdnz
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %induction45 = or i64 %index, 1
+ %0 = getelementptr inbounds i32, i32* %a, i64 %index
+ %1 = getelementptr inbounds i32, i32* %a, i64 %induction45
+ %2 = load i32, i32* %0, align 4
+ %3 = load i32, i32* %1, align 4
+ %4 = add nsw i32 %2, 4
+ %5 = add nsw i32 %3, 4
+ %6 = mul nsw i32 %4, 3
+ %7 = mul nsw i32 %5, 3
+ store i32 %6, i32* %0, align 4
+ store i32 %7, i32* %1, align 4
+ %index.next = add i64 %index, 2
+ %8 = icmp eq i64 %index.next, 2048
+ br i1 %8, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+ ret void
+}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
+attributes #2 = { nounwind minsize}
diff --git a/test/CodeGen/PowerPC/combine-to-pre-index-store-crash.ll b/test/CodeGen/PowerPC/combine-to-pre-index-store-crash.ll
new file mode 100644
index 000000000000..9bd0dd874b4a
--- /dev/null
+++ b/test/CodeGen/PowerPC/combine-to-pre-index-store-crash.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+; CHECK-LABEL: TestFoo:
+; CHECK: std
+; CHECK: bl TestBar
+; CHECK: stbu
+; CHECK: std
+; CHECK: blr
+
+%StructA = type <{ i64, { i64, i64 }, { i64, i64 } }>
+
+define void @TestFoo(%StructA* %this) {
+ %tmp = getelementptr inbounds %StructA, %StructA* %this, i64 0, i32 1
+ %tmp11 = getelementptr inbounds %StructA, %StructA* %this, i64 0, i32 1, i32 1
+ %tmp12 = bitcast { i64, i64 }* %tmp to i64**
+ store i64* %tmp11, i64** %tmp12
+ call void @TestBar()
+ %tmp13 = getelementptr inbounds %StructA, %StructA* %this, i64 0, i32 2, i32 1
+ store i64* %tmp13, i64** undef
+ %.cast.i.i.i = bitcast i64* %tmp13 to i8*
+ store i8 0, i8* %.cast.i.i.i
+ ret void
+}
+
+declare void @TestBar()
diff --git a/test/CodeGen/PowerPC/crsave.ll b/test/CodeGen/PowerPC/crsave.ll
index 8121e1b6e639..a079e7f1585e 100644
--- a/test/CodeGen/PowerPC/crsave.ll
+++ b/test/CodeGen/PowerPC/crsave.ll
@@ -1,5 +1,6 @@
; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC32
; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 < %s | FileCheck %s -check-prefix=PPC64
+; RUN: llc -O0 -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s -check-prefix=PPC64-ELFv2
declare void @foo()
@@ -60,3 +61,22 @@ entry:
; PPC64: mtocrf 16, 12
; PPC64: mtocrf 8, 12
+; Generate mfocrf in the prologue when we need to save 1 nonvolatile CR field
+define void @cloberOneNvCrField() {
+entry:
+ tail call void asm sideeffect "# clobbers", "~{cr2}"()
+ ret void
+
+; PPC64-ELFv2-LABEL: @cloberOneNvCrField
+; PPC64-ELFv2: mfocrf [[REG1:[0-9]+]], 32
+}
+
+; Generate mfcr in the prologue when we need to save all nonvolatile CR fields
+define void @cloberAllNvCrField() {
+entry:
+ tail call void asm sideeffect "# clobbers", "~{cr2},~{cr3},~{cr4}"()
+ ret void
+
+; PPC64-ELFv2-LABEL: @cloberAllNvCrField
+; PPC64-ELFv2: mfcr [[REG1:[0-9]+]]
+}
diff --git a/test/CodeGen/PowerPC/crypto_bifs.ll b/test/CodeGen/PowerPC/crypto_bifs.ll
index f58935b85b66..62247e8118f4 100644
--- a/test/CodeGen/PowerPC/crypto_bifs.ll
+++ b/test/CodeGen/PowerPC/crypto_bifs.ll
@@ -1,6 +1,7 @@
; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+crypto < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
; FIXME: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s
; FIXME: The original intent was to add a check-next for the blr after every check.
; However, this currently fails since we don't eliminate stores of the unused
diff --git a/test/CodeGen/PowerPC/ctr-minmaxnum.ll b/test/CodeGen/PowerPC/ctr-minmaxnum.ll
new file mode 100644
index 000000000000..cecf620124c1
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctr-minmaxnum.ll
@@ -0,0 +1,231 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare float @fabsf(float)
+
+declare float @fminf(float, float)
+declare double @fmin(double, double)
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+
+declare float @fmaxf(float, float)
+declare double @fmax(double, double)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
+
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
+
+define void @test1(float %f, float* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call float @llvm.minnum.f32(float %f, float 1.0)
+ store float %0, float* %fp, align 4
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test1:
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
+
+define void @test1v(<4 x float> %f, <4 x float>* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call <4 x float> @llvm.minnum.v4f32(<4 x float> %f, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <4 x float> %0, <4 x float>* %fp, align 16
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test1v:
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
+
+; QPX-LABEL: test1v:
+; QPX: mtctr
+; QPX-NOT: bl fminf
+; QPX: blr
+
+define void @test1a(float %f, float* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call float @fminf(float %f, float 1.0) readnone
+ store float %0, float* %fp, align 4
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test1a:
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
+
+define void @test2(float %f, float* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call float @llvm.maxnum.f32(float %f, float 1.0)
+ store float %0, float* %fp, align 4
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test2:
+; CHECK-NOT: mtctr
+; CHECK: bl fmaxf
+
+define void @test2v(<4 x double> %f, <4 x double>* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %f, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>)
+ store <4 x double> %0, <4 x double>* %fp, align 16
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test2v:
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
+
+; QPX-LABEL: test2v:
+; QPX: mtctr
+; QPX-NOT: bl fmax
+; QPX: blr
+
+define void @test2a(float %f, float* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call float @fmaxf(float %f, float 1.0) readnone
+ store float %0, float* %fp, align 4
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test2a:
+; CHECK-NOT: mtctr
+; CHECK: bl fmaxf
+
+define void @test3(double %f, double* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call double @llvm.minnum.f64(double %f, double 1.0)
+ store double %0, double* %fp, align 8
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test3:
+; CHECK-NOT: mtctr
+; CHECK: bl fmin
+
+define void @test3a(double %f, double* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call double @fmin(double %f, double 1.0) readnone
+ store double %0, double* %fp, align 8
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test3a:
+; CHECK-NOT: mtctr
+; CHECK: bl fmin
+
+define void @test4(double %f, double* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call double @llvm.maxnum.f64(double %f, double 1.0)
+ store double %0, double* %fp, align 8
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test4:
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
+
+define void @test4a(double %f, double* %fp) {
+entry:
+ br label %loop_body
+
+loop_body:
+ %invar_address.dim.0.01 = phi i64 [ 0, %entry ], [ %1, %loop_body ]
+ %0 = call double @fmax(double %f, double 1.0) readnone
+ store double %0, double* %fp, align 8
+ %1 = add i64 %invar_address.dim.0.01, 1
+ %2 = icmp eq i64 %1, 2
+ br i1 %2, label %loop_exit, label %loop_body
+
+loop_exit:
+ ret void
+}
+
+; CHECK-LABEL: test4a:
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
+
diff --git a/test/CodeGen/PowerPC/ctrloop-udivti3.ll b/test/CodeGen/PowerPC/ctrloop-udivti3.ll
index d07a11fe60fb..e3f6838d0670 100644
--- a/test/CodeGen/PowerPC/ctrloop-udivti3.ll
+++ b/test/CodeGen/PowerPC/ctrloop-udivti3.ll
@@ -13,8 +13,9 @@ for.body.lr.ph: ; preds = %entry
for.body: ; preds = %for.body, %for.body.lr.ph
%i.018.in = phi i64 [ %n, %for.body.lr.ph ], [ %i.018, %for.body ]
%i.018 = add i64 %i.018.in, -1
- %add.i = or i128 undef, undef
- %div.i = udiv i128 %add.i, 0
+ %jj = sext i64 %i.018 to i128
+ %add.i = or i128 %jj, undef
+ %div.i = udiv i128 %add.i, %jj
%conv3.i11 = trunc i128 %div.i to i64
store i64 %conv3.i11, i64* undef, align 8
%cmp = icmp eq i64 %i.018, 0
diff --git a/test/CodeGen/PowerPC/ctrloops-softfloat.ll b/test/CodeGen/PowerPC/ctrloops-softfloat.ll
new file mode 100644
index 000000000000..037bfda7323b
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloops-softfloat.ll
@@ -0,0 +1,129 @@
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -O1 < %s | FileCheck %s
+
+; double x, y;
+;
+; void foo1()
+; {
+; x = y = 1.1;
+; for (int i = 0; i < 175; i++)
+; y = x + y;
+; }
+; void foo2()
+; {
+; x = y = 1.1;
+; for (int i = 0; i < 175; i++)
+; y = x - y;
+; }
+; void foo3()
+; {
+; x = y = 1.1;
+; for (int i = 0; i < 175; i++)
+; y = x * y;
+; }
+; void foo4()
+; {
+; x = y = 1.1;
+; for (int i = 0; i < 175; i++)
+; y = x / y;
+; }
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-buildroot-linux-gnu"
+
+@y = common global double 0.000000e+00, align 8
+@x = common global double 0.000000e+00, align 8
+
+define void @foo1() #0 {
+ store double 1.100000e+00, double* @y, align 8
+ store double 1.100000e+00, double* @x, align 8
+ br label %2
+
+; <label>:1 ; preds = %2
+ %.lcssa = phi double [ %4, %2 ]
+ store double %.lcssa, double* @y, align 8
+ ret void
+
+; <label>:2 ; preds = %2, %0
+ %3 = phi double [ 1.100000e+00, %0 ], [ %4, %2 ]
+ %i.01 = phi i32 [ 0, %0 ], [ %5, %2 ]
+ %4 = fadd double %3, 1.100000e+00
+ %5 = add nuw nsw i32 %i.01, 1
+ %exitcond = icmp eq i32 %5, 75
+ br i1 %exitcond, label %1, label %2
+ ; CHECK: bl __adddf3
+ ; CHECK: cmplwi
+ ; CHECK-NOT: li [[REG1:[0-9]+]], 175
+ ; CHECK-NOT: mtctr [[REG1]]
+}
+
+define void @foo2() #0 {
+ store double 1.100000e+00, double* @y, align 8
+ store double 1.100000e+00, double* @x, align 8
+ br label %2
+
+; <label>:1 ; preds = %2
+ %.lcssa = phi double [ %4, %2 ]
+ store double %.lcssa, double* @y, align 8
+ ret void
+
+; <label>:2 ; preds = %2, %0
+ %3 = phi double [ 1.100000e+00, %0 ], [ %4, %2 ]
+ %i.01 = phi i32 [ 0, %0 ], [ %5, %2 ]
+ %4 = fsub double 1.100000e+00, %3
+ %5 = add nuw nsw i32 %i.01, 1
+ %exitcond = icmp eq i32 %5, 75
+ br i1 %exitcond, label %1, label %2
+ ; CHECK: bl __subdf3
+ ; CHECK: cmplwi
+ ; CHECK-NOT: li [[REG1:[0-9]+]], 175
+ ; CHECK-NOT: mtctr [[REG1]]
+}
+
+define void @foo3() #0 {
+ store double 1.100000e+00, double* @y, align 8
+ store double 1.100000e+00, double* @x, align 8
+ br label %2
+
+; <label>:1 ; preds = %2
+ %.lcssa = phi double [ %4, %2 ]
+ store double %.lcssa, double* @y, align 8
+ ret void
+
+; <label>:2 ; preds = %2, %0
+ %3 = phi double [ 1.100000e+00, %0 ], [ %4, %2 ]
+ %i.01 = phi i32 [ 0, %0 ], [ %5, %2 ]
+ %4 = fmul double %3, 1.100000e+00
+ %5 = add nuw nsw i32 %i.01, 1
+ %exitcond = icmp eq i32 %5, 75
+ br i1 %exitcond, label %1, label %2
+ ; CHECK: bl __muldf3
+ ; CHECK: cmplwi
+ ; CHECK-NOT: li [[REG1:[0-9]+]], 175
+ ; CHECK-NOT: mtctr [[REG1]]
+}
+
+define void @foo4() #0 {
+ store double 1.100000e+00, double* @y, align 8
+ store double 1.100000e+00, double* @x, align 8
+ br label %2
+
+; <label>:1 ; preds = %2
+ %.lcssa = phi double [ %4, %2 ]
+ store double %.lcssa, double* @y, align 8
+ ret void
+
+; <label>:2 ; preds = %2, %0
+ %3 = phi double [ 1.100000e+00, %0 ], [ %4, %2 ]
+ %i.01 = phi i32 [ 0, %0 ], [ %5, %2 ]
+ %4 = fdiv double 1.100000e+00, %3
+ %5 = add nuw nsw i32 %i.01, 1
+ %exitcond = icmp eq i32 %5, 75
+ br i1 %exitcond, label %1, label %2
+ ; CHECK: bl __divdf3
+ ; CHECK: cmplwi
+ ; CHECK-NOT: li [[REG1:[0-9]+]], 175
+ ; CHECK-NOT: mtctr [[REG1]]
+}
+
+attributes #0 = { "use-soft-float"="true" }
+
diff --git a/test/CodeGen/PowerPC/ctrloops.ll b/test/CodeGen/PowerPC/ctrloops.ll
index fff9e20d2626..f1ad58fde2b6 100644
--- a/test/CodeGen/PowerPC/ctrloops.ll
+++ b/test/CodeGen/PowerPC/ctrloops.ll
@@ -76,23 +76,22 @@ for.end: ; preds = %for.body, %entry
@tls_var = external thread_local global i8
-define i32 @test4() {
+define i32 @test4(i32 %inp) {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
- %phi = phi i32 [ %dec, %for.body ], [ undef, %entry ]
+ %phi = phi i32 [ %dec, %for.body ], [ %inp, %entry ]
%load = ptrtoint i8* @tls_var to i32
+ %val = add i32 %load, %phi
%dec = add i32 %phi, -1
%cmp = icmp sgt i32 %phi, 1
br i1 %cmp, label %for.body, label %return
return: ; preds = %for.body
- ret i32 %load
+ ret i32 %val
; CHECK-LABEL: @test4
-; CHECK-NOT: mtctr
-; CHECK: addi {{[0-9]+}}
-; CHECK: cmpwi
-; CHECK-NOT: bdnz
-; CHECK: bgt
+; CHECK: mtctr
+; CHECK: bdnz
+; CHECK: __tls_get_addr
}
diff --git a/test/CodeGen/PowerPC/cxx_tlscc64.ll b/test/CodeGen/PowerPC/cxx_tlscc64.ll
new file mode 100644
index 000000000000..2baff8498843
--- /dev/null
+++ b/test/CodeGen/PowerPC/cxx_tlscc64.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s --enable-shrink-wrap=false -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
+%struct.S = type { i8 }
+
+@sg = internal thread_local global %struct.S zeroinitializer, align 1
+@__dso_handle = external global i8
+@__tls_guard = internal thread_local unnamed_addr global i1 false
+@sum1 = internal thread_local global i32 0, align 4
+
+declare void @_ZN1SC1Ev(%struct.S*)
+declare void @_ZN1SD1Ev(%struct.S*)
+declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
+
+; CHECK-LABEL: _ZTW2sg
+define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
+ %.b.i = load i1, i1* @__tls_guard, align 1
+; CHECK: bc 12, 1, [[BB_end:.?LBB0_[0-9]+]]
+ br i1 %.b.i, label %__tls_init.exit, label %init.i
+
+init.i:
+; CHECK: Folded Spill
+ store i1 true, i1* @__tls_guard, align 1
+ tail call void @_ZN1SC1Ev(%struct.S* nonnull @sg) #2
+; CHECK: bl _ZN1SC1Ev
+ %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (void (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle) #2
+; CHECK: Folded Reload
+; CHECK: _tlv_atexit
+ br label %__tls_init.exit
+
+; CHECK: [[BB_end]]:
+__tls_init.exit:
+ ret %struct.S* @sg
+}
+
+; CHECK-LABEL: _ZTW4sum1
+define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind {
+ ret i32* @sum1
+}
+
+define cxx_fast_tlscc i32* @_ZTW4sum2() #0 {
+ ret i32* @sum1
+}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
\ No newline at end of file
diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index b636cff0f205..94c8a88316a7 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll
@@ -17,10 +17,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1", isOptimized: true, emissionKind: 0, file: !21, enums: !1, retainedTypes: !1, subprograms: !3, globals: !1, imports: !1)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1", isOptimized: true, emissionKind: FullDebug, file: !21, enums: !1, retainedTypes: !1, globals: !1, imports: !1)
!1 = !{}
-!3 = !{!5}
-!5 = distinct !DISubprogram(name: "main", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !21, scope: null, type: !7, variables: !13)
+!5 = distinct !DISubprogram(name: "main", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, file: !21, scope: null, type: !7, variables: !13)
!6 = !DIFile(filename: "dbg.c", directory: "/src")
!7 = !DISubroutineType(types: !8)
!8 = !{!9, !9, !10}
diff --git a/test/CodeGen/PowerPC/direct-move-profit.ll b/test/CodeGen/PowerPC/direct-move-profit.ll
new file mode 100644
index 000000000000..0fa8c776be33
--- /dev/null
+++ b/test/CodeGen/PowerPC/direct-move-profit.ll
@@ -0,0 +1,83 @@
+; RUN: llc -O2 -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
+
+; Function Attrs: norecurse nounwind
+define void @test1(float* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readnone %c, i32 signext %n) #0 {
+
+; CHECK-LABEL: test1
+
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4, !tbaa !1
+ %conv = sitofp i32 %0 to float
+ %mul = fmul float %conv, 0x4002916880000000
+ %arrayidx2 = getelementptr inbounds float, float* %a, i64 %idxprom
+ store float %mul, float* %arrayidx2, align 4, !tbaa !5
+ ret void
+
+; CHECK-NOT: mtvsrwa
+; CHECK-NOT: mtfprwa
+; CHECK: lxsiwax [[REG:[0-9]+]], {{.*}}
+; CHECK-NOT: mtvsrwa
+; CHECK-NOT: mtfprwa
+; CHECK: xscvsxdsp {{.*}}, [[REG]]
+; CHECK-NOT: mtvsrwa
+; CHECK-NOT: mtfprwa
+; CHECK: blr
+
+}
+
+; Function Attrs: norecurse nounwind readonly
+define float @test2(i32* nocapture readonly %b) #0 {
+
+; CHECK-LABEL: test2
+
+entry:
+ %0 = load i32, i32* %b, align 4, !tbaa !1
+ %conv = sitofp i32 %0 to float
+ %mul = fmul float %conv, 0x40030A3D80000000
+ ret float %mul
+
+; CHECK-NOT: mtvsrwa
+; CHECK-NOT: mtfprwa
+; CHECK: lxsiwax [[REG:[0-9]+]], {{.*}}
+; CHECK-NOT: mtvsrwa
+; CHECK-NOT: mtfprwa
+; CHECK: xscvsxdsp {{.*}}, [[REG]]
+; CHECK-NOT: mtvsrwa
+; CHECK-NOT: mtfprwa
+; CHECK: blr
+
+}
+
+; Function Attrs: norecurse nounwind
+define void @test3(float* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 signext %n) #0 {
+
+; CHECK-LABEL: test3
+
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4, !tbaa !1
+ %conv = sitofp i32 %0 to float
+ %mul = fmul float %conv, 0x4002916880000000
+ %arrayidx2 = getelementptr inbounds float, float* %a, i64 %idxprom
+ store float %mul, float* %arrayidx2, align 4, !tbaa !5
+ %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %idxprom
+ %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+ %add = add nsw i32 %1, %0
+ store i32 %add, i32* %arrayidx6, align 4, !tbaa !1
+ ret void
+
+; CHECK: mtvsrwa
+; CHECK: blr
+
+}
+
+!0 = !{!"clang version 3.9.0 (http://llvm.org/git/clang.git b88a395e7ba26c0fb96cd99a2a004d76f4f41d0c) (http://llvm.org/git/llvm.git 1ac3fbac0f5b037c17c0b0f9d271c32c4d7ca1b5)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"float", !3, i64 0}
diff --git a/test/CodeGen/PowerPC/ec-input.ll b/test/CodeGen/PowerPC/ec-input.ll
index a57f69be12da..6ef93d268b6b 100644
--- a/test/CodeGen/PowerPC/ec-input.ll
+++ b/test/CodeGen/PowerPC/ec-input.ll
@@ -17,7 +17,7 @@ target triple = "powerpc64-bgq-linux"
declare void @fprintf(%struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713* nocapture, i8* nocapture readonly, ...) #0
; Function Attrs: inlinehint nounwind
-define void @_ZN4PAMI6Device2MU15ResourceManager46calculatePerCoreMUResourcesBasedOnAvailabilityEv() #1 align 2 {
+define void @_ZN4PAMI6Device2MU15ResourceManager46calculatePerCoreMUResourcesBasedOnAvailabilityEv(i32 %inp32, i64 %inp64) #1 align 2 {
; CHECK-LABEL: @_ZN4PAMI6Device2MU15ResourceManager46calculatePerCoreMUResourcesBasedOnAvailabilityEv
; CHECK: sc
@@ -32,7 +32,7 @@ for.cond2.preheader: ; preds = %if.end23.3, %entry
%minFreeBatIdsPerCore.097 = phi i64 [ 32, %entry ], [ %numFreeBatIdsInGroup.0.minFreeBatIdsPerCore.0, %if.end23.3 ]
%minFreeRecFifosPerCore.096 = phi i64 [ 16, %entry ], [ %minFreeRecFifosPerCore.1, %if.end23.3 ]
%minFreeInjFifosPerCore.095 = phi i64 [ 32, %entry ], [ %numFreeInjFifosInGroup.0.minFreeInjFifosPerCore.0, %if.end23.3 ]
- %cmp5 = icmp eq i32 undef, 0
+ %cmp5 = icmp eq i32 %inp32, 0
br i1 %cmp5, label %if.end, label %if.then
if.then: ; preds = %if.end23.2, %if.end23.1, %if.end23, %for.cond2.preheader
@@ -41,7 +41,7 @@ if.then: ; preds = %if.end23.2, %if.end
if.end: ; preds = %for.cond2.preheader
%1 = load i32, i32* %numFreeResourcesInSubgroup, align 4
%conv = zext i32 %1 to i64
- %2 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1034, i64 %indvars.iv, i64 %0, i64 undef) #2
+ %2 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1034, i64 %indvars.iv, i64 %0, i64 %inp64) #2
%cmp10 = icmp eq i32 0, 0
br i1 %cmp10, label %if.end14, label %if.then11
@@ -50,11 +50,11 @@ if.then11: ; preds = %if.end.3, %if.end.2
if.end14: ; preds = %if.end
%3 = load i32, i32* %numFreeResourcesInSubgroup, align 4
- %cmp19 = icmp eq i32 undef, 0
+ %cmp19 = icmp eq i32 %inp32, 0
br i1 %cmp19, label %if.end23, label %if.then20
if.then20: ; preds = %if.end14.3, %if.end14.2, %if.end14.1, %if.end14
- %conv4.i65.lcssa = phi i32 [ undef, %if.end14 ], [ 0, %if.end14.1 ], [ %conv4.i65.2, %if.end14.2 ], [ %conv4.i65.3, %if.end14.3 ]
+ %conv4.i65.lcssa = phi i32 [ %inp32, %if.end14 ], [ 0, %if.end14.1 ], [ %conv4.i65.2, %if.end14.2 ], [ %conv4.i65.3, %if.end14.3 ]
call void (%struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i8*, ...) @fprintf(%struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713* undef, i8* getelementptr inbounds ([121 x i8], [121 x i8]* @.str236, i64 0, i64 0), i32 signext 2503) #3
call void (%struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i8*, ...) @fprintf(%struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713* undef, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str294, i64 0, i64 0), i32 signext %conv4.i65.lcssa) #3
unreachable
@@ -63,7 +63,7 @@ if.end23: ; preds = %if.end14
%conv15 = zext i32 %3 to i64
%4 = load i32, i32* %numFreeResourcesInSubgroup, align 4
%conv24 = zext i32 %4 to i64
- %5 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1033, i64 0, i64 %0, i64 undef) #2
+ %5 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1033, i64 0, i64 %0, i64 %inp64) #2
%cmp5.1 = icmp eq i32 0, 0
br i1 %cmp5.1, label %if.end.1, label %if.then
@@ -74,8 +74,8 @@ if.end.1: ; preds = %if.end23
%6 = load i32, i32* %numFreeResourcesInSubgroup, align 4
%conv.1 = zext i32 %6 to i64
%add.1 = add nuw nsw i64 %conv.1, %conv
- %7 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1034, i64 0, i64 %0, i64 undef) #2
- %cmp10.1 = icmp eq i32 undef, 0
+ %7 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1034, i64 0, i64 %0, i64 %inp64) #2
+ %cmp10.1 = icmp eq i32 %inp32, 0
br i1 %cmp10.1, label %if.end14.1, label %if.then11
if.end14.1: ; preds = %if.end.1
@@ -89,20 +89,20 @@ if.end23.1: ; preds = %if.end14.1
%9 = load i32, i32* %numFreeResourcesInSubgroup, align 4
%conv24.1 = zext i32 %9 to i64
%add25.1 = add nuw nsw i64 %conv24.1, %conv24
- %cmp5.2 = icmp eq i32 undef, 0
+ %cmp5.2 = icmp eq i32 %inp32, 0
br i1 %cmp5.2, label %if.end.2, label %if.then
if.end.2: ; preds = %if.end23.1
%10 = load i32, i32* %numFreeResourcesInSubgroup, align 4
%conv.2 = zext i32 %10 to i64
%add.2 = add nuw nsw i64 %conv.2, %add.1
- %11 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1034, i64 undef, i64 %0, i64 undef) #2
+ %11 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1034, i64 %inp64, i64 %0, i64 %inp64) #2
%cmp10.2 = icmp eq i32 0, 0
br i1 %cmp10.2, label %if.end14.2, label %if.then11
if.end14.2: ; preds = %if.end.2
%12 = load i32, i32* %numFreeResourcesInSubgroup, align 4
- %13 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1035, i64 undef, i64 %0, i64 0) #2
+ %13 = call { i64, i64, i64, i64 } asm sideeffect "sc", "=&{r0},=&{r3},=&{r4},=&{r5},{r0},{r3},{r4},{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1035, i64 %inp64, i64 %0, i64 0) #2
%asmresult1.i64.2 = extractvalue { i64, i64, i64, i64 } %13, 1
%conv4.i65.2 = trunc i64 %asmresult1.i64.2 to i32
%cmp19.2 = icmp eq i32 %conv4.i65.2, 0
@@ -121,7 +121,7 @@ if.end.3: ; preds = %if.end23.2
%15 = load i32, i32* %numFreeResourcesInSubgroup, align 4
%conv.3 = zext i32 %15 to i64
%add.3 = add nuw nsw i64 %conv.3, %add.2
- %cmp10.3 = icmp eq i32 undef, 0
+ %cmp10.3 = icmp eq i32 %inp32, 0
br i1 %cmp10.3, label %if.end14.3, label %if.then11
if.end14.3: ; preds = %if.end.3
diff --git a/test/CodeGen/PowerPC/ext-bool-trunc-repl.ll b/test/CodeGen/PowerPC/ext-bool-trunc-repl.ll
new file mode 100644
index 000000000000..df67ecd7a632
--- /dev/null
+++ b/test/CodeGen/PowerPC/ext-bool-trunc-repl.ll
@@ -0,0 +1,38 @@
+; RUN: llc -O0 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@c = external global i32, align 4
+@d = external global [2 x i32], align 4
+
+; Function Attrs: norecurse nounwind
+define void @fn2() #0 {
+; CHECK-LABEL: @fn2
+
+ br i1 undef, label %1, label %10
+
+; <label>:1: ; preds = %0
+ br i1 undef, label %3, label %2
+
+; <label>:2: ; preds = %2, %1
+ br i1 undef, label %3, label %2
+
+; <label>:3: ; preds = %2, %1
+ br i1 undef, label %8, label %4
+
+; <label>:4: ; preds = %4, %3
+ %5 = phi i64 [ %6, %4 ], [ undef, %3 ]
+ %6 = and i64 %5, and (i64 and (i64 and (i64 and (i64 and (i64 and (i64 and (i64 sext (i32 select (i1 icmp slt (i16 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i16), i16 0), i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 lshr (i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 6)) to i64), i64 sext (i32 select (i1 icmp slt (i16 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i16), i16 0), i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 lshr (i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 6)) to i64)), i64 sext (i32 select (i1 icmp slt (i16 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i16), i16 0), i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 lshr (i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 6)) to i64)), i64 sext (i32 select (i1 icmp slt (i16 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i16), i16 0), i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 lshr (i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 6)) to i64)), i64 sext (i32 select (i1 icmp slt (i16 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i16), i16 0), i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 lshr (i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 6)) to i64)), i64 sext (i32 select (i1 icmp slt (i16 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i16), i16 0), i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 lshr (i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 6)) to i64)), i64 sext (i32 select (i1 icmp slt (i16 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i16), i16 0), i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 lshr (i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 6)) to i64)), i64 sext (i32 select (i1 icmp slt (i16 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i16), i16 0), i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 lshr (i32 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @d, i64 0, i64 1), i32* @c) to i32), i32 6)) to i64))
+ %7 = icmp slt i32 undef, 6
+ br i1 %7, label %4, label %8
+
+; <label>:8: ; preds = %4, %3
+ %9 = phi i64 [ undef, %3 ], [ %6, %4 ]
+ br label %10
+
+; <label>:10: ; preds = %8, %0
+ ret void
+}
+
+attributes #0 = { norecurse nounwind "target-cpu"="ppc64le" }
+
diff --git a/test/CodeGen/PowerPC/fabs.ll b/test/CodeGen/PowerPC/fabs.ll
index 36aa23d03550..5a17c838f1b0 100644
--- a/test/CodeGen/PowerPC/fabs.ll
+++ b/test/CodeGen/PowerPC/fabs.ll
@@ -1,7 +1,30 @@
-; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin | grep "fabs f1, f1"
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin | FileCheck %s
define double @fabs(double %f) {
-entry:
- %tmp2 = tail call double @fabs( double %f ) readnone ; <double> [#uses=1]
- ret double %tmp2
+; CHECK-LABEL: fabs:
+; CHECK: ; BB#0:
+; CHECK-NEXT: fabs f1, f1
+; CHECK-NEXT: blr
+;
+ %t = tail call double @fabs( double %f ) readnone
+ ret double %t
}
+
+define float @bitcast_fabs(float %x) {
+; CHECK-LABEL: bitcast_fabs:
+; CHECK: ; BB#0:
+; CHECK-NEXT: stfs f1, -8(r1)
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: lwz r2, -8(r1)
+; CHECK-NEXT: clrlwi r2, r2, 1
+; CHECK-NEXT: stw r2, -4(r1)
+; CHECK-NEXT: lfs f1, -4(r1)
+; CHECK-NEXT: blr
+;
+ %bc1 = bitcast float %x to i32
+ %and = and i32 %bc1, 2147483647
+ %bc2 = bitcast i32 %and to float
+ ret float %bc2
+}
+
diff --git a/test/CodeGen/PowerPC/fast-isel-fcmp-nan.ll b/test/CodeGen/PowerPC/fast-isel-fcmp-nan.ll
new file mode 100644
index 000000000000..4390b938aeab
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-fcmp-nan.ll
@@ -0,0 +1,187 @@
+; RUN: llc -mtriple powerpc64le-unknown-linux-gnu -fast-isel -O0 < %s | FileCheck %s
+
+define i1 @TestULT(double %t0) {
+; CHECK-LABEL: TestULT:
+; CHECK: mcrf
+; CHECK: blr
+entry:
+ %t1 = fcmp ult double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestULE(double %t0) {
+; CHECK-LABEL: TestULE:
+; CHECK: fcmpu
+; CHECK-NEXT: ble
+; CHECK: blr
+entry:
+ %t1 = fcmp ule double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestUNE(double %t0) {
+; CHECK-LABEL: TestUNE:
+; CHECK: fcmpu
+; CHECK-NEXT: bne
+; CHECK: blr
+entry:
+ %t1 = fcmp une double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestUEQ(double %t0) {
+; CHECK-LABEL: TestUEQ:
+; CHECK: mcrf
+; CHECK: blr
+entry:
+ %t1 = fcmp ueq double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestUGT(double %t0) {
+; CHECK-LABEL: TestUGT:
+; CHECK: mcrf
+; CHECK: blr
+entry:
+ %t1 = fcmp ugt double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestUGE(double %t0) {
+; CHECK-LABEL: TestUGE:
+; CHECK: fcmpu
+; CHECK-NEXT: bge
+; CHECK: blr
+entry:
+ %t1 = fcmp uge double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestOLT(double %t0) {
+; CHECK-LABEL: TestOLT:
+; CHECK: fcmpu
+; CHECK-NEXT: blt
+; CHECK: blr
+entry:
+ %t1 = fcmp olt double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestOLE(double %t0) {
+; CHECK-LABEL: TestOLE:
+; CHECK: mcrf
+; CHECK: blr
+entry:
+ %t1 = fcmp ole double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestONE(double %t0) {
+; CHECK-LABEL: TestONE:
+; CHECK: mcrf
+; CHECK: blr
+entry:
+ %t1 = fcmp one double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestOEQ(double %t0) {
+; CHECK-LABEL: TestOEQ:
+; CHECK: fcmpu
+; CHECK-NEXT: beq
+; CHECK: blr
+entry:
+ %t1 = fcmp oeq double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestOGT(double %t0) {
+; CHECK-LABEL: TestOGT:
+; CHECK: fcmpu
+; CHECK-NEXT: bgt
+; CHECK: blr
+entry:
+ %t1 = fcmp ogt double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
+
+define i1 @TestOGE(double %t0) {
+; CHECK-LABEL: TestOGE:
+; CHECK: mcrf
+; CHECK: blr
+entry:
+ %t1 = fcmp oge double %t0, 0.000000e+00
+ br i1 %t1, label %good, label %bad
+
+bad:
+ ret i1 false
+
+good:
+ ret i1 true
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-fpconv.ll b/test/CodeGen/PowerPC/fast-isel-fpconv.ll
new file mode 100644
index 000000000000..eb14cf2aa769
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-fpconv.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple powerpc64-unknown-linux-gnu -fast-isel -O0 < %s | FileCheck %s
+
+; The second fctiwz would use an incorrect input register due to wrong handling
+; of COPY_TO_REGCLASS in the FastISel pass. Verify that this is fixed.
+
+declare void @func(i32, i32)
+
+define void @test() {
+; CHECK-LABEL: test:
+; CHECK: bl func
+; CHECK-NEXT: nop
+; CHECK: lfs [[REG:[0-9]+]],
+; CHECK: fctiwz {{[0-9]+}}, [[REG]]
+; CHECK: bl func
+; CHECK-NEXT: nop
+
+ %memPos = alloca float, align 4
+ store float 1.500000e+01, float* %memPos
+ %valPos = load float, float* %memPos
+
+ %memNeg = alloca float, align 4
+ store float -1.500000e+01, float* %memNeg
+ %valNeg = load float, float* %memNeg
+
+ %FloatToIntPos = fptosi float %valPos to i32
+ call void @func(i32 15, i32 %FloatToIntPos)
+
+ %FloatToIntNeg = fptosi float %valNeg to i32
+ call void @func(i32 -15, i32 %FloatToIntNeg)
+
+ ret void
+}
+
diff --git a/test/CodeGen/PowerPC/fast-isel-i64offset.ll b/test/CodeGen/PowerPC/fast-isel-i64offset.ll
new file mode 100644
index 000000000000..00278d68f638
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-i64offset.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple powerpc64-unknown-linux-gnu -fast-isel -O0 < %s | FileCheck %s
+
+; Verify that pointer offsets larger than 32 bits work correctly.
+
+define void @test(i32* %array) {
+; CHECK-LABEL: test:
+; CHECK-NOT: li {{[0-9]+}}, -8
+ %element = getelementptr i32, i32* %array, i64 2147483646
+ store i32 1234, i32* %element
+ ret void
+}
+
diff --git a/test/CodeGen/PowerPC/fdiv-combine.ll b/test/CodeGen/PowerPC/fdiv-combine.ll
index d3dc3fe913fd..3aac7032c69e 100644
--- a/test/CodeGen/PowerPC/fdiv-combine.ll
+++ b/test/CodeGen/PowerPC/fdiv-combine.ll
@@ -9,8 +9,8 @@ target triple = "powerpc64-unknown-linux-gnu"
define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
; CHECK-LABEL: three_fdiv_double:
-; CHECK: fdiv
-; CHECK-NEXT-NOT: fdiv
+; CHECK: fdiv {{[0-9]}}
+; CHECK-NOT: fdiv
; CHECK: fmul
; CHECK: fmul
; CHECK: fmul
@@ -23,9 +23,9 @@ define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
define void @two_fdiv_double(double %D, double %a, double %b) #0 {
; CHECK-LABEL: two_fdiv_double:
-; CHECK: fdiv
-; CHECK: fdiv
-; CHECK-NEXT-NOT: fmul
+; CHECK: fdiv {{[0-9]}}
+; CHECK: fdiv {{[0-9]}}
+; CHECK-NOT: fmul
%div = fdiv double %a, %D
%div1 = fdiv double %b, %D
tail call void @foo_2d(double %div, double %div1)
diff --git a/test/CodeGen/PowerPC/fma-assoc.ll b/test/CodeGen/PowerPC/fma-assoc.ll
index 3044dd09128c..4a2ca6010f65 100644
--- a/test/CodeGen/PowerPC/fma-assoc.ll
+++ b/test/CodeGen/PowerPC/fma-assoc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s
+; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX %s
define double @test_FMADD_ASSOC1(double %A, double %B, double %C,
double %D, double %E) {
diff --git a/test/CodeGen/PowerPC/fma-ext.ll b/test/CodeGen/PowerPC/fma-ext.ll
index da7c34ccb9d8..fc3489def7cd 100644
--- a/test/CodeGen/PowerPC/fma-ext.ll
+++ b/test/CodeGen/PowerPC/fma-ext.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s
+; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX %s
define double @test_FMADD_EXT1(float %A, float %B, double %C) {
%D = fmul float %A, %B ; <float> [#uses=1]
diff --git a/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll b/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll
new file mode 100644
index 000000000000..08d1b1ba883f
--- /dev/null
+++ b/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll
@@ -0,0 +1,36 @@
+; RUN: llc -fp-contract=fast -O2 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; CHECK-LABEL: f
+; CHECK-NOT: xsmaddmsp [[REG:[0-9]+]], [[REG]], {{[0-9]+}}
+define float @f(float %xf) #0 {
+ %1 = fmul float %xf, %xf
+ %2 = fmul float %1, 0x3F43FB0140000000
+ %3 = fsub float 1.000000e+00, %2
+ %4 = fmul float %1, %3
+ %5 = fmul float %4, 0x3F461C5440000000
+ %6 = fsub float 1.000000e+00, %5
+ %7 = fmul float %1, %6
+ %8 = fmul float %7, 0x3F4899C100000000
+ %9 = fsub float 1.000000e+00, %8
+ %10 = fmul float %1, %9
+ %11 = fmul float %10, 0x3F4B894020000000
+ %12 = fsub float 1.000000e+00, %11
+ %13 = fmul float %1, %12
+ %14 = fmul float %13, 0x3F4F07C200000000
+ %15 = fsub float 1.000000e+00, %14
+ %16 = fmul float %1, %15
+ %17 = fmul float %16, 0x3F519E0120000000
+ %18 = fsub float 1.000000e+00, %17
+ %19 = fmul float %1, %18
+ %20 = fmul float %19, 0x3F542D6620000000
+ %21 = fsub float 1.000000e+00, %20
+ %22 = fmul float %1, %21
+ %23 = fmul float %22, 0x3F5756CAC0000000
+ %24 = fsub float 1.000000e+00, %23
+ %25 = fmul float %1, %24
+ ret float %25
+}
+
+attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/fma-mutate.ll b/test/CodeGen/PowerPC/fma-mutate.ll
index 1a391f4c2305..befd2d2e58b5 100644
--- a/test/CodeGen/PowerPC/fma-mutate.ll
+++ b/test/CodeGen/PowerPC/fma-mutate.ll
@@ -3,7 +3,7 @@
; same as the FMA target register. The second one is legal. The third
; one doesn't fit the feeding-copy pattern.
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=+vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=+vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/fma.ll b/test/CodeGen/PowerPC/fma.ll
index 9cfef398edfd..79c5e11d05c5 100644
--- a/test/CodeGen/PowerPC/fma.ll
+++ b/test/CodeGen/PowerPC/fma.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mcpu=pwr8 | FileCheck -check-prefix=CHECK-P8 %s
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -fp-contract=fast -mcpu=pwr8 | FileCheck -check-prefix=CHECK-P8 %s
+; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mcpu=pwr8 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-P8 %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -fp-contract=fast -mcpu=pwr8 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-P8 %s
declare double @dummy1(double) #0
declare double @dummy2(double, double) #0
diff --git a/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll b/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll
index 7742ffe33150..569e4e6cb923 100644
--- a/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll
+++ b/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll
@@ -2,9 +2,9 @@
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-bgq-linux"
-define linkonce_odr double @test1() {
+define linkonce_odr double @test1(ppc_fp128 %input) {
entry:
- %conv6.i.i = fptosi ppc_fp128 undef to i64
+ %conv6.i.i = fptosi ppc_fp128 %input to i64
%conv.i = sitofp i64 %conv6.i.i to double
ret double %conv.i
diff --git a/test/CodeGen/PowerPC/hello-reloc.s b/test/CodeGen/PowerPC/hello-reloc.s
index 12f4315f675a..bbf1e7cacbd4 100644
--- a/test/CodeGen/PowerPC/hello-reloc.s
+++ b/test/CodeGen/PowerPC/hello-reloc.s
@@ -2,7 +2,7 @@
; which is responsible for writing mach-o relocation entries for (PIC)
; PowerPC objects.
-; RUN: llvm-mc -filetype=obj -relocation-model=pic -mcpu=g4 -triple=powerpc-apple-darwin8 %s -o - | llvm-readobj -r --expand-relocs | FileCheck -check-prefix=DARWIN-G4-DUMP %s
+; RUN: llvm-mc -filetype=obj -mcpu=g4 -triple=powerpc-apple-darwin8 %s -o - | llvm-readobj -r --expand-relocs | FileCheck -check-prefix=DARWIN-G4-DUMP %s
.machine ppc7400
.section __TEXT,__textcoal_nt,coalesced,pure_instructions
diff --git a/test/CodeGen/PowerPC/hidden-vis-2.ll b/test/CodeGen/PowerPC/hidden-vis-2.ll
index 3eb9dbd21ade..4ef0f708b4f6 100644
--- a/test/CodeGen/PowerPC/hidden-vis-2.ll
+++ b/test/CodeGen/PowerPC/hidden-vis-2.ll
@@ -1,12 +1,19 @@
-; RUN: llc < %s -mtriple=powerpc-apple-darwin9 | grep non_lazy_ptr | count 6
+; RUN: llc < %s -mtriple=powerpc-apple-darwin9 | FileCheck %s
-@x = external hidden global i32 ; <i32*> [#uses=1]
-@y = extern_weak hidden global i32 ; <i32*> [#uses=1]
+; CHECK: lis r2, ha16(L_x$non_lazy_ptr)
+; CHECK: lis r3, ha16(L_y$non_lazy_ptr)
+; CHECK: lwz r2, lo16(L_x$non_lazy_ptr)(r2)
+; CHECK: lwz r3, lo16(L_y$non_lazy_ptr)(r3)
+; CHECK: L_x$non_lazy_ptr:
+; CHECK: L_y$non_lazy_ptr:
+
+@x = external hidden global i32
+@y = extern_weak hidden global i32
define i32 @t() nounwind readonly {
entry:
- %0 = load i32, i32* @x, align 4 ; <i32> [#uses=1]
- %1 = load i32, i32* @y, align 4 ; <i32> [#uses=1]
- %2 = add i32 %1, %0 ; <i32> [#uses=1]
- ret i32 %2
+ %0 = load i32, i32* @x, align 4
+ %1 = load i32, i32* @y, align 4
+ %2 = add i32 %1, %0
+ ret i32 %2
}
diff --git a/test/CodeGen/PowerPC/indirect-hidden.ll b/test/CodeGen/PowerPC/indirect-hidden.ll
new file mode 100644
index 000000000000..5ef8b6df4b09
--- /dev/null
+++ b/test/CodeGen/PowerPC/indirect-hidden.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=powerpc-apple-darwin < %s | FileCheck %s
+
+@a = external hidden global i32
+@b = external global i32
+
+define i32* @get_a() {
+ ret i32* @a
+}
+
+define i32* @get_b() {
+ ret i32* @b
+}
+
+; CHECK: .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: L_a$non_lazy_ptr:
+; CHECK-NEXT: .indirect_symbol _a
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: L_b$non_lazy_ptr:
+; CHECK-NEXT: .indirect_symbol _b
+; CHECK-NEXT: .long 0
diff --git a/test/CodeGen/PowerPC/inline-asm-scalar-to-vector-error.ll b/test/CodeGen/PowerPC/inline-asm-scalar-to-vector-error.ll
new file mode 100644
index 000000000000..3f1d9d32becd
--- /dev/null
+++ b/test/CodeGen/PowerPC/inline-asm-scalar-to-vector-error.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8a -mattr=+altivec %s -o - 2>&1 | FileCheck %s
+
+define hidden void @f(i32 %x) {
+ ; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
+ tail call void asm sideeffect "nop", "{v1}"(i32 %x) nounwind
+
+ ; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
+ tail call void asm sideeffect "nop", "{vsl1}"(i32 %x) nounwind
+
+ ; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
+ tail call void asm sideeffect "nop", "{vsh1}"(i32 %x) nounwind
+
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/lbzux.ll b/test/CodeGen/PowerPC/lbzux.ll
index 4bd9cb6ab18a..a8ca639e6880 100644
--- a/test/CodeGen/PowerPC/lbzux.ll
+++ b/test/CodeGen/PowerPC/lbzux.ll
@@ -2,7 +2,7 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "powerpc64-unknown-linux-gnu"
; RUN: llc -disable-ppc-unaligned < %s | FileCheck %s
-define fastcc void @allocateSpace(i1 %cond1, i1 %cond2) nounwind {
+define fastcc void @allocateSpace(i1 %cond1, i1 %cond2, i32 %offset) nounwind {
entry:
%0 = load i8*, i8** undef, align 8
br i1 undef, label %return, label %lor.lhs.false
@@ -19,14 +19,13 @@ if.then15: ; preds = %if.end7
while.cond: ; preds = %while.body, %if.then15
%idxprom17 = sext i32 0 to i64
%arrayidx18 = getelementptr inbounds i8, i8* %0, i64 %idxprom17
- %or = or i32 undef, undef
br i1 %cond1, label %if.end71, label %while.body
while.body: ; preds = %while.cond
br i1 %cond2, label %while.cond, label %if.then45
if.then45: ; preds = %while.body
- %idxprom48139 = zext i32 %or to i64
+ %idxprom48139 = zext i32 %offset to i64
%arrayidx49 = getelementptr inbounds i8, i8* %0, i64 %idxprom48139
%1 = bitcast i8* %arrayidx49 to i16*
%2 = bitcast i8* %arrayidx18 to i16*
diff --git a/test/CodeGen/PowerPC/load-two-flts.ll b/test/CodeGen/PowerPC/load-two-flts.ll
new file mode 100644
index 000000000000..270a852b1b04
--- /dev/null
+++ b/test/CodeGen/PowerPC/load-two-flts.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+ %v2 = load i64, i64* %ref.tmp, align 8
+ %v3 = lshr i64 %v2, 32
+ %v4 = trunc i64 %v3 to i32
+ %v5 = bitcast i32 %v4 to float
+ %v6 = trunc i64 %v2 to i32
+ %v7 = bitcast i32 %v6 to float
+ %mul_ad.i.i = fmul fast float %v5, %v1
+ %mul_bc.i.i = fmul fast float %v7, %v0
+ %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+ %mul_ac.i.i = fmul fast float %v5, %v0
+ %mul_bd.i.i = fmul fast float %v7, %v1
+ %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+ store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+ store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+ ret void
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 0(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 0(5)
+; CHECK: blr
+}
+
+define i64* @_Z4testSt7complexIfE_idx(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+ %r = getelementptr i64, i64* %ref.tmp, i64 1
+ %v2 = load i64, i64* %r, align 8
+ %v3 = lshr i64 %v2, 32
+ %v4 = trunc i64 %v3 to i32
+ %v5 = bitcast i32 %v4 to float
+ %v6 = trunc i64 %v2 to i32
+ %v7 = bitcast i32 %v6 to float
+ %mul_ad.i.i = fmul fast float %v5, %v1
+ %mul_bc.i.i = fmul fast float %v7, %v0
+ %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+ %mul_ac.i.i = fmul fast float %v5, %v0
+ %mul_bd.i.i = fmul fast float %v7, %v1
+ %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+ store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+ store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+ ret i64* %r
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 8(5)
+; CHECK-NOT: ldu {{[0-9]+}}, 8(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfsu {{[0-9]+}}, 8(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/load-v4i8-improved.ll b/test/CodeGen/PowerPC/load-v4i8-improved.ll
new file mode 100644
index 000000000000..d03ff06bbd93
--- /dev/null
+++ b/test/CodeGen/PowerPC/load-v4i8-improved.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \
+; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck \
+; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s \
+; RUN: --check-prefix=CHECK-BE
+
+define <16 x i8> @test(i32* %s, i32* %t) {
+entry:
+ %0 = bitcast i32* %s to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 4
+ %2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <16 x i8> %2
+; CHECK-LABEL: test
+; CHECK: lwz [[GPR:[0-9]+]], 0(3)
+; CHECK: mtvsrd [[VSR:[0-9]+]], [[GPR]]
+; CHECK: xxswapd [[SWP:[0-9]+]], [[VSR]]
+; CHECK: xxspltw 34, [[SWP]], 3
+; CHECK-BE-LABEL: test
+; CHECK-BE: lwz [[GPR:[0-9]+]], 0(3)
+; CHECK-BE: sldi [[SHL:[0-9]+]], [[GPR]], 32
+; CHECK-BE: mtvsrd [[VSR:[0-9]+]], [[SHL]]
+; CHECK-BE: xxspltw 34, [[VSR]], 0
+}
diff --git a/test/CodeGen/PowerPC/lsr-postinc-pos.ll b/test/CodeGen/PowerPC/lsr-postinc-pos.ll
index 7831df154606..17b05b219c72 100644
--- a/test/CodeGen/PowerPC/lsr-postinc-pos.ll
+++ b/test/CodeGen/PowerPC/lsr-postinc-pos.ll
@@ -3,27 +3,27 @@
; The icmp is a post-inc use, and the increment is in %bb11, but the
; scevgep needs to be inserted in %bb so that it is dominated by %t.
-; CHECK: %t = load i8*, i8** undef
+; CHECK: %t = load i8*, i8** %inp
; CHECK: %scevgep = getelementptr i8, i8* %t, i32 %lsr.iv.next
-; CHECK: %c1 = icmp ult i8* %scevgep, undef
+; CHECK: %c1 = icmp ult i8* %scevgep, %inp2
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128-n32"
target triple = "powerpc-apple-darwin9"
-define void @foo() nounwind {
+define void @foo(i8** %inp, i8* %inp2) nounwind {
entry:
br label %bb11
bb11:
%i = phi i32 [ 0, %entry ], [ %i.next, %bb ] ; <i32> [#uses=3]
%ii = shl i32 %i, 2 ; <i32> [#uses=1]
- %c0 = icmp eq i32 %i, undef ; <i1> [#uses=1]
+ %c0 = icmp eq i32 %i, 0 ; <i1> [#uses=1]
br i1 %c0, label %bb13, label %bb
bb:
- %t = load i8*, i8** undef, align 16 ; <i8*> [#uses=1]
+ %t = load i8*, i8** %inp, align 16 ; <i8*> [#uses=1]
%p = getelementptr i8, i8* %t, i32 %ii ; <i8*> [#uses=1]
- %c1 = icmp ult i8* %p, undef ; <i1> [#uses=1]
+ %c1 = icmp ult i8* %p, %inp2 ; <i1> [#uses=1]
%i.next = add i32 %i, 1 ; <i32> [#uses=1]
br i1 %c1, label %bb11, label %bb13
diff --git a/test/CodeGen/PowerPC/machine-combiner.ll b/test/CodeGen/PowerPC/machine-combiner.ll
index 93fb2020d530..ae9e2e8cf6a6 100644
--- a/test/CodeGen/PowerPC/machine-combiner.ll
+++ b/test/CodeGen/PowerPC/machine-combiner.ll
@@ -98,6 +98,7 @@ define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <
; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35
; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37
; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]]
+; CHECK-PWR: # kill
; CHECK-NEXT: blr
%t0 = fadd <4 x float> %x0, %x1
@@ -115,6 +116,7 @@ define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <
; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35
; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37
; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]]
+; CHECK-PWR: # kill
; CHECK-NEXT: blr
%t0 = fadd <4 x float> %x0, %x1
@@ -132,6 +134,7 @@ define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <
; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35
; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37
; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]]
+; CHECK-PWR: # kill
; CHECK-NEXT: blr
%t0 = fadd <4 x float> %x0, %x1
@@ -149,6 +152,7 @@ define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <
; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35
; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37
; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]]
+; CHECK-PWR: # kill
; CHECK-NEXT: blr
%t0 = fadd <4 x float> %x0, %x1
diff --git a/test/CodeGen/PowerPC/multi-return.ll b/test/CodeGen/PowerPC/multi-return.ll
new file mode 100644
index 000000000000..454d1e342212
--- /dev/null
+++ b/test/CodeGen/PowerPC/multi-return.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -O2 < %s | FileCheck %s
+
+; Verify that returning multiple return values in registers works,
+; both with fast-isel and regular isel.
+
+define { i32, i32, i32, i32 } @foo() nounwind {
+ %A1 = insertvalue { i32, i32, i32, i32 } undef, i32 1, 0
+ %A2 = insertvalue { i32, i32, i32, i32 } %A1, i32 2, 1
+ %A3 = insertvalue { i32, i32, i32, i32 } %A2, i32 3, 2
+ %A4 = insertvalue { i32, i32, i32, i32 } %A3, i32 4, 3
+ ret { i32, i32, i32, i32 } %A4
+}
+
+; CHECK-LABEL: foo:
+; CHECK: li 3, 1
+; CHECK: li 4, 2
+; CHECK: li 5, 3
+; CHECK: li 6, 4
+; CHECK: blr
+
diff --git a/test/CodeGen/PowerPC/no-rlwimi-trivial-commute.mir b/test/CodeGen/PowerPC/no-rlwimi-trivial-commute.mir
index 5c998d09a3db..76702ce50fd3 100644
--- a/test/CodeGen/PowerPC/no-rlwimi-trivial-commute.mir
+++ b/test/CodeGen/PowerPC/no-rlwimi-trivial-commute.mir
@@ -1,4 +1,4 @@
-# RUN: llc -start-after=dead-mi-elimination -stop-after=twoaddressinstruction -o /dev/null %s | FileCheck %s
+# RUN: llc -start-after=dead-mi-elimination -stop-after=twoaddressinstruction -o - %s | FileCheck %s
--- |
target datalayout = "E-m:e-i64:64-n32:64"
diff --git a/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll b/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll
new file mode 100644
index 000000000000..1e3de6457337
--- /dev/null
+++ b/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll
@@ -0,0 +1,23 @@
+; RUN: llc -print-before=peephole-opt -print-after=peephole-opt -mtriple=powerpc64-unknown-linux-gnu -o /dev/null 2>&1 < %s | FileCheck %s
+
+define signext i32 @fn1(i32 %baz) {
+ %1 = mul nsw i32 %baz, 208
+ %2 = zext i32 %1 to i64
+ %3 = shl i64 %2, 48
+ %4 = ashr exact i64 %3, 48
+; CHECK: ANDIo8 {{[^,]+}}, 65520, %CR0<imp-def,dead>;
+; CHECK: CMPLDI
+; CHECK: BCC
+
+; CHECK: ANDIo8 {{[^,]+}}, 65520, %CR0<imp-def>;
+; CHECK: COPY %CR0
+; CHECK: BCC
+ %5 = icmp eq i64 %4, 0
+ br i1 %5, label %foo, label %bar
+
+foo:
+ ret i32 1
+
+bar:
+ ret i32 0
+}
diff --git a/test/CodeGen/PowerPC/opt-sub-inst-cr0-live.mir b/test/CodeGen/PowerPC/opt-sub-inst-cr0-live.mir
new file mode 100644
index 000000000000..b835ce71c41d
--- /dev/null
+++ b/test/CodeGen/PowerPC/opt-sub-inst-cr0-live.mir
@@ -0,0 +1,143 @@
+# RUN: llc -start-after=machine-sink -stop-after=peephole-opt -mtriple=powerpc64-unknown-linux-gnu -o - %s | FileCheck %s
+
+--- |
+ ; ModuleID = '<stdin>'
+ source_filename = "<stdin>"
+ target datalayout = "E-m:e-i64:64-n32:64"
+ target triple = "powerpc64-unknown-linux-gnu"
+
+ ; Function Attrs: nounwind readnone
+ declare i128 @llvm.cttz.i128(i128, i1) #0
+
+ define void @fn1(i128, i128, i1) {
+ top:
+ br label %loop
+
+ loop: ; preds = %loop, %top
+ %v = phi i128 [ %3, %loop ], [ %0, %top ]
+ %u = phi i128 [ %3, %loop ], [ %1, %top ]
+ %s = sub i128 %v, %u
+ %3 = call i128 @llvm.cttz.i128(i128 %s, i1 false)
+ br label %loop
+ }
+
+ ; Function Attrs: nounwind
+ declare void @llvm.stackprotector(i8*, i8**) #1
+
+ attributes #0 = { nounwind readnone }
+ attributes #1 = { nounwind }
+
+...
+---
+name: fn1
+alignment: 2
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: false
+isSSA: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+registers:
+ - { id: 0, class: g8rc }
+ - { id: 1, class: g8rc }
+ - { id: 2, class: g8rc }
+ - { id: 3, class: g8rc }
+ - { id: 4, class: g8rc }
+ - { id: 5, class: g8rc }
+ - { id: 6, class: g8rc }
+ - { id: 7, class: g8rc }
+ - { id: 8, class: g8rc }
+ - { id: 9, class: g8rc }
+ - { id: 10, class: g8rc }
+ - { id: 11, class: g8rc }
+ - { id: 12, class: g8rc }
+ - { id: 13, class: g8rc }
+ - { id: 14, class: g8rc }
+ - { id: 15, class: g8rc_and_g8rc_nox0 }
+ - { id: 16, class: g8rc_and_g8rc_nox0 }
+ - { id: 17, class: g8rc }
+ - { id: 18, class: g8rc }
+ - { id: 19, class: g8rc }
+ - { id: 20, class: g8rc }
+ - { id: 21, class: g8rc }
+ - { id: 22, class: g8rc }
+ - { id: 23, class: g8rc }
+ - { id: 24, class: g8rc }
+ - { id: 25, class: crrc }
+ - { id: 26, class: g8rc_and_g8rc_nox0 }
+ - { id: 27, class: g8rc_and_g8rc_nox0 }
+liveins:
+ - { reg: '%x3', virtual-reg: '%6' }
+ - { reg: '%x4', virtual-reg: '%7' }
+ - { reg: '%x5', virtual-reg: '%8' }
+ - { reg: '%x6', virtual-reg: '%9' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.top:
+ successors: %bb.1.loop
+ liveins: %x3, %x4, %x5, %x6
+
+ %9 = COPY %x6
+ %8 = COPY %x5
+ %7 = COPY %x4
+ %6 = COPY %x3
+ %14 = COPY %9
+ %13 = COPY %8
+ %12 = COPY %7
+ %11 = COPY %6
+ %21 = LI8 128
+ %23 = LI8 64
+
+ bb.1.loop:
+ successors: %bb.2.loop, %bb.4
+
+ %0 = PHI %11, %bb.0.top, %4, %bb.3.loop
+ %1 = PHI %12, %bb.0.top, %5, %bb.3.loop
+ %2 = PHI %13, %bb.0.top, %4, %bb.3.loop
+ %3 = PHI %14, %bb.0.top, %5, %bb.3.loop
+ %15 = SUBFC8 %3, %1, implicit-def %carry
+ %16 = SUBFE8 %2, %0, implicit-def dead %carry, implicit %carry
+ %17 = ADDI8 %16, -1
+ %18 = ADDI8 %15, -1
+ %19 = ANDC8 killed %17, %16
+ %20 = ANDC8 killed %18, %15
+ %22 = CNTLZD killed %19
+ %24 = CNTLZD killed %20
+ %25 = CMPLDI %15, 0
+ BCC 76, %25, %bb.2.loop
+ ; CHECK: SUBFC8o %3, %1, implicit-def %carry, implicit-def %cr0
+ ; CHECK: COPY killed %cr0
+ ; CHECK: BCC
+
+ bb.4:
+ successors: %bb.3.loop
+
+ %27 = SUBF8 %24, %23
+ B %bb.3.loop
+
+ bb.2.loop:
+ successors: %bb.3.loop
+
+ %26 = SUBF8 %22, %21
+
+ bb.3.loop:
+ successors: %bb.1.loop
+
+ %5 = PHI %26, %bb.2.loop, %27, %bb.4
+ %4 = LI8 0
+ B %bb.1.loop
+
+...
diff --git a/test/CodeGen/PowerPC/optcmp.ll b/test/CodeGen/PowerPC/optcmp.ll
index d929eae20608..3f7522c0c801 100644
--- a/test/CodeGen/PowerPC/optcmp.ll
+++ b/test/CodeGen/PowerPC/optcmp.ll
@@ -72,13 +72,13 @@ define i64 @foold(i64 %a, i64 %b, i64* nocapture %c) #0 {
entry:
%sub = sub nsw i64 %b, %a
store i64 %sub, i64* %c, align 8
- %cmp = icmp eq i64 %a, %b
+ %cmp = icmp slt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
ret i64 %cond
; CHECK: @foold
; CHECK: subf. [[REG:[0-9]+]], 3, 4
-; CHECK: isel 3, 3, 4, 2
+; CHECK: isel 3, 3, 4, 1
; CHECK: std [[REG]], 0(5)
}
@@ -86,13 +86,13 @@ define i64 @foold2(i64 %a, i64 %b, i64* nocapture %c) #0 {
entry:
%sub = sub nsw i64 %a, %b
store i64 %sub, i64* %c, align 8
- %cmp = icmp eq i64 %a, %b
+ %cmp = icmp slt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
ret i64 %cond
; CHECK: @foold2
; CHECK: subf. [[REG:[0-9]+]], 4, 3
-; CHECK: isel 3, 3, 4, 2
+; CHECK: isel 3, 3, 4, 0
; CHECK: std [[REG]], 0(5)
}
diff --git a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
index 8da8df58a85c..b34518431339 100644
--- a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@@ -1102,7 +1102,7 @@ entry:
; CHECK: mfvsrwz 3, [[SHL]]
; CHECK: extsw 3, 3
; CHECK-LE-LABEL: @getsi0
-; CHECK-LE: xxsldwi [[SHL:[0-9]+]], 34, 34, 2
+; CHECK-LE: xxswapd [[SHL:[0-9]+]], 34
; CHECK-LE: mfvsrwz 3, [[SHL]]
; CHECK-LE: extsw 3, 3
}
@@ -1150,7 +1150,7 @@ entry:
%vecext = extractelement <4 x i32> %0, i32 3
ret i32 %vecext
; CHECK-LABEL: @getsi3
-; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 2
+; CHECK: xxswapd [[SHL:[0-9]+]], 34
; CHECK: mfvsrwz 3, [[SHL]]
; CHECK: extsw 3, 3
; CHECK-LE-LABEL: @getsi3
@@ -1172,7 +1172,7 @@ entry:
; CHECK: mfvsrwz 3, [[SHL]]
; CHECK: clrldi 3, 3, 32
; CHECK-LE-LABEL: @getui0
-; CHECK-LE: xxsldwi [[SHL:[0-9]+]], 34, 34, 2
+; CHECK-LE: xxswapd [[SHL:[0-9]+]], 34
; CHECK-LE: mfvsrwz 3, [[SHL]]
; CHECK-LE: clrldi 3, 3, 32
}
@@ -1220,7 +1220,7 @@ entry:
%vecext = extractelement <4 x i32> %0, i32 3
ret i32 %vecext
; CHECK-LABEL: @getui3
-; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 2
+; CHECK: xxswapd [[SHL:[0-9]+]], 34
; CHECK: mfvsrwz 3, [[SHL]]
; CHECK: clrldi 3, 3, 32
; CHECK-LE-LABEL: @getui3
@@ -1380,7 +1380,7 @@ entry:
; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 1
; CHECK: xscvspdpn 1, [[SHL]]
; CHECK-LE-LABEL: @getf1
-; CHECK-LE: xxsldwi [[SHL:[0-9]+]], 34, 34, 2
+; CHECK-LE: xxswapd [[SHL:[0-9]+]], 34
; CHECK-LE: xscvspdpn 1, [[SHL]]
}
@@ -1393,7 +1393,7 @@ entry:
%vecext = extractelement <4 x float> %0, i32 2
ret float %vecext
; CHECK-LABEL: @getf2
-; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 2
+; CHECK: xxswapd [[SHL:[0-9]+]], 34
; CHECK: xscvspdpn 1, [[SHL]]
; CHECK-LE-LABEL: @getf2
; CHECK-LE: xxsldwi [[SHL:[0-9]+]], 34, 34, 1
diff --git a/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll b/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
index 052f55644fe2..74dc1561d9f4 100644
--- a/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
+++ b/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
@@ -9,7 +9,8 @@ entry:
ret <2 x i32> %strided.vec
; CHECK-LABEL: @test1
-; CHECK: vsldoi 2, 2, 2, 12
+; CHECK: xxswapd 35, 34
+; CHECK: vmrghw 2, 2, 3
; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll b/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll
new file mode 100644
index 000000000000..ac187e084257
--- /dev/null
+++ b/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll
@@ -0,0 +1,970 @@
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BE
+
+define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 0
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 4
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 8
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 12
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 0
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 4
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 8
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 12
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define float @_Z13testUiToFpExtILj0EEfDv4_j(<4 x i32> %a) {
+entry:
+; CHECK-LABEL: _Z13testUiToFpExtILj0EEfDv4_j
+; CHECK: xxextractuw 0, 34, 12
+; CHECK: xscvuxdsp 1, 0
+; CHECK-BE-LABEL: _Z13testUiToFpExtILj0EEfDv4_j
+; CHECK-BE: xxextractuw 0, 34, 0
+; CHECK-BE: xscvuxdsp 1, 0
+ %vecext = extractelement <4 x i32> %a, i32 0
+ %conv = uitofp i32 %vecext to float
+ ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj1EEfDv4_j(<4 x i32> %a) {
+entry:
+; CHECK-LABEL: _Z13testUiToFpExtILj1EEfDv4_j
+; CHECK: xxextractuw 0, 34, 8
+; CHECK: xscvuxdsp 1, 0
+; CHECK-BE-LABEL: _Z13testUiToFpExtILj1EEfDv4_j
+; CHECK-BE: xxextractuw 0, 34, 4
+; CHECK-BE: xscvuxdsp 1, 0
+ %vecext = extractelement <4 x i32> %a, i32 1
+ %conv = uitofp i32 %vecext to float
+ ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj2EEfDv4_j(<4 x i32> %a) {
+entry:
+; CHECK-LABEL: _Z13testUiToFpExtILj2EEfDv4_j
+; CHECK: xxextractuw 0, 34, 4
+; CHECK: xscvuxdsp 1, 0
+; CHECK-BE-LABEL: _Z13testUiToFpExtILj2EEfDv4_j
+; CHECK-BE: xxextractuw 0, 34, 8
+; CHECK-BE: xscvuxdsp 1, 0
+ %vecext = extractelement <4 x i32> %a, i32 2
+ %conv = uitofp i32 %vecext to float
+ ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj3EEfDv4_j(<4 x i32> %a) {
+entry:
+; CHECK-LABEL: _Z13testUiToFpExtILj3EEfDv4_j
+; CHECK: xxextractuw 0, 34, 0
+; CHECK: xscvuxdsp 1, 0
+; CHECK-BE-LABEL: _Z13testUiToFpExtILj3EEfDv4_j
+; CHECK-BE: xxextractuw 0, 34, 12
+; CHECK-BE: xscvuxdsp 1, 0
+ %vecext = extractelement <4 x i32> %a, i32 3
+ %conv = uitofp i32 %vecext to float
+ ret float %conv
+}
+
+define <4 x float> @_Z10testInsEltILj0EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj0EDv4_ffET0_S1_T1_
+; CHECK: xscvdpspn 0, 1
+; CHECK: xxsldwi 0, 0, 0, 3
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z10testInsEltILj0EDv4_ffET0_S1_T1_
+; CHECK-BE: xscvdpspn 0, 1
+; CHECK-BE: xxsldwi 0, 0, 0, 3
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = insertelement <4 x float> %a, float %b, i32 0
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj1EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj1EDv4_ffET0_S1_T1_
+; CHECK: xscvdpspn 0, 1
+; CHECK: xxsldwi 0, 0, 0, 3
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z10testInsEltILj1EDv4_ffET0_S1_T1_
+; CHECK-BE: xscvdpspn 0, 1
+; CHECK-BE: xxsldwi 0, 0, 0, 3
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = insertelement <4 x float> %a, float %b, i32 1
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj2EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj2EDv4_ffET0_S1_T1_
+; CHECK: xscvdpspn 0, 1
+; CHECK: xxsldwi 0, 0, 0, 3
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z10testInsEltILj2EDv4_ffET0_S1_T1_
+; CHECK-BE: xscvdpspn 0, 1
+; CHECK-BE: xxsldwi 0, 0, 0, 3
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = insertelement <4 x float> %a, float %b, i32 2
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj3EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj3EDv4_ffET0_S1_T1_
+; CHECK: xscvdpspn 0, 1
+; CHECK: xxsldwi 0, 0, 0, 3
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z10testInsEltILj3EDv4_ffET0_S1_T1_
+; CHECK-BE: xscvdpspn 0, 1
+; CHECK-BE: xxsldwi 0, 0, 0, 3
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = insertelement <4 x float> %a, float %b, i32 3
+ ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj0EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj0EDv4_jjET0_S1_T1_
+; CHECK: mtvsrwz 0, 5
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z10testInsEltILj0EDv4_jjET0_S1_T1_
+; CHECK-BE: mtvsrwz 0, 5
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = insertelement <4 x i32> %a, i32 %b, i32 0
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj1EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj1EDv4_jjET0_S1_T1_
+; CHECK: mtvsrwz 0, 5
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z10testInsEltILj1EDv4_jjET0_S1_T1_
+; CHECK-BE: mtvsrwz 0, 5
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = insertelement <4 x i32> %a, i32 %b, i32 1
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj2EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj2EDv4_jjET0_S1_T1_
+; CHECK: mtvsrwz 0, 5
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z10testInsEltILj2EDv4_jjET0_S1_T1_
+; CHECK-BE: mtvsrwz 0, 5
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = insertelement <4 x i32> %a, i32 %b, i32 2
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj3EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj3EDv4_jjET0_S1_T1_
+; CHECK: mtvsrwz 0, 5
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z10testInsEltILj3EDv4_jjET0_S1_T1_
+; CHECK-BE: mtvsrwz 0, 5
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = insertelement <4 x i32> %a, i32 %b, i32 3
+ ret <4 x i32> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 0
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 6, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 3, i32 5, i32 6, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 4
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 3, i32 6, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 8
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 12
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+ ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+ ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 0
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 2, i32 5, i32 6, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 12
+; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 0
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 3, i32 5, i32 6, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 4
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 8
+; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 4
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 3, i32 6, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 8
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 4
+; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 8
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 3
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_
+; CHECK-BE-NOT: xxsldwi
+; CHECK-BE: xxinsertw 34, 35, 12
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 1
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+ ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 0
+; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_
+; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxinsertw 34, 0, 12
+ %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+ ret <4 x i32> %vecins
+}
+define <4 x float> @testSameVecEl0BE(<4 x float> %a) {
+entry:
+; CHECK-BE-LABEL: testSameVecEl0BE
+; CHECK-BE: xxinsertw 34, 34, 0
+ %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl2BE(<4 x float> %a) {
+entry:
+; CHECK-BE-LABEL: testSameVecEl2BE
+; CHECK-BE: xxinsertw 34, 34, 8
+ %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+ ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl3BE(<4 x float> %a) {
+entry:
+; CHECK-BE-LABEL: testSameVecEl3BE
+; CHECK-BE: xxinsertw 34, 34, 12
+ %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+ ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl0LE(<4 x float> %a) {
+entry:
+; CHECK-LABEL: testSameVecEl0LE
+; CHECK: xxinsertw 34, 34, 12
+ %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl1LE(<4 x float> %a) {
+entry:
+; CHECK-LABEL: testSameVecEl1LE
+; CHECK: xxinsertw 34, 34, 8
+ %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl3LE(<4 x float> %a) {
+entry:
+; CHECK-LABEL: testSameVecEl3LE
+; CHECK: xxinsertw 34, 34, 0
+ %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x float> %vecins
+}
diff --git a/test/CodeGen/PowerPC/pie.ll b/test/CodeGen/PowerPC/pie.ll
new file mode 100644
index 000000000000..56e07caf2812
--- /dev/null
+++ b/test/CodeGen/PowerPC/pie.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=powerpc-pc-linux -relocation-model=pic | FileCheck %s
+
+
+define void @f() {
+ ret void
+}
+
+define void @g() {
+; CHECK: g:
+; CHECK: bl f{{$}}
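+; Under PIE, the locally defined @f is called directly ('bl f' with no @plt
+; suffix); the {{$}} end-of-line anchor enforces the absence of a suffix.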
+ call void @f()
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"PIE Level", i32 1}
diff --git a/test/CodeGen/PowerPC/popcnt.ll b/test/CodeGen/PowerPC/popcnt.ll
index b304d72aede2..5acaa29e2ee5 100644
--- a/test/CodeGen/PowerPC/popcnt.ll
+++ b/test/CodeGen/PowerPC/popcnt.ll
@@ -1,37 +1,53 @@
; RUN: llc -march=ppc64 -mattr=+popcntd < %s | FileCheck %s
+; RUN: llc -march=ppc64 -mattr=+slow-popcntd < %s | FileCheck %s --check-prefix=SLOWPC
+; RUN: llc -march=ppc64 -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -march=ppc64 -mcpu=a2q < %s | FileCheck %s --check-prefix=SLOWPC
+; RUN: llc -march=ppc64 -mcpu=a2q -mattr=+popcntd < %s | FileCheck %s
define i8 @cnt8(i8 %x) nounwind readnone {
%cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
ret i8 %cnt
-; CHECK: @cnt8
+; CHECK-LABEL: @cnt8
; CHECK: rlwinm
; CHECK: popcntw
; CHECK: blr
+
+; SLOWPC-LABEL: @cnt8
+; SLOWPC-NOT: popcnt
}
define i16 @cnt16(i16 %x) nounwind readnone {
%cnt = tail call i16 @llvm.ctpop.i16(i16 %x)
ret i16 %cnt
-; CHECK: @cnt16
+; CHECK-LABEL: @cnt16
; CHECK: rlwinm
; CHECK: popcntw
; CHECK: blr
+
+; SLOWPC-LABEL: @cnt16
+; SLOWPC-NOT: popcnt
}
define i32 @cnt32(i32 %x) nounwind readnone {
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
ret i32 %cnt
-; CHECK: @cnt32
+; CHECK-LABEL: @cnt32
; CHECK: popcntw
; CHECK: blr
+
+; SLOWPC-LABEL: @cnt32
+; SLOWPC-NOT: popcnt
}
define i64 @cnt64(i64 %x) nounwind readnone {
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
ret i64 %cnt
-; CHECK: @cnt64
+; CHECK-LABEL: @cnt64
; CHECK: popcntd
; CHECK: blr
+
+; SLOWPC-LABEL: @cnt64
+; SLOWPC-NOT: popcnt
}
declare i8 @llvm.ctpop.i8(i8) nounwind readnone
diff --git a/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll b/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
index 2f75190327ef..3dcb0b2aee1d 100644
--- a/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
+++ b/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
@@ -169,7 +169,7 @@ declare i32 @something(...)
; CHECK-NEXT: bne 0, .[[LOOP]]
;
; Next BB
-; CHECK: %for.exit
+; CHECK: %for.end
; CHECK: mtlr {{[0-9]+}}
; CHECK-NEXT: blr
define i32 @freqSaveAndRestoreOutsideLoop2(i32 %cond) {
@@ -629,10 +629,11 @@ end:
; CHECK-LABEL: transpose
;
; Store of callee-save register saved by shrink wrapping
-; CHECK: std [[CSR:[0-9]+]], -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
+; FIXME: Test disabled: with improved scheduling, spills/reloads are no longer needed!
+; CHECKXX: std [[CSR:[0-9]+]], -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
;
; Reload of callee-save register
-; CHECK: ld [[CSR]], -[[STACK_OFFSET]](1) # 8-byte Folded Reload
+; CHECKXX: ld [[CSR]], -[[STACK_OFFSET]](1) # 8-byte Folded Reload
;
; Ensure no subsequent uses of callee-save register before end of function
; CHECK-NOT: {{[a-z]+}} [[CSR]]
diff --git a/test/CodeGen/PowerPC/ppc32-align-long-double-sf.ll b/test/CodeGen/PowerPC/ppc32-align-long-double-sf.ll
new file mode 100644
index 000000000000..f8a6d071cfea
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-align-long-double-sf.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O2 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+
+@x = global ppc_fp128 0xM405EDA5E353F7CEE0000000000000000, align 16
+@.str = private unnamed_addr constant [5 x i8] c"%Lf\0A\00", align 1
+
+
+define void @foo() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @x, align 16
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), ppc_fp128 %0)
+ ret void
+}
+; In soft-float mode, do not skip register r4 to align the long double argument.
+; Instead, the first word of the long double passed to printf is placed in r4.
+; CHECK: lwzu 4, x@l({{[0-9]+}})
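+; A plausible C source for this test (an assumption, for illustration only):
+;   long double x = /* the ppc_fp128 constant above */;
+;   void foo(void) { printf("%Lf\n", x); }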
+
+declare i32 @printf(i8* nocapture readonly, ...) #0
+
+attributes #0 = { "use-soft-float"="true" }
+
+ 
\ No newline at end of file
diff --git a/test/CodeGen/PowerPC/ppc32-constant-BE-ppcf128.ll b/test/CodeGen/PowerPC/ppc32-constant-BE-ppcf128.ll
new file mode 100644
index 000000000000..3a23ae31a851
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-constant-BE-ppcf128.ll
@@ -0,0 +1,24 @@
+; RUN: llc -O2 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-buildroot-linux-gnu"
+
+@.str = private unnamed_addr constant [5 x i8] c"%Lf\0A\00", align 1
+
+define i32 @main() #0 {
+entry:
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), ppc_fp128 0xM3FF00000000000000000000000000000)
+ ret i32 0
+}
+
+; The first available register for the long double argument is r4, so the
+; Hi part goes in r4/r5 and the Lo part in r6/r7 (do not swap the Hi/Lo parts).
+; CHECK: lis 4, 16368
+; CHECK-NOT: lis 6, 16368
+; CHECK: li 5, 0
+; CHECK: li 7, 0
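+; Note: the constant is 1.0L; its Hi double is 0x3FF0000000000000, so lis
+; materializes the top halfword 0x3FF0 (decimal 16368) into r4.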
+
+declare i32 @printf(i8* nocapture readonly, ...)
+
+attributes #0 = { "use-soft-float"="true" }
+
diff --git a/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll b/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll
index 028006320cb5..272d882c8bbf 100644
--- a/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll
+++ b/test/CodeGen/PowerPC/ppc32-i1-vaarg.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=ppc32 -mcpu=ppc32 | FileCheck %s
-; RUN: llc < %s -march=ppc32 -mcpu=ppc32 -mtriple=powerpc-darwin | FileCheck %s -check-prefix=CHECK-D
+; RUN: llc < %s -march=ppc32 -mcpu=ppc32 -mtriple=powerpc-darwin9 | FileCheck %s -check-prefix=CHECK-D
target triple = "powerpc-unknown-linux-gnu"
declare void @printf(i8*, ...)
@@ -16,5 +16,5 @@ define void @main() {
; CHECK-D-LABEL: @main
; CHECK-D: li r4, 0
-; CHECK-D: bl L_printf$stub
+; CHECK-D: bl _printf
diff --git a/test/CodeGen/PowerPC/ppc64-align-long-double.ll b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
index b9fd6707f041..b268ceeafd5d 100644
--- a/test/CodeGen/PowerPC/ppc64-align-long-double.ll
+++ b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
@@ -18,17 +18,17 @@ entry:
ret ppc_fp128 %0
}
-; CHECK: std 6, 72(1)
-; CHECK: std 5, 64(1)
-; CHECK: std 4, 56(1)
-; CHECK: std 3, 48(1)
+; CHECK-DAG: std 6, 72(1)
+; CHECK-DAG: std 5, 64(1)
+; CHECK-DAG: std 4, 56(1)
+; CHECK-DAG: std 3, 48(1)
; CHECK: lfd 1, 64(1)
; CHECK: lfd 2, 72(1)
-; CHECK-VSX: std 6, 72(1)
-; CHECK-VSX: std 5, 64(1)
-; CHECK-VSX: std 4, 56(1)
-; CHECK-VSX: std 3, 48(1)
+; CHECK-VSX-DAG: std 6, 72(1)
+; CHECK-VSX-DAG: std 5, 64(1)
+; CHECK-VSX-DAG: std 4, 56(1)
+; CHECK-VSX-DAG: std 3, 48(1)
; CHECK-VSX: li 3, 16
; CHECK-VSX: addi 4, 1, 48
; CHECK-VSX: lxsdx 1, 4, 3
diff --git a/test/CodeGen/PowerPC/ppc64-byval-align.ll b/test/CodeGen/PowerPC/ppc64-byval-align.ll
index 7170f5906581..89e7cc6c50eb 100644
--- a/test/CodeGen/PowerPC/ppc64-byval-align.ll
+++ b/test/CodeGen/PowerPC/ppc64-byval-align.ll
@@ -35,8 +35,7 @@ entry:
ret i64 %0
}
; CHECK-LABEL: @callee2
-; CHECK: ld [[REG:[0-9]+]], 128(1)
-; CHECK: mr 3, [[REG]]
+; CHECK: ld 3, 128(1)
; CHECK: blr
declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16)
diff --git a/test/CodeGen/PowerPC/ppc64-calls.ll b/test/CodeGen/PowerPC/ppc64-calls.ll
index 23a14e6687d6..8b49cdb7f99c 100644
--- a/test/CodeGen/PowerPC/ppc64-calls.ll
+++ b/test/CodeGen/PowerPC/ppc64-calls.ll
@@ -2,7 +2,8 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
-define void @foo() nounwind readnone noinline {
+
+define void @foo() nounwind noinline {
ret void
}
@@ -14,7 +15,8 @@ define weak void @foo_weak() nounwind {
define void @test_direct() nounwind readnone {
; CHECK-LABEL: test_direct:
tail call void @foo() nounwind
-; CHECK: bl foo
+; Because of tail call optimization, this can be a 'b' instruction.
+; CHECK: [[BR:b[l]?]] foo
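+; The [[BR:b[l]?]] pattern accepts either 'bl' (a normal call) or 'b' (a tail call).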
; CHECK-NOT: nop
ret void
}
diff --git a/test/CodeGen/PowerPC/ppc64-fastcc.ll b/test/CodeGen/PowerPC/ppc64-fastcc.ll
index 69e15d104da8..76677ecdf033 100644
--- a/test/CodeGen/PowerPC/ppc64-fastcc.ll
+++ b/test/CodeGen/PowerPC/ppc64-fastcc.ll
@@ -1,4 +1,6 @@
; RUN: llc -mcpu=pwr7 -mattr=-vsx < %s | FileCheck %s
+; XFAIL: *
+
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -522,7 +524,7 @@ define void @cv13(<4 x i32> %v) #0 {
; CHECK-LABEL: @cv13
; CHECK-DAG: li [[REG1:[0-9]+]], 96
-; CHECK-DAG: vor [[REG2:[0-9]+]], 2, 2
+; CHECK-DAG: vor [[REG2:[0-9]+]], 3, 3
; CHECK: stvx [[REG2]], 1, [[REG1]]
; CHECK: blr
}
@@ -533,7 +535,7 @@ define void @cv14(<4 x i32> %v) #0 {
; CHECK-LABEL: @cv14
; CHECK-DAG: li [[REG1:[0-9]+]], 128
-; CHECK-DAG: vor [[REG2:[0-9]+]], 2, 2
+; CHECK-DAG: vor [[REG2:[0-9]+]], 3, 3
; CHECK: stvx [[REG2]], 1, [[REG1]]
; CHECK: blr
}
diff --git a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
index fb017bc224ba..6f8351bb5bdd 100644
--- a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
+++ b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
@@ -2,7 +2,7 @@
; CHECK: .section .opd,"aw",@progbits
; CHECK-NEXT: test1:
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: .quad .L[[BEGIN:.*]]
; CHECK-NEXT: .quad .TOC.@tocbase
; CHECK-NEXT: .quad 0
diff --git a/test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll b/test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll
new file mode 100644
index 000000000000..b27aeec50b84
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -disable-ppc-sco=false --enable-shrink-wrap=false | FileCheck %s -check-prefix=CHECK-SCO-ONLY
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -disable-ppc-sco=false --enable-shrink-wrap=true | FileCheck %s -check-prefix=CHECK-SCO-SHRK
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -disable-ppc-sco=false --enable-shrink-wrap=false | FileCheck %s -check-prefix=CHECK-SCO-ONLY
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -disable-ppc-sco=false --enable-shrink-wrap=true | FileCheck %s -check-prefix=CHECK-SCO-SHRK
+
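+; With shrink wrapping, the sibling call to LVComputationKind is emitted before
+; the stack frame is set up (stdu); without shrink wrapping, the frame is
+; created first. See the CHECK-SCO-SHRK vs. CHECK-SCO-ONLY lines below.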
+%"class.clang::NamedDecl" = type { i32 }
+declare void @__assert_fail();
+
+define i8 @_ZNK5clang9NamedDecl23getLinkageAndVisibilityEv(
+ %"class.clang::NamedDecl"* %this) {
+entry:
+ %tobool = icmp eq %"class.clang::NamedDecl"* %this, null
+ br i1 %tobool, label %cond.false, label %exit
+
+cond.false:
+ tail call void @__assert_fail()
+ unreachable
+
+exit:
+ %DeclKind = getelementptr inbounds
+ %"class.clang::NamedDecl",
+ %"class.clang::NamedDecl"* %this, i64 0, i32 0
+ %bf.load = load i32, i32* %DeclKind, align 4
+ %call.i = tail call i8 @LVComputationKind(
+ %"class.clang::NamedDecl"* %this,
+ i32 %bf.load)
+ ret i8 %call.i
+
+; CHECK-SCO-SHRK-LABEL: _ZNK5clang9NamedDecl23getLinkageAndVisibilityEv:
+; CHECK-SCO-SHRK: b LVComputationKind
+; CHECK-SCO-SHRK: #TC_RETURNd8
+; CHECK-SCO-SHRK: stdu 1, -{{[0-9]+}}(1)
+; CHECK-SCO-SHRK: bl __assert_fail
+;
+; CHECK-SCO-ONLY-LABEL: _ZNK5clang9NamedDecl23getLinkageAndVisibilityEv:
+; CHECK-SCO-ONLY: stdu 1, -{{[0-9]+}}(1)
+; CHECK-SCO-ONLY: b LVComputationKind
+; CHECK-SCO-ONLY: #TC_RETURNd8
+; CHECK-SCO-ONLY: bl __assert_fail
+}
+
+define fastcc i8 @LVComputationKind(
+ %"class.clang::NamedDecl"* %D,
+ i32 %computation) {
+ ret i8 0
+}
diff --git a/test/CodeGen/PowerPC/ppc64-sibcall.ll b/test/CodeGen/PowerPC/ppc64-sibcall.ll
new file mode 100644
index 000000000000..e4fe5468e942
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-sibcall.ll
@@ -0,0 +1,191 @@
+; RUN: llc < %s -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO
+; RUN: llc < %s -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX
+; RUN: llc < %s -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX
+
+; There is no "powerpc64le-unknown-linux-gnu" + "CHECK-SCO" combination because
+; only Power8 and later fully support LE.
+
+%S_56 = type { [13 x i32], i32 }
+%S_64 = type { [15 x i32], i32 }
+%S_32 = type { [7 x i32], i32 }
+
+; Function Attrs: noinline nounwind
+define void @callee_56_copy([7 x i64] %a, %S_56* %b) #0 { ret void }
+define void @callee_64_copy([8 x i64] %a, %S_64* %b) #0 { ret void }
+
+; Function Attrs: nounwind
+define void @caller_56_reorder_copy(%S_56* %b, [7 x i64] %a) #1 {
+ tail call void @callee_56_copy([7 x i64] %a, %S_56* %b)
+ ret void
+
+; CHECK-SCO-LABEL: caller_56_reorder_copy:
+; CHECK-SCO-NOT: stdu 1
+; CHECK-SCO: TC_RETURNd8 callee_56_copy
+}
+
+define void @caller_64_reorder_copy(%S_64* %b, [8 x i64] %a) #1 {
+ tail call void @callee_64_copy([8 x i64] %a, %S_64* %b)
+ ret void
+
+; CHECK-SCO-LABEL: caller_64_reorder_copy:
+; CHECK-SCO: bl callee_64_copy
+}
+
+define void @callee_64_64_copy([8 x i64] %a, [8 x i64] %b) #0 { ret void }
+define void @caller_64_64_copy([8 x i64] %a, [8 x i64] %b) #1 {
+ tail call void @callee_64_64_copy([8 x i64] %a, [8 x i64] %b)
+ ret void
+
+; CHECK-SCO-LABEL: caller_64_64_copy:
+; CHECK-SCO: b callee_64_64_copy
+}
+
+define void @caller_64_64_reorder_copy([8 x i64] %a, [8 x i64] %b) #1 {
+ tail call void @callee_64_64_copy([8 x i64] %b, [8 x i64] %a)
+ ret void
+
+; CHECK-SCO-LABEL: caller_64_64_reorder_copy:
+; CHECK-SCO: bl callee_64_64_copy
+}
+
+define void @caller_64_64_undef_copy([8 x i64] %a, [8 x i64] %b) #1 {
+ tail call void @callee_64_64_copy([8 x i64] %a, [8 x i64] undef)
+ ret void
+
+; CHECK-SCO-LABEL: caller_64_64_undef_copy:
+; CHECK-SCO: b callee_64_64_copy
+}
+
+define void @arg8_callee(
+ float %a, i32 signext %b, float %c, i32* %d,
+ i8 zeroext %e, float %f, i32* %g, i32 signext %h)
+{
+ ret void
+}
+
+define void @arg8_caller(float %a, i32 signext %b, i8 zeroext %c, i32* %d) {
+entry:
+ tail call void @arg8_callee(float undef, i32 signext undef, float undef,
+ i32* %d, i8 zeroext undef, float undef,
+ i32* undef, i32 signext undef)
+ ret void
+
+; CHECK-SCO-LABEL: arg8_caller:
+; CHECK-SCO: b arg8_callee
+}
+
+; Struct return test
+
+; Function Attrs: noinline nounwind
+define void @callee_sret_56(%S_56* noalias sret %agg.result) #0 { ret void }
+define void @callee_sret_32(%S_32* noalias sret %agg.result) #0 { ret void }
+
+; Function Attrs: nounwind
+define void @caller_do_something_sret_32(%S_32* noalias sret %agg.result) #1 {
+ %1 = alloca %S_56, align 4
+ %2 = bitcast %S_56* %1 to i8*
+ call void @callee_sret_56(%S_56* nonnull sret %1)
+ tail call void @callee_sret_32(%S_32* sret %agg.result)
+ ret void
+
+; CHECK-SCO-LABEL: caller_do_something_sret_32:
+; CHECK-SCO: stdu 1
+; CHECK-SCO: bl callee_sret_56
+; CHECK-SCO: addi 1
+; CHECK-SCO: TC_RETURNd8 callee_sret_32
+}
+
+define void @caller_local_sret_32(%S_32* %a) #1 {
+ %tmp = alloca %S_32, align 4
+ tail call void @callee_sret_32(%S_32* nonnull sret %tmp)
+ ret void
+
+; CHECK-SCO-LABEL: caller_local_sret_32:
+; CHECK-SCO: bl callee_sret_32
+}
+
+attributes #0 = { noinline nounwind }
+attributes #1 = { nounwind }
+
+; vector <4 x i1> test
+
+define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void }
+define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) {
+ tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b)
+ ret void
+
+; <4 x i1> is 32-byte aligned; if the subtarget doesn't support QPX, we can't
+; place b and c in QPX registers, so we can't do SCO on caller_v4i1_reorder.
+
+; CHECK-SCO-LABEL: caller_v4i1_reorder:
+; CHECK-SCO: bl callee_v4i1
+
+; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder:
+; CHECK-SCO-HASQPX: b callee_v4i1
+}
+
+define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void }
+define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) {
+ tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b)
+ ret void
+
+; CHECK-SCO-LABEL: f128_caller:
+; CHECK-SCO: b f128_callee
+}
+
+; weak linkage test
+%class.T = type { [2 x i8] }
+
+define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void }
+define void @wo_hcaller(%class.T* %this, i8* %c) {
+ tail call void @wo_hcallee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-SCO-LABEL: wo_hcaller:
+; CHECK-SCO: b wo_hcallee
+}
+
+define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void }
+define void @wo_pcaller(%class.T* %this, i8* %c) {
+ tail call void @wo_pcallee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-SCO-LABEL: wo_pcaller:
+; CHECK-SCO: b wo_pcallee
+}
+
+define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void }
+define void @wo_caller(%class.T* %this, i8* %c) {
+ tail call void @wo_callee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-SCO-LABEL: wo_caller:
+; CHECK-SCO: bl wo_callee
+}
+
+define weak protected void @w_pcallee(i8* %ptr) { ret void }
+define void @w_pcaller(i8* %ptr) {
+ tail call void @w_pcallee(i8* %ptr)
+ ret void
+
+; CHECK-SCO-LABEL: w_pcaller:
+; CHECK-SCO: b w_pcallee
+}
+
+define weak hidden void @w_hcallee(i8* %ptr) { ret void }
+define void @w_hcaller(i8* %ptr) {
+ tail call void @w_hcallee(i8* %ptr)
+ ret void
+
+; CHECK-SCO-LABEL: w_hcaller:
+; CHECK-SCO: b w_hcallee
+}
+
+define weak void @w_callee(i8* %ptr) { ret void }
+define void @w_caller(i8* %ptr) {
+ tail call void @w_callee(i8* %ptr)
+ ret void
+
+; CHECK-SCO-LABEL: w_caller:
+; CHECK-SCO: bl w_callee
+}
diff --git a/test/CodeGen/PowerPC/ppc64-toc.ll b/test/CodeGen/PowerPC/ppc64-toc.ll
index 7500ed606636..8d35cba2d430 100644
--- a/test/CodeGen/PowerPC/ppc64-toc.ll
+++ b/test/CodeGen/PowerPC/ppc64-toc.ll
@@ -9,7 +9,7 @@ target triple = "powerpc64-unknown-linux-gnu"
define i64 @access_int64(i64 %a) nounwind readonly {
entry:
; CHECK-LABEL: access_int64:
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: .quad .L[[BEGIN:.*]]
; CHECK-NEXT: .quad .TOC.@tocbase
; CHECK-NEXT: .quad 0
diff --git a/test/CodeGen/PowerPC/ppcf128-endian.ll b/test/CodeGen/PowerPC/ppcf128-endian.ll
index ee314c1db58b..49dea37a2dac 100644
--- a/test/CodeGen/PowerPC/ppcf128-endian.ll
+++ b/test/CodeGen/PowerPC/ppcf128-endian.ll
@@ -104,9 +104,10 @@ entry:
%0 = bitcast i128 %x to ppc_fp128
ret ppc_fp128 %0
}
-; CHECK: @convert_to
+; CHECK: convert_to:
; CHECK: std 3, [[OFF1:.*]](1)
; CHECK: std 4, [[OFF2:.*]](1)
+; CHECK: ori 2, 2, 0
; CHECK: lfd 1, [[OFF1]](1)
; CHECK: lfd 2, [[OFF2]](1)
; CHECK: blr
@@ -118,9 +119,10 @@ entry:
ret ppc_fp128 %0
}
-; CHECK: @convert_to
+; CHECK: convert_to2:
; CHECK: std 3, [[OFF1:.*]](1)
-; CHECK: std 4, [[OFF2:.*]](1)
+; CHECK: std 5, [[OFF2:.*]](1)
+; CHECK: ori 2, 2, 0
; CHECK: lfd 1, [[OFF1]](1)
; CHECK: lfd 2, [[OFF2]](1)
; CHECK: blr
diff --git a/test/CodeGen/PowerPC/ppcf128sf.ll b/test/CodeGen/PowerPC/ppcf128sf.ll
new file mode 100644
index 000000000000..2eea6d89be6a
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppcf128sf.ll
@@ -0,0 +1,179 @@
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -O0 < %s | FileCheck %s
+
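+; In soft-float mode, ppc_fp128 arithmetic, conversions, and comparisons are
+; lowered to the __gcc_q* libcalls checked in each function below.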
+@ld = common global ppc_fp128 0xM00000000000000000000000000000000, align 16
+@ld2 = common global ppc_fp128 0xM00000000000000000000000000000000, align 16
+@d = common global double 0.000000e+00, align 8
+@f = common global float 0.000000e+00, align 4
+@i = common global i32 0, align 4
+@ui = common global i32 0, align 4
+@var = common global i8 0, align 1
+
+define void @foo() #0 {
+entry:
+ %c = alloca ppc_fp128, align 16
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %1 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %add = fadd ppc_fp128 %0, %1
+ store ppc_fp128 %add, ppc_fp128* %c, align 16
+ %2 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %3 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %sub = fsub ppc_fp128 %2, %3
+ store ppc_fp128 %sub, ppc_fp128* %c, align 16
+ %4 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %5 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %mul = fmul ppc_fp128 %4, %5
+ store ppc_fp128 %mul, ppc_fp128* %c, align 16
+ %6 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %7 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %div = fdiv ppc_fp128 %6, %7
+ store ppc_fp128 %div, ppc_fp128* %c, align 16
+ ret void
+
+ ; CHECK-LABEL: __gcc_qadd
+ ; CHECK-LABEL: __gcc_qsub
+ ; CHECK-LABEL: __gcc_qmul
+ ; CHECK-LABEL: __gcc_qdiv
+}
+
+define void @foo1() #0 {
+entry:
+ %0 = load double, double* @d, align 8
+ %conv = fpext double %0 to ppc_fp128
+ store ppc_fp128 %conv, ppc_fp128* @ld, align 16
+ ret void
+
+ ; CHECK-LABEL: __gcc_dtoq
+}
+
+define void @foo2() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %conv = fptrunc ppc_fp128 %0 to double
+ store double %conv, double* @d, align 8
+ ret void
+
+ ; CHECK-LABEL: __gcc_qtod
+}
+
+define void @foo3() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %conv = fptrunc ppc_fp128 %0 to float
+ store float %conv, float* @f, align 4
+ ret void
+
+ ; CHECK-LABEL: __gcc_qtos
+}
+
+define void @foo4() #0 {
+entry:
+ %0 = load i32, i32* @i, align 4
+ %conv = sitofp i32 %0 to ppc_fp128
+ store ppc_fp128 %conv, ppc_fp128* @ld, align 16
+ ret void
+
+ ; CHECK-LABEL: __gcc_itoq
+}
+
+define void @foo5() #0 {
+entry:
+ %0 = load i32, i32* @ui, align 4
+ %conv = uitofp i32 %0 to ppc_fp128
+ store ppc_fp128 %conv, ppc_fp128* @ld, align 16
+ ret void
+
+ ; CHECK-LABEL: __gcc_utoq
+}
+
+define void @foo6() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %1 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %cmp = fcmp oeq ppc_fp128 %0, %1
+ %conv = zext i1 %cmp to i32
+ %conv1 = trunc i32 %conv to i8
+ store i8 %conv1, i8* @var, align 1
+ ret void
+
+ ; CHECK-LABEL: __gcc_qeq
+}
+
+define void @foo7() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %1 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %cmp = fcmp une ppc_fp128 %0, %1
+ %conv = zext i1 %cmp to i32
+ %conv1 = trunc i32 %conv to i8
+ store i8 %conv1, i8* @var, align 1
+ ret void
+
+ ; CHECK-LABEL: __gcc_qne
+}
+
+define void @foo8() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %1 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %cmp = fcmp ogt ppc_fp128 %0, %1
+ %conv = zext i1 %cmp to i32
+ %conv1 = trunc i32 %conv to i8
+ store i8 %conv1, i8* @var, align 1
+ ret void
+
+ ; CHECK-LABEL: __gcc_qgt
+}
+
+define void @foo9() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %1 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %cmp = fcmp olt ppc_fp128 %0, %1
+ %conv = zext i1 %cmp to i32
+ %conv1 = trunc i32 %conv to i8
+ store i8 %conv1, i8* @var, align 1
+ ret void
+
+ ; CHECK-LABEL: __gcc_qlt
+}
+
+define void @foo10() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %1 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %cmp = fcmp ole ppc_fp128 %0, %1
+ %conv = zext i1 %cmp to i32
+ %conv1 = trunc i32 %conv to i8
+ store i8 %conv1, i8* @var, align 1
+ ret void
+
+ ; CHECK-LABEL: __gcc_qle
+}
+
+define void @foo11() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %1 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %cmp = fcmp une ppc_fp128 %0, %1
+ %conv = zext i1 %cmp to i32
+ %conv1 = trunc i32 %conv to i8
+ store i8 %conv1, i8* @var, align 1
+ ret void
+
+ ; CHECK-LABEL: __gcc_qunord
+}
+
+define void @foo12() #0 {
+entry:
+ %0 = load ppc_fp128, ppc_fp128* @ld, align 16
+ %1 = load ppc_fp128, ppc_fp128* @ld2, align 16
+ %cmp = fcmp oge ppc_fp128 %0, %1
+ %conv = zext i1 %cmp to i32
+ %conv1 = trunc i32 %conv to i8
+ store i8 %conv1, i8* @var, align 1
+ ret void
+
+ ; CHECK-LABEL: __gcc_qge
+}
+
+attributes #0 = { "use-soft-float"="true" }
diff --git a/test/CodeGen/PowerPC/ppcsoftops.ll b/test/CodeGen/PowerPC/ppcsoftops.ll
index 56c057613bdc..70ee8a42baa9 100644
--- a/test/CodeGen/PowerPC/ppcsoftops.ll
+++ b/test/CodeGen/PowerPC/ppcsoftops.ll
@@ -1,4 +1,6 @@
; RUN: llc -mtriple=powerpc-unknown-linux-gnu -O0 < %s | FileCheck %s
+
+; Testing operations in soft-float mode
define double @foo() #0 {
entry:
%a = alloca double, align 8
diff --git a/test/CodeGen/PowerPC/pr17168.ll b/test/CodeGen/PowerPC/pr17168.ll
index b1bac59c9ce1..852ebfec576c 100644
--- a/test/CodeGen/PowerPC/pr17168.ll
+++ b/test/CodeGen/PowerPC/pr17168.ll
@@ -54,11 +54,10 @@ attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!438, !464}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 190311)", isOptimized: true, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !298, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 190311)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !298, imports: !2)
!1 = !DIFile(filename: "bt.c", directory: "/home/hfinkel/src/NPB2.3-omp-C/BT")
!2 = !{}
-!3 = !{!4, !82, !102, !114, !132, !145, !154, !155, !162, !183, !200, !201, !207, !208, !215, !221, !230, !238, !246, !255, !260, !261, !268, !274, !279, !280, !287, !293}
-!4 = distinct !DISubprogram(name: "main", line: 74, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 74, file: !1, scope: !5, type: !6, variables: !12)
+!4 = distinct !DISubprogram(name: "main", line: 74, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 74, file: !1, scope: !5, type: !6, variables: !12)
!5 = !DIFile(filename: "bt.c", directory: "/home/hfinkel/src/NPB2.3-omp-C/BT")
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8, !9}
@@ -136,7 +135,7 @@ attributes #1 = { nounwind readnone }
!79 = !DICompositeType(tag: DW_TAG_array_type, size: 160, align: 8, baseType: !11, elements: !80)
!80 = !{!81}
!81 = !DISubrange(count: 20)
-!82 = distinct !DISubprogram(name: "verify", line: 2388, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2388, file: !1, scope: !5, type: !83, variables: !86)
+!82 = distinct !DISubprogram(name: "verify", line: 2388, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2388, file: !1, scope: !5, type: !83, variables: !86)
!83 = !DISubroutineType(types: !84)
!84 = !{null, !8, !10, !85}
!85 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !24)
@@ -156,7 +155,7 @@ attributes #1 = { nounwind readnone }
!99 = !DILocalVariable(name: "xcr", line: 2398, scope: !82, file: !5, type: !91)
!100 = !DILocalVariable(name: "dtref", line: 2398, scope: !82, file: !5, type: !20)
!101 = !DILocalVariable(name: "m", line: 2399, scope: !82, file: !5, type: !8)
-!102 = distinct !DISubprogram(name: "rhs_norm", line: 266, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 266, file: !1, scope: !5, type: !103, variables: !106)
+!102 = distinct !DISubprogram(name: "rhs_norm", line: 266, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 266, file: !1, scope: !5, type: !103, variables: !106)
!103 = !DISubroutineType(types: !104)
!104 = !{null, !105}
!105 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !20)
@@ -168,7 +167,7 @@ attributes #1 = { nounwind readnone }
!111 = !DILocalVariable(name: "d", line: 271, scope: !102, file: !5, type: !8)
!112 = !DILocalVariable(name: "m", line: 271, scope: !102, file: !5, type: !8)
!113 = !DILocalVariable(name: "add", line: 272, scope: !102, file: !5, type: !20)
-!114 = distinct !DISubprogram(name: "compute_rhs", line: 1767, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1767, file: !1, scope: !5, type: !115, variables: !117)
+!114 = distinct !DISubprogram(name: "compute_rhs", line: 1767, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1767, file: !1, scope: !5, type: !115, variables: !117)
!115 = !DISubroutineType(types: !116)
!116 = !{null}
!117 = !{!118, !119, !120, !121, !122, !123, !124, !125, !126, !127, !128, !129, !130, !131}
@@ -186,7 +185,7 @@ attributes #1 = { nounwind readnone }
!129 = !DILocalVariable(name: "wijk", line: 1770, scope: !114, file: !5, type: !20)
!130 = !DILocalVariable(name: "wp1", line: 1770, scope: !114, file: !5, type: !20)
!131 = !DILocalVariable(name: "wm1", line: 1770, scope: !114, file: !5, type: !20)
-!132 = distinct !DISubprogram(name: "error_norm", line: 225, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 225, file: !1, scope: !5, type: !103, variables: !133)
+!132 = distinct !DISubprogram(name: "error_norm", line: 225, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 225, file: !1, scope: !5, type: !103, variables: !133)
!133 = !{!134, !135, !136, !137, !138, !139, !140, !141, !142, !143, !144}
!134 = !DILocalVariable(name: "rms", line: 225, arg: 1, scope: !132, file: !5, type: !105)
!135 = !DILocalVariable(name: "i", line: 232, scope: !132, file: !5, type: !8)
@@ -199,7 +198,7 @@ attributes #1 = { nounwind readnone }
!142 = !DILocalVariable(name: "zeta", line: 233, scope: !132, file: !5, type: !20)
!143 = !DILocalVariable(name: "u_exact", line: 233, scope: !132, file: !5, type: !91)
!144 = !DILocalVariable(name: "add", line: 233, scope: !132, file: !5, type: !20)
-!145 = distinct !DISubprogram(name: "exact_solution", line: 643, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 644, file: !1, scope: !5, type: !146, variables: !148)
+!145 = distinct !DISubprogram(name: "exact_solution", line: 643, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 644, file: !1, scope: !5, type: !146, variables: !148)
!146 = !DISubroutineType(types: !147)
!147 = !{null, !20, !20, !20, !105}
!148 = !{!149, !150, !151, !152, !153}
@@ -208,15 +207,15 @@ attributes #1 = { nounwind readnone }
!151 = !DILocalVariable(name: "zeta", line: 643, arg: 3, scope: !145, file: !5, type: !20)
!152 = !DILocalVariable(name: "dtemp", line: 644, arg: 4, scope: !145, file: !5, type: !105)
!153 = !DILocalVariable(name: "m", line: 653, scope: !145, file: !5, type: !8)
-!154 = distinct !DISubprogram(name: "set_constants", line: 2191, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2191, file: !1, scope: !5, type: !115, variables: !2)
-!155 = distinct !DISubprogram(name: "lhsinit", line: 855, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 855, file: !1, scope: !5, type: !115, variables: !156)
+!154 = distinct !DISubprogram(name: "set_constants", line: 2191, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2191, file: !1, scope: !5, type: !115, variables: !2)
+!155 = distinct !DISubprogram(name: "lhsinit", line: 855, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 855, file: !1, scope: !5, type: !115, variables: !156)
!156 = !{!157, !158, !159, !160, !161}
!157 = !DILocalVariable(name: "i", line: 857, scope: !155, file: !5, type: !8)
!158 = !DILocalVariable(name: "j", line: 857, scope: !155, file: !5, type: !8)
!159 = !DILocalVariable(name: "k", line: 857, scope: !155, file: !5, type: !8)
!160 = !DILocalVariable(name: "m", line: 857, scope: !155, file: !5, type: !8)
!161 = !DILocalVariable(name: "n", line: 857, scope: !155, file: !5, type: !8)
-!162 = distinct !DISubprogram(name: "initialize", line: 669, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 669, file: !1, scope: !5, type: !115, variables: !163)
+!162 = distinct !DISubprogram(name: "initialize", line: 669, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 669, file: !1, scope: !5, type: !115, variables: !163)
!163 = !{!164, !165, !166, !167, !168, !169, !170, !171, !172, !173, !174, !179, !180, !181, !182}
!164 = !DILocalVariable(name: "i", line: 679, scope: !162, file: !5, type: !8)
!165 = !DILocalVariable(name: "j", line: 679, scope: !162, file: !5, type: !8)
@@ -237,7 +236,7 @@ attributes #1 = { nounwind readnone }
!180 = !DILocalVariable(name: "Peta", line: 680, scope: !162, file: !5, type: !20)
!181 = !DILocalVariable(name: "Pzeta", line: 680, scope: !162, file: !5, type: !20)
!182 = !DILocalVariable(name: "temp", line: 680, scope: !162, file: !5, type: !91)
-!183 = distinct !DISubprogram(name: "exact_rhs", line: 301, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 301, file: !1, scope: !5, type: !115, variables: !184)
+!183 = distinct !DISubprogram(name: "exact_rhs", line: 301, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 301, file: !1, scope: !5, type: !115, variables: !184)
!184 = !{!185, !186, !187, !188, !189, !190, !191, !192, !193, !194, !195, !196, !197, !198, !199}
!185 = !DILocalVariable(name: "dtemp", line: 310, scope: !183, file: !5, type: !91)
!186 = !DILocalVariable(name: "xi", line: 310, scope: !183, file: !5, type: !20)
@@ -254,28 +253,28 @@ attributes #1 = { nounwind readnone }
!197 = !DILocalVariable(name: "jm1", line: 311, scope: !183, file: !5, type: !8)
!198 = !DILocalVariable(name: "km1", line: 311, scope: !183, file: !5, type: !8)
!199 = !DILocalVariable(name: "kp1", line: 311, scope: !183, file: !5, type: !8)
-!200 = distinct !DISubprogram(name: "adi", line: 210, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 210, file: !1, scope: !5, type: !115, variables: !2)
-!201 = distinct !DISubprogram(name: "add", line: 187, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 187, file: !1, scope: !5, type: !115, variables: !202)
+!200 = distinct !DISubprogram(name: "adi", line: 210, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 210, file: !1, scope: !5, type: !115, variables: !2)
+!201 = distinct !DISubprogram(name: "add", line: 187, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 187, file: !1, scope: !5, type: !115, variables: !202)
!202 = !{!203, !204, !205, !206}
!203 = !DILocalVariable(name: "i", line: 193, scope: !201, file: !5, type: !8)
!204 = !DILocalVariable(name: "j", line: 193, scope: !201, file: !5, type: !8)
!205 = !DILocalVariable(name: "k", line: 193, scope: !201, file: !5, type: !8)
!206 = !DILocalVariable(name: "m", line: 193, scope: !201, file: !5, type: !8)
-!207 = distinct !DISubprogram(name: "z_solve", line: 3457, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3457, file: !1, scope: !5, type: !115, variables: !2)
-!208 = distinct !DISubprogram(name: "z_backsubstitute", line: 3480, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3480, file: !1, scope: !5, type: !115, variables: !209)
+!207 = distinct !DISubprogram(name: "z_solve", line: 3457, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3457, file: !1, scope: !5, type: !115, variables: !2)
+!208 = distinct !DISubprogram(name: "z_backsubstitute", line: 3480, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3480, file: !1, scope: !5, type: !115, variables: !209)
!209 = !{!210, !211, !212, !213, !214}
!210 = !DILocalVariable(name: "i", line: 3492, scope: !208, file: !5, type: !8)
!211 = !DILocalVariable(name: "j", line: 3492, scope: !208, file: !5, type: !8)
!212 = !DILocalVariable(name: "k", line: 3492, scope: !208, file: !5, type: !8)
!213 = !DILocalVariable(name: "m", line: 3492, scope: !208, file: !5, type: !8)
!214 = !DILocalVariable(name: "n", line: 3492, scope: !208, file: !5, type: !8)
-!215 = distinct !DISubprogram(name: "z_solve_cell", line: 3512, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3512, file: !1, scope: !5, type: !115, variables: !216)
+!215 = distinct !DISubprogram(name: "z_solve_cell", line: 3512, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3512, file: !1, scope: !5, type: !115, variables: !216)
!216 = !{!217, !218, !219, !220}
!217 = !DILocalVariable(name: "i", line: 3527, scope: !215, file: !5, type: !8)
!218 = !DILocalVariable(name: "j", line: 3527, scope: !215, file: !5, type: !8)
!219 = !DILocalVariable(name: "k", line: 3527, scope: !215, file: !5, type: !8)
!220 = !DILocalVariable(name: "ksize", line: 3527, scope: !215, file: !5, type: !8)
-!221 = distinct !DISubprogram(name: "binvrhs", line: 3154, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3154, file: !1, scope: !5, type: !222, variables: !225)
+!221 = distinct !DISubprogram(name: "binvrhs", line: 3154, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3154, file: !1, scope: !5, type: !222, variables: !225)
!222 = !DISubroutineType(types: !223)
!223 = !{null, !224, !105}
!224 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !91)
@@ -284,7 +283,7 @@ attributes #1 = { nounwind readnone }
!227 = !DILocalVariable(name: "r", line: 3154, arg: 2, scope: !221, file: !5, type: !105)
!228 = !DILocalVariable(name: "pivot", line: 3159, scope: !221, file: !5, type: !20)
!229 = !DILocalVariable(name: "coeff", line: 3159, scope: !221, file: !5, type: !20)
-!230 = distinct !DISubprogram(name: "matmul_sub", line: 2841, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2842, file: !1, scope: !5, type: !231, variables: !233)
+!230 = distinct !DISubprogram(name: "matmul_sub", line: 2841, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2842, file: !1, scope: !5, type: !231, variables: !233)
!231 = !DISubroutineType(types: !232)
!232 = !{null, !224, !224, !224}
!233 = !{!234, !235, !236, !237}
@@ -292,7 +291,7 @@ attributes #1 = { nounwind readnone }
!235 = !DILocalVariable(name: "bblock", line: 2841, arg: 2, scope: !230, file: !5, type: !224)
!236 = !DILocalVariable(name: "cblock", line: 2842, arg: 3, scope: !230, file: !5, type: !224)
!237 = !DILocalVariable(name: "j", line: 2851, scope: !230, file: !5, type: !8)
-!238 = distinct !DISubprogram(name: "matvec_sub", line: 2814, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2814, file: !1, scope: !5, type: !239, variables: !241)
+!238 = distinct !DISubprogram(name: "matvec_sub", line: 2814, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2814, file: !1, scope: !5, type: !239, variables: !241)
!239 = !DISubroutineType(types: !240)
!240 = !{null, !224, !105, !105}
!241 = !{!242, !243, !244, !245}
@@ -300,7 +299,7 @@ attributes #1 = { nounwind readnone }
!243 = !DILocalVariable(name: "avec", line: 2814, arg: 2, scope: !238, file: !5, type: !105)
!244 = !DILocalVariable(name: "bvec", line: 2814, arg: 3, scope: !238, file: !5, type: !105)
!245 = !DILocalVariable(name: "i", line: 2823, scope: !238, file: !5, type: !8)
-!246 = distinct !DISubprogram(name: "binvcrhs", line: 2885, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2885, file: !1, scope: !5, type: !247, variables: !249)
+!246 = distinct !DISubprogram(name: "binvcrhs", line: 2885, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2885, file: !1, scope: !5, type: !247, variables: !249)
!247 = !DISubroutineType(types: !248)
!248 = !{null, !224, !224, !105}
!249 = !{!250, !251, !252, !253, !254}
@@ -309,45 +308,45 @@ attributes #1 = { nounwind readnone }
!252 = !DILocalVariable(name: "r", line: 2885, arg: 3, scope: !246, file: !5, type: !105)
!253 = !DILocalVariable(name: "pivot", line: 2890, scope: !246, file: !5, type: !20)
!254 = !DILocalVariable(name: "coeff", line: 2890, scope: !246, file: !5, type: !20)
-!255 = distinct !DISubprogram(name: "lhsz", line: 1475, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1475, file: !1, scope: !5, type: !115, variables: !256)
+!255 = distinct !DISubprogram(name: "lhsz", line: 1475, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1475, file: !1, scope: !5, type: !115, variables: !256)
!256 = !{!257, !258, !259}
!257 = !DILocalVariable(name: "i", line: 1484, scope: !255, file: !5, type: !8)
!258 = !DILocalVariable(name: "j", line: 1484, scope: !255, file: !5, type: !8)
!259 = !DILocalVariable(name: "k", line: 1484, scope: !255, file: !5, type: !8)
-!260 = distinct !DISubprogram(name: "y_solve", line: 3299, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3299, file: !1, scope: !5, type: !115, variables: !2)
-!261 = distinct !DISubprogram(name: "y_backsubstitute", line: 3323, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3323, file: !1, scope: !5, type: !115, variables: !262)
+!260 = distinct !DISubprogram(name: "y_solve", line: 3299, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3299, file: !1, scope: !5, type: !115, variables: !2)
+!261 = distinct !DISubprogram(name: "y_backsubstitute", line: 3323, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3323, file: !1, scope: !5, type: !115, variables: !262)
!262 = !{!263, !264, !265, !266, !267}
!263 = !DILocalVariable(name: "i", line: 3335, scope: !261, file: !5, type: !8)
!264 = !DILocalVariable(name: "j", line: 3335, scope: !261, file: !5, type: !8)
!265 = !DILocalVariable(name: "k", line: 3335, scope: !261, file: !5, type: !8)
!266 = !DILocalVariable(name: "m", line: 3335, scope: !261, file: !5, type: !8)
!267 = !DILocalVariable(name: "n", line: 3335, scope: !261, file: !5, type: !8)
-!268 = distinct !DISubprogram(name: "y_solve_cell", line: 3355, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3355, file: !1, scope: !5, type: !115, variables: !269)
+!268 = distinct !DISubprogram(name: "y_solve_cell", line: 3355, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3355, file: !1, scope: !5, type: !115, variables: !269)
!269 = !{!270, !271, !272, !273}
!270 = !DILocalVariable(name: "i", line: 3370, scope: !268, file: !5, type: !8)
!271 = !DILocalVariable(name: "j", line: 3370, scope: !268, file: !5, type: !8)
!272 = !DILocalVariable(name: "k", line: 3370, scope: !268, file: !5, type: !8)
!273 = !DILocalVariable(name: "jsize", line: 3370, scope: !268, file: !5, type: !8)
-!274 = distinct !DISubprogram(name: "lhsy", line: 1181, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1181, file: !1, scope: !5, type: !115, variables: !275)
+!274 = distinct !DISubprogram(name: "lhsy", line: 1181, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1181, file: !1, scope: !5, type: !115, variables: !275)
!275 = !{!276, !277, !278}
!276 = !DILocalVariable(name: "i", line: 1190, scope: !274, file: !5, type: !8)
!277 = !DILocalVariable(name: "j", line: 1190, scope: !274, file: !5, type: !8)
!278 = !DILocalVariable(name: "k", line: 1190, scope: !274, file: !5, type: !8)
-!279 = distinct !DISubprogram(name: "x_solve", line: 2658, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2658, file: !1, scope: !5, type: !115, variables: !2)
-!280 = distinct !DISubprogram(name: "x_backsubstitute", line: 2684, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2684, file: !1, scope: !5, type: !115, variables: !281)
+!279 = distinct !DISubprogram(name: "x_solve", line: 2658, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2658, file: !1, scope: !5, type: !115, variables: !2)
+!280 = distinct !DISubprogram(name: "x_backsubstitute", line: 2684, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2684, file: !1, scope: !5, type: !115, variables: !281)
!281 = !{!282, !283, !284, !285, !286}
!282 = !DILocalVariable(name: "i", line: 2696, scope: !280, file: !5, type: !8)
!283 = !DILocalVariable(name: "j", line: 2696, scope: !280, file: !5, type: !8)
!284 = !DILocalVariable(name: "k", line: 2696, scope: !280, file: !5, type: !8)
!285 = !DILocalVariable(name: "m", line: 2696, scope: !280, file: !5, type: !8)
!286 = !DILocalVariable(name: "n", line: 2696, scope: !280, file: !5, type: !8)
-!287 = distinct !DISubprogram(name: "x_solve_cell", line: 2716, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2716, file: !1, scope: !5, type: !115, variables: !288)
+!287 = distinct !DISubprogram(name: "x_solve_cell", line: 2716, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2716, file: !1, scope: !5, type: !115, variables: !288)
!288 = !{!289, !290, !291, !292}
!289 = !DILocalVariable(name: "i", line: 2728, scope: !287, file: !5, type: !8)
!290 = !DILocalVariable(name: "j", line: 2728, scope: !287, file: !5, type: !8)
!291 = !DILocalVariable(name: "k", line: 2728, scope: !287, file: !5, type: !8)
!292 = !DILocalVariable(name: "isize", line: 2728, scope: !287, file: !5, type: !8)
-!293 = distinct !DISubprogram(name: "lhsx", line: 898, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 898, file: !1, scope: !5, type: !115, variables: !294)
+!293 = distinct !DISubprogram(name: "lhsx", line: 898, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 898, file: !1, scope: !5, type: !115, variables: !294)
!294 = !{!295, !296, !297}
!295 = !DILocalVariable(name: "i", line: 907, scope: !293, file: !5, type: !8)
!296 = !DILocalVariable(name: "j", line: 907, scope: !293, file: !5, type: !8)
diff --git a/test/CodeGen/PowerPC/pr24546.ll b/test/CodeGen/PowerPC/pr24546.ll
index 06f6bc93da99..2519bb623bc4 100644
--- a/test/CodeGen/PowerPC/pr24546.ll
+++ b/test/CodeGen/PowerPC/pr24546.ll
@@ -56,13 +56,12 @@ attributes #3 = { nounwind }
!llvm.module.flags = !{!29, !30}
!llvm.ident = !{!31}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (git://github.com/llvm-mirror/clang.git e0848b6353721eb1b278a5bbea257bbf6316251e) (git://github.com/llvm-mirror/llvm.git 8724a428dfd5e78d7865bb01783708e83f9ed128)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3, subprograms: !5, globals: !23)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (git://github.com/llvm-mirror/clang.git e0848b6353721eb1b278a5bbea257bbf6316251e) (git://github.com/llvm-mirror/llvm.git 8724a428dfd5e78d7865bb01783708e83f9ed128)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !23)
!1 = !DIFile(filename: "testcase.i", directory: "/tmp/glibc.build")
!2 = !{}
!3 = !{!4}
!4 = !DIBasicType(name: "double", size: 64, align: 64, encoding: DW_ATE_float)
-!5 = !{!6, !18}
-!6 = distinct !DISubprogram(name: "_php_math_round", scope: !1, file: !1, line: 15, type: !7, isLocal: false, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: true, variables: !10)
+!6 = distinct !DISubprogram(name: "_php_math_round", scope: !1, file: !1, line: 15, type: !7, isLocal: false, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !10)
!7 = !DISubroutineType(types: !8)
!8 = !{!4, !4, !9, !9}
!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -74,7 +73,7 @@ attributes #3 = { nounwind }
!15 = !DILocalVariable(name: "f2", scope: !6, file: !1, line: 17, type: !4)
!16 = !DILocalVariable(name: "tmp_value", scope: !6, file: !1, line: 18, type: !4)
!17 = !DILocalVariable(name: "precision_places", scope: !6, file: !1, line: 19, type: !9)
-!18 = distinct !DISubprogram(name: "php_intpow10", scope: !1, file: !1, line: 1, type: !19, isLocal: true, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, variables: !21)
+!18 = distinct !DISubprogram(name: "php_intpow10", scope: !1, file: !1, line: 1, type: !19, isLocal: true, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !21)
!19 = !DISubroutineType(types: !20)
!20 = !{!4, !9}
!21 = !{!22}
diff --git a/test/CodeGen/PowerPC/pr25802.ll b/test/CodeGen/PowerPC/pr25802.ll
deleted file mode 100644
index 0631850be5fa..000000000000
--- a/test/CodeGen/PowerPC/pr25802.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-; CHECK: .long .Ltmp6-.Ltmp12 # Call between .Ltmp12 and .Ltmp6
-
-; We used to crash in filetype=obj when computing a negative value.
-; RUN: llc -filetype=obj < %s
-
-target triple = "powerpc--netbsd"
-@_ZTI1I = external constant { i8*, i8* }
-define void @f(i8 %foo, i32 %bar) personality i8* bitcast (void ()* @g to i8*) {
- invoke void @g()
- to label %try.cont unwind label %lpad
-lpad: ; preds = %0
- %tmp = landingpad { i8*, i32 }
- catch i8* bitcast ({ i8*, i8* }* @_ZTI1I to i8*)
- br i1 undef, label %catch10, label %catch
-catch10: ; preds = %lpad
- %tmp8 = load i32, i32* undef, align 4
- %conv.i.i = zext i8 %foo to i32
- %cond.i.i = select i1 undef, i32 %conv.i.i, i32 %tmp8
- invoke void @_Z24__put_character_sequenceIccEvR1AIT_T0_Ej(i32 %cond.i.i)
- to label %invoke.cont20 unwind label %lpad15
-invoke.cont20: ; preds = %catch10
- ret void
-try.cont: ; preds = %0
- ret void
-catch: ; preds = %lpad
- %tmp14 = load i32, i32* undef, align 4
- %conv.i.i34 = zext i8 %foo to i32
- %cond.i.i35 = select i1 undef, i32 %conv.i.i34, i32 %tmp14
- invoke void @_Z24__put_character_sequenceIccEvR1AIT_T0_Ej(i32 %cond.i.i35)
- to label %invoke.cont8 unwind label %lpad3
-invoke.cont8: ; preds = %call2.i.i.noexc36
- ret void
-lpad3: ; preds = %call2.i.i.noexc36, %catch
- %tmp16 = landingpad { i8*, i32 }
- cleanup
- invoke void @g()
- to label %eh.resume unwind label %terminate.lpad
-lpad15: ; preds = %catch10
- %tmp19 = landingpad { i8*, i32 }
- cleanup
- invoke void @g()
- to label %eh.resume unwind label %terminate.lpad
-eh.resume: ; preds = %lpad15, %lpad3
- ret void
-terminate.lpad: ; preds = %lpad15, %lpad3
- %tmp22 = landingpad { i8*, i32 }
- catch i8* null
- ret void
-}
-declare void @g()
-declare void @_Z24__put_character_sequenceIccEvR1AIT_T0_Ej(i32)
diff --git a/test/CodeGen/PowerPC/pr26180.ll b/test/CodeGen/PowerPC/pr26180.ll
new file mode 100644
index 000000000000..e4cbcb8725d5
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr26180.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mcpu=generic -mtriple=powerpc64le-unknown-unknown -O0 < %s | FileCheck %s --check-prefix=GENERIC
+; RUN: llc -mcpu=ppc -mtriple=powerpc64le-unknown-unknown -O0 < %s | FileCheck %s
+
+define i32 @bad(double %x) {
+ %1 = fptoui double %x to i32
+ ret i32 %1
+}
+
+; CHECK: fctidz 1, 1
+; CHECK: stfd 1, [[OFF:.*]](1)
+; CHECK: lwz {{[0-9]*}}, [[OFF]](1)
+; GENERIC: fctiwuz 1, 1
+; GENERIC: stfd 1, [[OFF:.*]](1)
+; GENERIC: lwz {{[0-9]*}}, [[OFF]](1)
diff --git a/test/CodeGen/PowerPC/pr26378.ll b/test/CodeGen/PowerPC/pr26378.ll
new file mode 100644
index 000000000000..e5e20553f52b
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr26378.ll
@@ -0,0 +1,6 @@
+; RUN: llc -compile-twice -filetype obj \
+; RUN: -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 < %s
+@foo = common global i32 0, align 4
+define i8* @blah() #0 {
+ ret i8* bitcast (i32* @foo to i8*)
+}
diff --git a/test/CodeGen/PowerPC/pr26617.ll b/test/CodeGen/PowerPC/pr26617.ll
new file mode 100644
index 000000000000..474d7b94aaf8
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr26617.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc-unknown-unknown < %s | FileCheck %s
+define i32 @test(<4 x i32> %v, i32 %elem) #0 {
+entry:
+ %vecext = extractelement <4 x i32> %v, i32 %elem
+ ret i32 %vecext
+}
+; CHECK: stxvw4x 34,
+; CHECK: lwzx 3,
+
+define float @test2(i32 signext %a) {
+entry:
+ %conv = bitcast i32 %a to float
+ ret float %conv
+}
+; CHECK-NOT: mtvsr
diff --git a/test/CodeGen/PowerPC/pr26690.ll b/test/CodeGen/PowerPC/pr26690.ll
index 3e7662409d51..e1c3c496ed40 100644
--- a/test/CodeGen/PowerPC/pr26690.ll
+++ b/test/CodeGen/PowerPC/pr26690.ll
@@ -35,9 +35,9 @@ while.body.lr.ph: ; preds = %while.cond.preheade
while.body: ; preds = %while.body.backedge, %while.body.lr.ph
switch i32 %.pre, label %while.body.backedge [
i32 0, label %sw.bb1
- i32 8, label %sw.bb1
- i32 6, label %sw.bb1
- i32 24, label %while.cond.backedge
+ i32 80, label %sw.bb1
+ i32 60, label %sw.bb1
+ i32 240, label %while.cond.backedge
]
while.body.backedge: ; preds = %while.body, %while.cond.backedge
@@ -101,7 +101,7 @@ if.end16: ; preds = %entry, %if.end13, %
ret i32 2
}
-; CHECK: mfcr {{[0-9]+}}
+; CHECK: mfocrf {{[0-9]+}}
!llvm.ident = !{!0}
diff --git a/test/CodeGen/PowerPC/pr27078.ll b/test/CodeGen/PowerPC/pr27078.ll
new file mode 100644
index 000000000000..b1fdbbde6929
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr27078.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx < %s | FileCheck %s
+
+define <4 x float> @bar(float* %p, float* %q) {
+ %1 = bitcast float* %p to <12 x float>*
+ %2 = bitcast float* %q to <12 x float>*
+ %3 = load <12 x float>, <12 x float>* %1, align 16
+ %4 = load <12 x float>, <12 x float>* %2, align 16
+ %5 = fsub <12 x float> %4, %3
+ %6 = shufflevector <12 x float> %5, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ ret <4 x float> %6
+
+; CHECK: xxspltw
+; CHECK: vmrghw
+; CHECK: vsldoi
+}
diff --git a/test/CodeGen/PowerPC/pr27350.ll b/test/CodeGen/PowerPC/pr27350.ll
new file mode 100644
index 000000000000..a3f35b2e41a6
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr27350.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=ppc64le -mtriple=powerpc64le-unknown-linux-gnu < %s
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+; Function Attrs: nounwind
+define internal fastcc void @foo() unnamed_addr #1 align 2 {
+entry:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* null, i64 16, i32 8, i1 false)
+ %0 = load <2 x i64>, <2 x i64>* null, align 8
+ %1 = extractelement <2 x i64> %0, i32 1
+ %.fca.1.insert159.i = insertvalue [2 x i64] undef, i64 %1, 1
+ tail call fastcc void @bar([2 x i64] undef, [2 x i64] %.fca.1.insert159.i) #2
+ unreachable
+}
+
+; Function Attrs: nounwind
+declare fastcc void @bar([2 x i64], [2 x i64]) unnamed_addr #1 align 2
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 (trunk) (llvm/trunk 266222)"}
diff --git a/test/CodeGen/PowerPC/pr28130.ll b/test/CodeGen/PowerPC/pr28130.ll
new file mode 100644
index 000000000000..e549c46d9568
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr28130.ll
@@ -0,0 +1,70 @@
+; RUN: llc -O0 < %s | FileCheck %s
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%StructA = type { double, double, double, double, double, double, double, double }
+
+define void @Test(%StructA* %tmp) unnamed_addr #0 align 2 {
+; CHECK-LABEL: Test:
+; CHECK: lxvd2x
+; CHECK-NEXT: xxswapd
+; CHECK: lxvd2x
+; CHECK-NEXT: xxswapd
+; CHECK: lxvd2x
+; CHECK-NEXT: xxswapd
+; CHECK: lxvd2x
+; CHECK-NEXT: xxswapd
+; CHECK: xxswapd [[OUTPUT:[0-9]+]]
+; CHECK-NEXT: stxvd2x [[OUTPUT]]
+bb:
+ %tmp2 = getelementptr inbounds %StructA, %StructA* %tmp, i64 0, i32 0
+ %tmp4 = bitcast %StructA* %tmp to <2 x double>*
+ %tmp5 = getelementptr inbounds %StructA, %StructA* %tmp, i64 0, i32 2
+ %tmp9 = getelementptr inbounds %StructA, %StructA* %tmp, i64 0, i32 4
+ %tmp11 = getelementptr inbounds %StructA, %StructA* %tmp, i64 0, i32 5
+ %tmp13 = getelementptr inbounds %StructA, %StructA* %tmp, i64 0, i32 6
+ %tmp15 = getelementptr inbounds %StructA, %StructA* %tmp, i64 0, i32 7
+ %tmp18 = load double, double* %tmp2, align 16
+ %tmp19 = load double, double* %tmp11, align 8
+ %tmp20 = load double, double* %tmp9, align 16
+ %tmp21 = fsub double 1.210000e+04, %tmp20
+ %tmp22 = fmul double %tmp18, %tmp21
+ %tmp23 = fadd double %tmp20, %tmp22
+ %tmp24 = load double, double* %tmp13, align 16
+ %tmp25 = fsub double 1.000000e+02, %tmp24
+ %tmp26 = fmul double %tmp18, %tmp25
+ %tmp27 = fadd double %tmp24, %tmp26
+ %tmp28 = load double, double* %tmp15, align 8
+ %tmp29 = insertelement <2 x double> undef, double %tmp19, i32 0
+ %tmp30 = insertelement <2 x double> %tmp29, double %tmp28, i32 1
+ %tmp31 = fsub <2 x double> <double 1.100000e+04, double 1.100000e+02>, %tmp30
+ %tmp32 = insertelement <2 x double> undef, double %tmp18, i32 0
+ %tmp33 = insertelement <2 x double> %tmp32, double %tmp18, i32 1
+ %tmp34 = fmul <2 x double> %tmp33, %tmp31
+ %tmp35 = fadd <2 x double> %tmp30, %tmp34
+ %tmp36 = bitcast double* %tmp5 to <2 x double>*
+ %tmp37 = load <2 x double>, <2 x double>* %tmp36, align 16
+ %tmp38 = fsub <2 x double> <double 1.000000e+00, double 1.000000e+04>, %tmp37
+ %tmp39 = fmul <2 x double> %tmp33, %tmp38
+ %tmp40 = fadd <2 x double> %tmp37, %tmp39
+ %tmp41 = fsub <2 x double> <double 1.000000e+00, double 1.000000e+04>, %tmp40
+ %tmp42 = fmul <2 x double> %tmp33, %tmp41
+ %tmp43 = fadd <2 x double> %tmp40, %tmp42
+ %tmp44 = fsub <2 x double> <double 1.200000e+04, double 1.200000e+02>, %tmp35
+ %tmp45 = fmul <2 x double> %tmp33, %tmp44
+ %tmp46 = fadd <2 x double> %tmp35, %tmp45
+ %tmp48 = fsub double 1.440000e+04, %tmp23
+ %tmp49 = fmul double %tmp18, %tmp48
+ %tmp50 = fadd double %tmp23, %tmp49
+ store double %tmp50, double* %tmp9, align 16
+ %tmp51 = fsub double 1.000000e+02, %tmp27
+ %tmp52 = fmul double %tmp18, %tmp51
+ %tmp53 = fadd double %tmp27, %tmp52
+ store double %tmp53, double* %tmp13, align 16
+ %tmp54 = extractelement <2 x double> %tmp46, i32 1
+ store double %tmp54, double* %tmp15, align 8
+ %tmp55 = bitcast double* %tmp5 to <2 x double>*
+ store <2 x double> %tmp43, <2 x double>* %tmp55, align 16
+ ret void
+}
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/preincprep-invoke.ll b/test/CodeGen/PowerPC/preincprep-invoke.ll
index 8dbce9a3a08e..6e97468f07fa 100644
--- a/test/CodeGen/PowerPC/preincprep-invoke.ll
+++ b/test/CodeGen/PowerPC/preincprep-invoke.ll
@@ -11,12 +11,12 @@ declare void @_ZN13CStdOutStream5FlushEv()
declare i32 @__gxx_personality_v0(...)
-define void @_Z11GetPasswordP13CStdOutStreamb() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+define void @_Z11GetPasswordP13CStdOutStreamb(i1 %cond, i8 %arg1, i8* %arg2) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
entry:
br label %for.cond.i.i
for.cond.i.i: ; preds = %for.cond.i.i, %entry
- br i1 undef, label %_ZN11CStringBaseIcEC2EPKc.exit.critedge, label %for.cond.i.i
+ br i1 %cond, label %_ZN11CStringBaseIcEC2EPKc.exit.critedge, label %for.cond.i.i
_ZN11CStringBaseIcEC2EPKc.exit.critedge: ; preds = %for.cond.i.i
invoke void @_ZN13CStdOutStreamlsEPKc()
@@ -37,11 +37,13 @@ for.cond.i.i30: ; preds = %for.cond.i.i30, %in
%indvars.iv.i.i26 = phi i64 [ %indvars.iv.next.i.i29, %for.cond.i.i30 ], [ 0, %invoke.cont4 ]
%arrayidx.i.i27 = getelementptr inbounds i8, i8* %call7, i64 %indvars.iv.i.i26
%0 = load i8, i8* %arrayidx.i.i27, align 1
+ %1 = add i8 %0, %arg1
+ store i8 %1, i8* %arg2, align 1
%indvars.iv.next.i.i29 = add nuw nsw i64 %indvars.iv.i.i26, 1
br label %for.cond.i.i30
lpad: ; preds = %invoke.cont4, %invoke.cont, %_ZN11CStringBaseIcEC2EPKc.exit.critedge
- %1 = landingpad { i8*, i32 }
+ %2 = landingpad { i8*, i32 }
cleanup
resume { i8*, i32 } undef
}
diff --git a/test/CodeGen/PowerPC/qpx-bv-sint.ll b/test/CodeGen/PowerPC/qpx-bv-sint.ll
index 0bc14ed4351a..9682e756c425 100644
--- a/test/CodeGen/PowerPC/qpx-bv-sint.ll
+++ b/test/CodeGen/PowerPC/qpx-bv-sint.ll
@@ -2,13 +2,13 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "powerpc64-bgq-linux"
; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
-define void @s452() nounwind {
+define void @s452(i32 %inp1) nounwind {
entry:
br label %for.body4
for.body4: ; preds = %for.body4, %entry
- %conv.4 = sitofp i32 undef to double
- %conv.5 = sitofp i32 undef to double
+ %conv.4 = sitofp i32 %inp1 to double
+ %conv.5 = sitofp i32 %inp1 to double
%mul.4.v.i0.1 = insertelement <2 x double> undef, double %conv.4, i32 0
%mul.4.v.i0.2 = insertelement <2 x double> %mul.4.v.i0.1, double %conv.5, i32 1
%mul.4 = fmul <2 x double> %mul.4.v.i0.2, undef
diff --git a/test/CodeGen/PowerPC/qpx-load-splat.ll b/test/CodeGen/PowerPC/qpx-load-splat.ll
new file mode 100644
index 000000000000..c76d5226b348
--- /dev/null
+++ b/test/CodeGen/PowerPC/qpx-load-splat.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: norecurse nounwind readonly
+define <4 x double> @foo(double* nocapture readonly %a) #0 {
+entry:
+ %0 = load double, double* %a, align 8
+ %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+ %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @foo
+; CHECK: lfd 1, 0(3)
+; CHECK: blr
+}
+
+define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 {
+entry:
+ %p = getelementptr double, double* %a, i64 %idx
+ %0 = load double, double* %p, align 8
+ %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+ %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @foox
+; CHECK: sldi [[REG1:[0-9]+]], 4, 3
+; CHECK: lfdx 1, 3, [[REG1]]
+; CHECK: blr
+}
+
+define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 {
+entry:
+ %p = getelementptr double, double* %a, i64 %idx
+ %0 = load double, double* %p, align 8
+ %vecinit.i = insertelement <4 x double> undef, double %0, i32 0
+ %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer
+ store double* %p, double** %pptr, align 8
+ ret <4 x double> %shuffle.i
+
+; CHECK-LABEL: @fooxu
+; CHECK: sldi [[REG1:[0-9]+]], 4, 3
+; CHECK: lfdux 1, 3, [[REG1]]
+; CHECK: std 3, 0(5)
+; CHECK: blr
+}
+
+define <4 x float> @foof(float* nocapture readonly %a) #0 {
+entry:
+ %0 = load float, float* %a, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
+ %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %shuffle.i
+
+; CHECK-LABEL: @foof
+; CHECK: lfs 1, 0(3)
+; CHECK: blr
+}
+
+define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 {
+entry:
+ %p = getelementptr float, float* %a, i64 %idx
+ %0 = load float, float* %p, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
+ %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %shuffle.i
+
+; CHECK-LABEL: @foofx
+; CHECK: sldi [[REG1:[0-9]+]], 4, 2
+; CHECK: lfsx 1, 3, [[REG1]]
+; CHECK: blr
+}
+
+attributes #0 = { norecurse nounwind readonly "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+
diff --git a/test/CodeGen/PowerPC/qpx-s-sel.ll b/test/CodeGen/PowerPC/qpx-s-sel.ll
index 09a615c4597d..6481fc6681a0 100644
--- a/test/CodeGen/PowerPC/qpx-s-sel.ll
+++ b/test/CodeGen/PowerPC/qpx-s-sel.ll
@@ -1,7 +1,6 @@
; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
target triple = "powerpc64-bgq-linux"
-@Q = constant <4 x i1> <i1 0, i1 undef, i1 1, i1 1>, align 16
@R = global <4 x i1> <i1 0, i1 0, i1 0, i1 0>, align 16
define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x i1> %c) nounwind readnone {
@@ -44,9 +43,9 @@ entry:
; blr
}
-define <4 x i1> @test4(<4 x i1> %a) nounwind {
+define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind {
entry:
- %q = load <4 x i1>, <4 x i1>* @Q, align 16
+ %q = load <4 x i1>, <4 x i1>* %t, align 16
%v = and <4 x i1> %a, %q
ret <4 x i1> %v
diff --git a/test/CodeGen/PowerPC/qpx-sel.ll b/test/CodeGen/PowerPC/qpx-sel.ll
index a375e6effbae..4b23df328a86 100644
--- a/test/CodeGen/PowerPC/qpx-sel.ll
+++ b/test/CodeGen/PowerPC/qpx-sel.ll
@@ -1,7 +1,6 @@
; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s
target triple = "powerpc64-bgq-linux"
-@Q = constant <4 x i1> <i1 0, i1 undef, i1 1, i1 1>, align 16
@R = global <4 x i1> <i1 0, i1 0, i1 0, i1 0>, align 16
define <4 x double> @test1(<4 x double> %a, <4 x double> %b, <4 x i1> %c) nounwind readnone {
@@ -48,9 +47,9 @@ entry:
; blr
}
-define <4 x i1> @test4(<4 x i1> %a) nounwind {
+define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind {
entry:
- %q = load <4 x i1>, <4 x i1>* @Q, align 16
+ %q = load <4 x i1>, <4 x i1>* %t, align 16
%v = and <4 x i1> %a, %q
ret <4 x i1> %v
diff --git a/test/CodeGen/PowerPC/qpx-split-vsetcc.ll b/test/CodeGen/PowerPC/qpx-split-vsetcc.ll
index c8cef0faeaa4..5bda3f625401 100644
--- a/test/CodeGen/PowerPC/qpx-split-vsetcc.ll
+++ b/test/CodeGen/PowerPC/qpx-split-vsetcc.ll
@@ -3,27 +3,18 @@ target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-bgq-linux"
; Function Attrs: nounwind
-define void @gsl_sf_legendre_Pl_deriv_array() #0 {
+define void @gsl_sf_legendre_Pl_deriv_array(<4 x i32> %inp1, <4 x double> %inp2) #0 {
entry:
- br i1 undef, label %do.body.i, label %if.else.i
-
-do.body.i: ; preds = %entry
- unreachable
-
-if.else.i: ; preds = %entry
- br i1 undef, label %return, label %for.body46.lr.ph
-
-for.body46.lr.ph: ; preds = %if.else.i
br label %vector.body198
vector.body198: ; preds = %vector.body198, %for.body46.lr.ph
- %0 = icmp ne <4 x i32> undef, zeroinitializer
+ %0 = icmp ne <4 x i32> %inp1, zeroinitializer
%1 = select <4 x i1> %0, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>
- %2 = fmul <4 x double> undef, %1
- %3 = fmul <4 x double> undef, %2
- %4 = fmul <4 x double> %3, undef
+ %2 = fmul <4 x double> %inp2, %1
+ %3 = fmul <4 x double> %inp2, %2
+ %4 = fmul <4 x double> %3, %inp2
store <4 x double> %4, <4 x double>* undef, align 8
- br label %vector.body198
+ br label %return
; CHECK-LABEL: @gsl_sf_legendre_Pl_deriv_array
; CHECK: qvlfiwzx
diff --git a/test/CodeGen/PowerPC/remove-redundant-moves.ll b/test/CodeGen/PowerPC/remove-redundant-moves.ll
new file mode 100644
index 000000000000..6b845cbf380a
--- /dev/null
+++ b/test/CodeGen/PowerPC/remove-redundant-moves.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BE
+define double @test1(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test1
+; CHECK: xxswapd [[SW:[0-9]+]], 34
+; CHECK: xscvsxddp 1, [[SW]]
+; CHECK-BE-LABEL: test1
+; CHECK-BE: xxlor [[CP:[0-9]+]], 34, 34
+; CHECK-BE: xscvsxddp 1, [[CP]]
+ %0 = extractelement <2 x i64> %a, i32 0
+ %1 = sitofp i64 %0 to double
+ ret double %1
+}
+
+define double @test2(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test2
+; CHECK: xxlor [[CP:[0-9]+]], 34, 34
+; CHECK: xscvsxddp 1, [[CP]]
+; CHECK-BE-LABEL: test2
+; CHECK-BE: xxswapd [[SW:[0-9]+]], 34
+; CHECK-BE: xscvsxddp 1, [[SW]]
+ %0 = extractelement <2 x i64> %a, i32 1
+ %1 = sitofp i64 %0 to double
+ ret double %1
+}
+
+define float @test1f(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test1f
+; CHECK: xxswapd [[SW:[0-9]+]], 34
+; CHECK: xscvsxdsp 1, [[SW]]
+; CHECK-BE-LABEL: test1f
+; CHECK-BE: xxlor [[CP:[0-9]+]], 34, 34
+; CHECK-BE: xscvsxdsp 1, [[CP]]
+ %0 = extractelement <2 x i64> %a, i32 0
+ %1 = sitofp i64 %0 to float
+ ret float %1
+}
+
+define float @test2f(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test2f
+; CHECK: xxlor [[CP:[0-9]+]], 34, 34
+; CHECK: xscvsxdsp 1, [[CP]]
+; CHECK-BE-LABEL: test2f
+; CHECK-BE: xxswapd [[SW:[0-9]+]], 34
+; CHECK-BE: xscvsxdsp 1, [[SW]]
+ %0 = extractelement <2 x i64> %a, i32 1
+ %1 = sitofp i64 %0 to float
+ ret float %1
+}
+
+define double @test1u(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test1u
+; CHECK: xxswapd [[SW:[0-9]+]], 34
+; CHECK: xscvuxddp 1, [[SW]]
+; CHECK-BE-LABEL: test1u
+; CHECK-BE: xxlor [[CP:[0-9]+]], 34, 34
+; CHECK-BE: xscvuxddp 1, [[CP]]
+ %0 = extractelement <2 x i64> %a, i32 0
+ %1 = uitofp i64 %0 to double
+ ret double %1
+}
+
+define double @test2u(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test2u
+; CHECK: xxlor [[CP:[0-9]+]], 34, 34
+; CHECK: xscvuxddp 1, [[CP]]
+; CHECK-BE-LABEL: test2u
+; CHECK-BE: xxswapd [[SW:[0-9]+]], 34
+; CHECK-BE: xscvuxddp 1, [[SW]]
+ %0 = extractelement <2 x i64> %a, i32 1
+ %1 = uitofp i64 %0 to double
+ ret double %1
+}
+
+define float @test1fu(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test1fu
+; CHECK: xxswapd [[SW:[0-9]+]], 34
+; CHECK: xscvuxdsp 1, [[SW]]
+; CHECK-BE-LABEL: test1fu
+; CHECK-BE: xxlor [[CP:[0-9]+]], 34, 34
+; CHECK-BE: xscvuxdsp 1, [[CP]]
+ %0 = extractelement <2 x i64> %a, i32 0
+ %1 = uitofp i64 %0 to float
+ ret float %1
+}
+
+define float @test2fu(<2 x i64> %a) {
+entry:
+; CHECK-LABEL: test2fu
+; CHECK: xxlor [[CP:[0-9]+]], 34, 34
+; CHECK: xscvuxdsp 1, [[CP]]
+; CHECK-BE-LABEL: test2fu
+; CHECK-BE: xxswapd [[SW:[0-9]+]], 34
+; CHECK-BE: xscvuxdsp 1, [[SW]]
+ %0 = extractelement <2 x i64> %a, i32 1
+ %1 = uitofp i64 %0 to float
+ ret float %1
+}
diff --git a/test/CodeGen/PowerPC/rlwinm-zero-ext.ll b/test/CodeGen/PowerPC/rlwinm-zero-ext.ll
new file mode 100644
index 000000000000..f174bb64ba32
--- /dev/null
+++ b/test/CodeGen/PowerPC/rlwinm-zero-ext.ll
@@ -0,0 +1,57 @@
+; RUN: llc -O2 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; CHECK-LABEL: test1
+define i8 @test1(i32 %a) {
+entry:
+; CHECK-NOT: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 27
+; CHECK: rlwinm. [[REG:[0-9]+]], {{[0-9]+}}, 0, 24, 27
+; CHECK-NOT: cmplwi [[REG]], 0
+; CHECK: beq 0
+ %0 = and i32 %a, 240
+ %1 = icmp eq i32 %0, 0
+ br i1 %1, label %eq0, label %neq0
+eq0:
+ ret i8 102
+neq0:
+ ret i8 116
+}
+
+; CHECK-LABEL: test2
+define i8 @test2(i32 %a) {
+entry:
+; CHECK: rlwinm [[REG:[0-9]+]], {{[0-9]+}}, 0, 28, 23
+; CHECK: cmplwi [[REG]], 0
+; CHECK: beq 0
+ %0 = and i32 %a, -241
+ %1 = icmp eq i32 %0, 0
+ br i1 %1, label %eq0, label %neq0
+eq0:
+ ret i8 102
+neq0:
+ ret i8 116
+}
+
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
+
+; CHECK-LABEL: test3
+define i8 @test3(i32 %a, i32 %b) {
+entry:
+; CHECK-NOT: rlwnm {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, 28, 31
+; CHECK: rlwnm. [[REG:[0-9]+]], {{[0-9]+}}, 4, 28, 31
+; CHECK-NOT: cmplwi [[REG]], 0
+; CHECK: beq 0
+ %left = shl i32 %a, %b
+ %res = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 32, i32 %b)
+ %right_amount = extractvalue {i32, i1} %res, 0
+ %right = lshr i32 %a, %right_amount
+ %0 = or i32 %left, %right
+ %1 = and i32 %0, 15
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %eq0, label %neq0
+eq0:
+ ret i8 102
+neq0:
+ ret i8 116
+}
diff --git a/test/CodeGen/PowerPC/stack-protector.ll b/test/CodeGen/PowerPC/stack-protector.ll
index 8d255bd9a43b..57d1c7e52b1b 100644
--- a/test/CodeGen/PowerPC/stack-protector.ll
+++ b/test/CodeGen/PowerPC/stack-protector.ll
@@ -1,6 +1,17 @@
-; RUN: llc -march=ppc32 -mtriple=ppc32-unknown-linux < %s | FileCheck %s
-; CHECK: __stack_chk_guard
-; CHECK: __stack_chk_fail
+; RUN: llc -mtriple=powerpc-apple-darwin8 < %s | FileCheck -check-prefix=DARWIN32 %s
+; RUN: llc -mtriple=powerpc64-apple-darwin < %s | FileCheck -check-prefix=DARWIN64 %s
+; RUN: llc -mtriple=ppc32-unknown-linux < %s | FileCheck -check-prefix=LINUX32 %s
+; RUN: llc -mtriple=powerpc64le-unknown-linux < %s | FileCheck -check-prefix=LINUX64 %s
+
+; DARWIN32: __stack_chk_guard
+; DARWIN64: __stack_chk_guard
+; LINUX32: lwz {{[0-9]+}}, -28680(2)
+; LINUX64: ld {{[0-9]+}}, -28688(13)
+
+; DARWIN32: __stack_chk_fail
+; DARWIN64: __stack_chk_fail
+; LINUX32: __stack_chk_fail
+; LINUX64: __stack_chk_fail
@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1]
diff --git a/test/CodeGen/PowerPC/stackmap-frame-setup.ll b/test/CodeGen/PowerPC/stackmap-frame-setup.ll
index 487da00faa1c..b5f1d4cfe4bc 100644
--- a/test/CodeGen/PowerPC/stackmap-frame-setup.ll
+++ b/test/CodeGen/PowerPC/stackmap-frame-setup.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
-; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
+; RUN: llc -o - -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
+; RUN: llc -o - -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
define void @caller_meta_leaf() {
entry:
diff --git a/test/CodeGen/PowerPC/stubs.ll b/test/CodeGen/PowerPC/stubs.ll
index 694f208198a1..a167cb823325 100644
--- a/test/CodeGen/PowerPC/stubs.ll
+++ b/test/CodeGen/PowerPC/stubs.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - -mtriple=powerpc-apple-darwin8 | FileCheck %s
+; RUN: llc %s -o - -mtriple=powerpc-apple-darwin9 | FileCheck %s
define ppc_fp128 @test1(i64 %X) nounwind readnone {
entry:
%0 = sitofp i64 %X to ppc_fp128
@@ -6,17 +6,4 @@ entry:
}
; CHECK: _test1:
-; CHECK: bl L___floatditf$stub
-; CHECK: .section __TEXT,__symbol_stub1,symbol_stubs,pure_instructions,16
-; CHECK: L___floatditf$stub:
-; CHECK: .indirect_symbol ___floatditf
-; CHECK: lis r11, ha16(L___floatditf$lazy_ptr)
-; CHECK: lwzu r12, lo16(L___floatditf$lazy_ptr)(r11)
-; CHECK: mtctr r12
-; CHECK: bctr
-; CHECK: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
-; CHECK: L___floatditf$lazy_ptr:
-; CHECK: .indirect_symbol ___floatditf
-; CHECK: .long dyld_stub_binding_helper
-
-
+; CHECK: bl ___floatditf
diff --git a/test/CodeGen/PowerPC/stwux.ll b/test/CodeGen/PowerPC/stwux.ll
index 2ed630d8002d..4f83c9f64ea5 100644
--- a/test/CodeGen/PowerPC/stwux.ll
+++ b/test/CodeGen/PowerPC/stwux.ll
@@ -4,7 +4,7 @@ target triple = "powerpc64-unknown-linux-gnu"
@multvec_i = external unnamed_addr global [100 x i32], align 4
-define fastcc void @subs_STMultiExceptIntern() nounwind {
+define fastcc void @subs_STMultiExceptIntern(i32 %input) nounwind {
entry:
br i1 undef, label %while.body.lr.ph, label %return
@@ -16,10 +16,11 @@ while.body: ; preds = %if.end12, %while.bo
br i1 undef, label %if.end12, label %if.then
if.then: ; preds = %while.body
+ %0 = add i32 %input, 1
br label %if.end12
if.end12: ; preds = %if.then, %while.body
- %i.1 = phi i32 [ %i.0240, %while.body ], [ undef, %if.then ]
+ %i.1 = phi i32 [ %i.0240, %while.body ], [ %0, %if.then ]
br i1 undef, label %while.body, label %while.end
while.end: ; preds = %if.end12
diff --git a/test/CodeGen/PowerPC/subreg-postra-2.ll b/test/CodeGen/PowerPC/subreg-postra-2.ll
index 051536443413..5471b11933bf 100644
--- a/test/CodeGen/PowerPC/subreg-postra-2.ll
+++ b/test/CodeGen/PowerPC/subreg-postra-2.ll
@@ -3,158 +3,30 @@ target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
; Function Attrs: nounwind
-define void @jbd2_journal_commit_transaction() #0 {
+define void @jbd2_journal_commit_transaction(i32 %input1, i32* %input2, i32* %input3, i8** %input4) #0 {
entry:
- br i1 undef, label %do.body, label %if.then5
-
-if.then5: ; preds = %entry
- unreachable
-
-do.body: ; preds = %entry
- br i1 undef, label %do.body.i, label %trace_jbd2_start_commit.exit
-
-do.body.i: ; preds = %do.body
- unreachable
-
-trace_jbd2_start_commit.exit: ; preds = %do.body
- br i1 undef, label %do.body.i1116, label %trace_jbd2_commit_locking.exit
-
-do.body.i1116: ; preds = %trace_jbd2_start_commit.exit
- unreachable
-
-trace_jbd2_commit_locking.exit: ; preds = %trace_jbd2_start_commit.exit
- br i1 undef, label %while.end, label %while.body.lr.ph
-
-while.body.lr.ph: ; preds = %trace_jbd2_commit_locking.exit
- unreachable
-
-while.end: ; preds = %trace_jbd2_commit_locking.exit
- br i1 undef, label %spin_unlock.exit1146, label %if.then.i.i.i.i1144
-
-if.then.i.i.i.i1144: ; preds = %while.end
- unreachable
-
-spin_unlock.exit1146: ; preds = %while.end
- br i1 undef, label %spin_unlock.exit1154, label %if.then.i.i.i.i1152
-
-if.then.i.i.i.i1152: ; preds = %spin_unlock.exit1146
- unreachable
-
-spin_unlock.exit1154: ; preds = %spin_unlock.exit1146
- br i1 undef, label %do.body.i1159, label %trace_jbd2_commit_flushing.exit
-
-do.body.i1159: ; preds = %spin_unlock.exit1154
- br i1 undef, label %if.end.i1166, label %do.body5.i1165
-
-do.body5.i1165: ; preds = %do.body.i1159
- unreachable
-
-if.end.i1166: ; preds = %do.body.i1159
- unreachable
-
-trace_jbd2_commit_flushing.exit: ; preds = %spin_unlock.exit1154
- br i1 undef, label %for.end.i, label %for.body.lr.ph.i
-
-for.body.lr.ph.i: ; preds = %trace_jbd2_commit_flushing.exit
- unreachable
-
-for.end.i: ; preds = %trace_jbd2_commit_flushing.exit
- br i1 undef, label %journal_submit_data_buffers.exit, label %if.then.i.i.i.i31.i
-
-if.then.i.i.i.i31.i: ; preds = %for.end.i
- br label %journal_submit_data_buffers.exit
-
-journal_submit_data_buffers.exit: ; preds = %if.then.i.i.i.i31.i, %for.end.i
- br i1 undef, label %if.end103, label %if.then102
-
-if.then102: ; preds = %journal_submit_data_buffers.exit
- unreachable
-
-if.end103: ; preds = %journal_submit_data_buffers.exit
- br i1 undef, label %do.body.i1182, label %trace_jbd2_commit_logging.exit
-
-do.body.i1182: ; preds = %if.end103
- br i1 undef, label %if.end.i1189, label %do.body5.i1188
-
-do.body5.i1188: ; preds = %do.body5.i1188, %do.body.i1182
- br i1 undef, label %if.end.i1189, label %do.body5.i1188
-
-if.end.i1189: ; preds = %do.body5.i1188, %do.body.i1182
- unreachable
-
-trace_jbd2_commit_logging.exit: ; preds = %if.end103
- br label %while.cond129.outer1451
-
-while.cond129.outer1451: ; preds = %start_journal_io, %trace_jbd2_commit_logging.exit
- br label %while.cond129
-
-while.cond129: ; preds = %if.then135, %while.cond129.outer1451
- br i1 undef, label %while.end246, label %if.then135
-
-if.then135: ; preds = %while.cond129
- br i1 undef, label %start_journal_io, label %while.cond129
-
-start_journal_io: ; preds = %if.then135
- br label %while.cond129.outer1451
-
-while.end246: ; preds = %while.cond129
- br i1 undef, label %for.end.i1287, label %for.body.i1277
-
-for.body.i1277: ; preds = %while.end246
- unreachable
-
-for.end.i1287: ; preds = %while.end246
- br i1 undef, label %journal_finish_inode_data_buffers.exit, label %if.then.i.i.i.i84.i
-
-if.then.i.i.i.i84.i: ; preds = %for.end.i1287
- unreachable
-
-journal_finish_inode_data_buffers.exit: ; preds = %for.end.i1287
- br i1 undef, label %if.end256, label %if.then249
-
-if.then249: ; preds = %journal_finish_inode_data_buffers.exit
- unreachable
-
-if.end256: ; preds = %journal_finish_inode_data_buffers.exit
- br label %while.body318
-
-while.body318: ; preds = %wait_on_buffer.exit, %if.end256
- br i1 undef, label %wait_on_buffer.exit, label %if.then.i1296
-
-if.then.i1296: ; preds = %while.body318
- br label %wait_on_buffer.exit
-
-wait_on_buffer.exit: ; preds = %if.then.i1296, %while.body318
- br i1 undef, label %do.body378, label %while.body318
-
-do.body378: ; preds = %wait_on_buffer.exit
- br i1 undef, label %while.end418, label %while.body392.lr.ph
-
-while.body392.lr.ph: ; preds = %do.body378
br label %while.body392
while.body392: ; preds = %wait_on_buffer.exit1319, %while.body392.lr.ph
- %0 = load i8*, i8** undef, align 8
+ %0 = load i8*, i8** %input4, align 8
%add.ptr399 = getelementptr inbounds i8, i8* %0, i64 -72
%b_state.i.i1314 = bitcast i8* %add.ptr399 to i64*
- %tobool.i1316 = icmp eq i64 undef, 0
- br i1 %tobool.i1316, label %wait_on_buffer.exit1319, label %if.then.i1317
-
-if.then.i1317: ; preds = %while.body392
- unreachable
+ %ivar = add i32 %input1, 1
+ %tobool.i1316 = icmp eq i32 %input1, 0
+ br i1 %tobool.i1316, label %wait_on_buffer.exit1319, label %while.end418
wait_on_buffer.exit1319: ; preds = %while.body392
%1 = load volatile i64, i64* %b_state.i.i1314, align 8
%conv.i.i1322 = and i64 %1, 1
%lnot404 = icmp eq i64 %conv.i.i1322, 0
- %.err.4 = select i1 %lnot404, i32 -5, i32 undef
+ %.err.4 = select i1 %lnot404, i32 -5, i32 %input1
%2 = call i64 asm sideeffect "1:.long 0x7c0000a8 $| ((($0) & 0x1f) << 21) $| (((0) & 0x1f) << 16) $| ((($3) & 0x1f) << 11) $| (((0) & 0x1) << 0) \0Aandc $0,$0,$2\0Astdcx. $0,0,$3\0Abne- 1b\0A", "=&r,=*m,r,r,*m,~{cc},~{memory}"(i64* %b_state.i.i1314, i64 262144, i64* %b_state.i.i1314, i64* %b_state.i.i1314) #0
- store i8* %0, i8** undef, align 8
- %cmp.i1312 = icmp eq i32* undef, undef
+ store i8* %0, i8** %input4, align 8
+ %cmp.i1312 = icmp eq i32* %input2, %input3
br i1 %cmp.i1312, label %while.end418, label %while.body392
while.end418: ; preds = %wait_on_buffer.exit1319, %do.body378
- %err.4.lcssa = phi i32 [ undef, %do.body378 ], [ %.err.4, %wait_on_buffer.exit1319 ]
+ %err.4.lcssa = phi i32 [ %ivar, %while.body392 ], [ %.err.4, %wait_on_buffer.exit1319 ]
%tobool419 = icmp eq i32 %err.4.lcssa, 0
br i1 %tobool419, label %if.end421, label %if.then420
@@ -169,6 +41,7 @@ if.then420: ; preds = %while.end418
if.end421: ; preds = %while.end418
unreachable
+
}
attributes #0 = { nounwind }
diff --git a/test/CodeGen/PowerPC/subreg-postra.ll b/test/CodeGen/PowerPC/subreg-postra.ll
index ba1b967cf204..b9f17465d003 100644
--- a/test/CodeGen/PowerPC/subreg-postra.ll
+++ b/test/CodeGen/PowerPC/subreg-postra.ll
@@ -3,7 +3,10 @@ target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
; Function Attrs: nounwind
-define void @jbd2_journal_commit_transaction(i32* %journal) #0 {
+define void @jbd2_journal_commit_transaction(i32* %journal, i64 %inp1, i32 %inp2,
+ i32* %inp3, i32** %inp4,
+ i32** %inp5, i1 %inp6,
+ i1 %inp7, i1 %inp8) #0 {
entry:
br i1 undef, label %do.body, label %if.then5
@@ -104,17 +107,17 @@ do.body277: ; preds = %if.then260, %if.end
br label %while.body318
while.body318: ; preds = %wait_on_buffer.exit, %do.body277
- %tobool.i1295 = icmp eq i64 undef, 0
+ %tobool.i1295 = icmp eq i64 %inp1, 0
br i1 %tobool.i1295, label %wait_on_buffer.exit, label %if.then.i1296
if.then.i1296: ; preds = %while.body318
unreachable
wait_on_buffer.exit: ; preds = %while.body318
- br i1 undef, label %do.body378, label %while.body318
+ br i1 %inp6, label %do.body378, label %while.body318
do.body378: ; preds = %wait_on_buffer.exit
- br i1 undef, label %while.end418, label %while.body392.lr.ph
+ br i1 %inp7, label %while.end418, label %while.body392.lr.ph
while.body392.lr.ph: ; preds = %do.body378
br label %while.body392
@@ -123,7 +126,7 @@ while.body392: ; preds = %wait_on_buffer.exit
%0 = load i8*, i8** undef, align 8
%add.ptr399 = getelementptr inbounds i8, i8* %0, i64 -72
%b_state.i.i1314 = bitcast i8* %add.ptr399 to i64*
- %tobool.i1316 = icmp eq i64 undef, 0
+ %tobool.i1316 = icmp eq i64 %inp1, 0
br i1 %tobool.i1316, label %wait_on_buffer.exit1319, label %if.then.i1317
if.then.i1317: ; preds = %while.body392
@@ -133,23 +136,23 @@ wait_on_buffer.exit1319: ; preds = %while.body392
%1 = load volatile i64, i64* %b_state.i.i1314, align 8
%conv.i.i1322 = and i64 %1, 1
%lnot404 = icmp eq i64 %conv.i.i1322, 0
- %.err.4 = select i1 %lnot404, i32 -5, i32 undef
+ %.err.4 = select i1 %lnot404, i32 -5, i32 %inp2
%2 = call i64 asm sideeffect "1:.long 0x7c0000a8 $| ((($0) & 0x1f) << 21) $| (((0) & 0x1f) << 16) $| ((($3) & 0x1f) << 11) $| (((0) & 0x1) << 0) \0Aandc $0,$0,$2\0Astdcx. $0,0,$3\0Abne- 1b\0A", "=&r,=*m,r,r,*m,~{cc},~{memory}"(i64* %b_state.i.i1314, i64 262144, i64* %b_state.i.i1314, i64* %b_state.i.i1314) #1
%prev.i.i.i1325 = getelementptr inbounds i8, i8* %0, i64 8
- %3 = load i32*, i32** null, align 8
- store i32* %3, i32** undef, align 8
- call void @__brelse(i32* undef) #1
- br i1 undef, label %while.end418, label %while.body392
+ %3 = load i32*, i32** %inp4, align 8
+ store i32* %3, i32** %inp5, align 8
+ call void @__brelse(i32* %3) #1
+ br i1 %inp8, label %while.end418, label %while.body392
; CHECK-LABEL: @jbd2_journal_commit_transaction
; CHECK: andi.
-; CHECK: crmove [[REG:[0-9]+]], 1
+; CHECK: crmove
; CHECK: stdcx.
-; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]]
+; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}},
while.end418: ; preds = %wait_on_buffer.exit1319, %do.body378
- %err.4.lcssa = phi i32 [ undef, %do.body378 ], [ %.err.4, %wait_on_buffer.exit1319 ]
- br i1 undef, label %if.end421, label %if.then420
+ %err.4.lcssa = phi i32 [ %inp2, %do.body378 ], [ %.err.4, %wait_on_buffer.exit1319 ]
+ br i1 %inp7, label %if.end421, label %if.then420
if.then420: ; preds = %while.end418
call void @jbd2_journal_abort(i32* %journal, i32 signext %err.4.lcssa) #1
diff --git a/test/CodeGen/PowerPC/subsumes-pred-regs.ll b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
deleted file mode 100644
index 5389c1318445..000000000000
--- a/test/CodeGen/PowerPC/subsumes-pred-regs.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; RUN: llc < %s -mcpu=ppc64 -mattr=-crbits | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-; Function Attrs: nounwind
-define zeroext i1 @test1() unnamed_addr #0 align 2 {
-
-; CHECK-LABEL: @test1
-
-entry:
- br i1 undef, label %lor.end, label %lor.rhs
-
-lor.rhs: ; preds = %entry
- unreachable
-
-lor.end: ; preds = %entry
- br i1 undef, label %land.rhs, label %if.then
-
-if.then: ; preds = %lor.end
- br i1 undef, label %return, label %if.end.i24
-
-if.end.i24: ; preds = %if.then
- %0 = load i32, i32* undef, align 4
- %lnot.i.i16.i23 = icmp eq i32 %0, 0
- br i1 %lnot.i.i16.i23, label %if.end7.i37, label %test.exit27.i34
-
-test.exit27.i34: ; preds = %if.end.i24
- br i1 undef, label %return, label %if.end7.i37
-
-if.end7.i37: ; preds = %test.exit27.i34, %if.end.i24
- %tobool.i.i36 = icmp eq i8 undef, 0
- br i1 %tobool.i.i36, label %return, label %if.then9.i39
-
-if.then9.i39: ; preds = %if.end7.i37
- br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
-
-; CHECK: .LBB0_7:
-; CHECK: bne 1, .LBB0_10
-; CHECK: beq 0, .LBB0_10
-; CHECK: .LBB0_9:
-
-lor.rhs.i.i49: ; preds = %if.then9.i39
- %cmp.i.i.i.i48 = icmp ne i64 undef, 0
- br label %return
-
-land.rhs: ; preds = %lor.end
- br i1 undef, label %return, label %if.end.i
-
-if.end.i: ; preds = %land.rhs
- br i1 undef, label %return, label %if.then9.i
-
-if.then9.i: ; preds = %if.end.i
- br i1 undef, label %return, label %lor.rhs.i.i
-
-lor.rhs.i.i: ; preds = %if.then9.i
- %cmp.i.i.i.i = icmp ne i64 undef, 0
- br label %return
-
-return: ; preds = %lor.rhs.i.i, %if.then9.i, %if.end.i, %land.rhs, %lor.rhs.i.i49, %if.then9.i39, %if.end7.i37, %test.exit27.i34, %if.then
- %retval.0 = phi i1 [ false, %if.then ], [ false, %test.exit27.i34 ], [ true, %if.end7.i37 ], [ true, %if.then9.i39 ], [ %cmp.i.i.i.i48, %lor.rhs.i.i49 ], [ false, %land.rhs ], [ true, %if.end.i ], [ true, %if.then9.i ], [ %cmp.i.i.i.i, %lor.rhs.i.i ]
- ret i1 %retval.0
-}
-
-attributes #0 = { nounwind }
-
diff --git a/test/CodeGen/PowerPC/svr4-redzone.ll b/test/CodeGen/PowerPC/svr4-redzone.ll
index bee3ac32b648..a72ac104fc68 100644
--- a/test/CodeGen/PowerPC/svr4-redzone.ll
+++ b/test/CodeGen/PowerPC/svr4-redzone.ll
@@ -2,10 +2,10 @@
; RUN: llc -mtriple="powerpc64-unknown-linux-gnu" < %s | FileCheck %s --check-prefix=PPC64
; PR15332
-define void @regalloc() nounwind {
+define i32 @regalloc() nounwind {
entry:
- %0 = add i32 1, 2
- ret void
+ %0 = add i32 1, 2
+ ret i32 %0
}
; PPC32-LABEL: regalloc:
; PPC32-NOT: stwu 1, -{{[0-9]+}}(1)
@@ -15,10 +15,10 @@ entry:
; PPC64-NOT: stdu 1, -{{[0-9]+}}(1)
; PPC64: blr
-define void @smallstack() nounwind {
+define i8* @smallstack() nounwind {
entry:
- %0 = alloca i8, i32 4
- ret void
+ %0 = alloca i8, i32 4
+ ret i8* %0
}
; PPC32-LABEL: smallstack:
; PPC32: stwu 1, -16(1)
@@ -27,10 +27,10 @@ entry:
; PPC64-NOT: stdu 1, -{{[0-9]+}}(1)
; PPC64: blr
-define void @bigstack() nounwind {
+define i8* @bigstack() nounwind {
entry:
- %0 = alloca i8, i32 230
- ret void
+ %0 = alloca i8, i32 230
+ ret i8* %0
}
; PPC32-LABEL: bigstack:
; PPC32: stwu 1, -240(1)
diff --git a/test/CodeGen/PowerPC/swaps-le-2.ll b/test/CodeGen/PowerPC/swaps-le-2.ll
index 08096ed20ddb..9d1eb412cba3 100644
--- a/test/CodeGen/PowerPC/swaps-le-2.ll
+++ b/test/CodeGen/PowerPC/swaps-le-2.ll
@@ -87,5 +87,5 @@ entry:
; CHECK-LABEL: @ifoo
; CHECK: lxvd2x
-; CHECK: vspltw {{[0-9]+}}, {{[0-9]+}}, 0
+; CHECK: xxspltw {{[0-9]+}}, {{[0-9]+}}, 0
; CHECK: stxvd2x
diff --git a/test/CodeGen/PowerPC/swaps-le-7.ll b/test/CodeGen/PowerPC/swaps-le-7.ll
new file mode 100644
index 000000000000..0c6970f5b17e
--- /dev/null
+++ b/test/CodeGen/PowerPC/swaps-le-7.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -O3 < %s | FileCheck %s
+;
+; This is a regression test based on https://llvm.org/bugs/show_bug.cgi?id=27735
+;
+
+@G1 = global <2 x double> <double 2.0, double -10.0>
+@G2 = global <2 x double> <double 3.0, double 4.0>
+@G3 = global <2 x double> <double 5.0, double 6.0>
+@G4 = global <2 x double> <double 7.0, double 8.0>
+
+; CHECK-LABEL: @zg
+; CHECK: xxspltd
+; CHECK-NEXT: xxspltd
+; CHECK-NEXT: xxswapd
+; CHECK-NEXT: xvmuldp
+; CHECK-NEXT: xvmuldp
+; CHECK-NEXT: xvsubdp
+; CHECK-NEXT: xvadddp
+; CHECK-NEXT: xxpermdi
+; CHECK-NEXT: xvsubdp
+; CHECK-NEXT: xxswapd
+; CHECK-NEXT: stxvd2x
+; CHECK-NEXT: blr
+
+; Function Attrs: noinline
+define void @zg(i8* %.G0011_640.0, i8* %.G0012_642.0, <2 x double>* %JJ, <2 x double>* %.ka0000_391, double %.unpack, double %.unpack66) #0 {
+L.JA291:
+ %Z.L.JA291.2 = load <2 x double>, <2 x double>* %.ka0000_391, align 16
+ store <2 x double> %Z.L.JA291.2, <2 x double>* %JJ, align 8
+ %Z.L.JA291.3 = bitcast i8* %.G0012_642.0 to <2 x double>*
+ %Z.L.JA291.4 = load <2 x double>, <2 x double>* %Z.L.JA291.3, align 1
+ %.elt136 = bitcast i8* %.G0011_640.0 to double*
+ %.unpack137 = load double, double* %.elt136, align 1
+ %.elt138 = getelementptr inbounds i8, i8* %.G0011_640.0, i64 8
+ %Z.L.JA291.5 = bitcast i8* %.elt138 to double*
+ %.unpack139 = load double, double* %Z.L.JA291.5, align 1
+ %Z.L.JA291.6 = insertelement <2 x double> undef, double %.unpack137, i32 0
+ %Z.L.JA291.7 = insertelement <2 x double> %Z.L.JA291.6, double %.unpack137, i32 1
+ %Z.L.JA291.8 = fmul <2 x double> %Z.L.JA291.2, %Z.L.JA291.7
+ %Z.L.JA291.9 = shufflevector <2 x double> %Z.L.JA291.2, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %Z.L.JA291.10 = insertelement <2 x double> undef, double %.unpack139, i32 0
+ %Z.L.JA291.11 = insertelement <2 x double> %Z.L.JA291.10, double %.unpack139, i32 1
+ %Z.L.JA291.12 = fmul <2 x double> %Z.L.JA291.9, %Z.L.JA291.11
+ %Z.L.JA291.13 = fsub <2 x double> %Z.L.JA291.8, %Z.L.JA291.12
+ %Z.L.JA291.14 = fadd <2 x double> %Z.L.JA291.8, %Z.L.JA291.12
+ %Z.L.JA291.15 = shufflevector <2 x double> %Z.L.JA291.13, <2 x double> %Z.L.JA291.14, <2 x i32> <i32 0, i32 3>
+ %Z.L.JA291.16 = fsub <2 x double> %Z.L.JA291.4, %Z.L.JA291.15
+ %Z.L.JA291.17 = bitcast i8* %.G0012_642.0 to <2 x double>*
+ store <2 x double> %Z.L.JA291.16, <2 x double>* %Z.L.JA291.17, align 8
+ %.. = bitcast <2 x double>* %JJ to i32*
+ %.pre = load i32, i32* %.., align 32
+ ret void
+}
+
+attributes #0 = { noinline }
\ No newline at end of file
diff --git a/test/CodeGen/PowerPC/tailcall-string-rvo.ll b/test/CodeGen/PowerPC/tailcall-string-rvo.ll
new file mode 100644
index 000000000000..3f850e6e94b7
--- /dev/null
+++ b/test/CodeGen/PowerPC/tailcall-string-rvo.ll
@@ -0,0 +1,47 @@
+; RUN: llc -O2 < %s | FileCheck %s
+
+; The call to function TestBar should be a tail call when, in C++, the string
+; `ret` is RVO returned.
+; string TestFoo() {
+; string ret = undef;
+; TestBar(&ret); // tail call optimized
+; return ret;
+; }
+
+target triple = "powerpc64le-linux-gnu"
+
+%class.basic_string.11.42.73 = type { %"class.__gnu_cxx::__versa_string.10.41.72" }
+%"class.__gnu_cxx::__versa_string.10.41.72" = type { %"class.__gnu_cxx::__sso_string_base.9.40.71" }
+%"class.__gnu_cxx::__sso_string_base.9.40.71" = type { %"struct.__gnu_cxx::__vstring_utility<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.7.38.69", i64, %union.anon.8.39.70 }
+%"struct.__gnu_cxx::__vstring_utility<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.7.38.69" = type { i8* }
+%union.anon.8.39.70 = type { i64, [8 x i8] }
+
+declare void @TestBaz(%class.basic_string.11.42.73* noalias sret %arg)
+
+define void @TestBar(%class.basic_string.11.42.73* noalias sret %arg) {
+bb:
+ call void @TestBaz(%class.basic_string.11.42.73* noalias sret %arg)
+ ret void
+}
+
+define void @TestFoo(%class.basic_string.11.42.73* noalias sret %arg) {
+; CHECK-LABEL: TestFoo:
+; CHECK: #TC_RETURNd8 TestBar 0
+bb:
+ %tmp = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 2
+ %tmp1 = bitcast %class.basic_string.11.42.73* %arg to %union.anon.8.39.70**
+ store %union.anon.8.39.70* %tmp, %union.anon.8.39.70** %tmp1, align 8
+ %tmp2 = bitcast %union.anon.8.39.70* %tmp to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* nonnull undef, i64 13, i32 1, i1 false)
+ %tmp3 = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 1
+ store i64 13, i64* %tmp3, align 8
+ %tmp4 = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 2, i32 1, i64 5
+ store i8 0, i8* %tmp4, align 1
+ tail call void @TestBar(%class.basic_string.11.42.73* noalias sret %arg)
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+attributes #0 = { argmemonly nounwind }
diff --git a/test/CodeGen/PowerPC/thread-pointer.ll b/test/CodeGen/PowerPC/thread-pointer.ll
new file mode 100644
index 000000000000..2e8282d8dfb0
--- /dev/null
+++ b/test/CodeGen/PowerPC/thread-pointer.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK-32
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK-64
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.thread.pointer() #1
+
+define i8* @thread_pointer() {
+; CHECK-32-LABEL: @thread_pointer
+; CHECK-32: mr 3, 2
+; CHECK-32: blr
+; CHECK-64-LABEL: @thread_pointer
+; CHECK-64: mr 3, 13
+; CHECK-64: blr
+ %1 = tail call i8* @llvm.thread.pointer()
+ ret i8* %1
+}
diff --git a/test/CodeGen/PowerPC/tls_get_addr_stackframe.ll b/test/CodeGen/PowerPC/tls_get_addr_stackframe.ll
index 4a235983e6f7..5ce7dfc54117 100644
--- a/test/CodeGen/PowerPC/tls_get_addr_stackframe.ll
+++ b/test/CodeGen/PowerPC/tls_get_addr_stackframe.ll
@@ -9,24 +9,19 @@
@tls_var = external thread_local global %struct1.2.41*, align 8
-define void @foo_test() {
+define i32 @foo_test() {
%1 = load %struct1.2.41*, %struct1.2.41** @tls_var, align 8
- br i1 undef, label %foo.exit, label %2
-; <label>:2 ; preds = %0
- br i1 undef, label %foo.exit, label %3
+ %2 = getelementptr inbounds %struct1.2.41, %struct1.2.41* %1, i64 0, i32 0, i32 3
+ %3 = load i32, i32* %2, align 8
+ %4 = add nsw i32 %3, -1
+ %5 = icmp eq i32 %4, 0
+ br i1 %5, label %bb7, label %foo.exit
-; <label>:3 ; preds = %2
- %4 = getelementptr inbounds %struct1.2.41, %struct1.2.41* %1, i64 0, i32 0, i32 3
- %5 = load i32, i32* %4, align 8
- %6 = add nsw i32 %5, -1
- %7 = icmp eq i32 %6, 0
- br i1 %7, label %8, label %foo.exit
-
-; <label>:8 ; preds = %3
+bb7: ; preds = %3
tail call void undef(%struct1.2.41* undef, %struct1.2.41* nonnull undef)
br label %foo.exit
foo.exit: ; preds = %8, %3, %2, %0
- ret void
+ ret i32 %4
}
diff --git a/test/CodeGen/PowerPC/unal-altivec.ll b/test/CodeGen/PowerPC/unal-altivec.ll
index 02f7ab40f049..823a6a70b859 100644
--- a/test/CodeGen/PowerPC/unal-altivec.ll
+++ b/test/CodeGen/PowerPC/unal-altivec.ll
@@ -30,17 +30,19 @@ vector.body: ; preds = %vector.body, %vecto
; CHECK: @foo
; CHECK-DAG: li [[C0:[0-9]+]], 0
-; CHECK-DAG: li [[C16:[0-9]+]], 16
-; CHECK-DAG: li [[C31:[0-9]+]], 31
+; CHECK-DAG: li [[C15:[0-9]+]], 15
; CHECK-DAG: lvx [[CNST:[0-9]+]],
; CHECK: .LBB0_1:
-; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[C0]]
-; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[C0]]
+; CHECK-DAG: lvsl [[MASK1:[0-9]+]], [[B1:[0-9]+]], [[C0]]
+; CHECK-DAG: lvsl [[MASK2:[0-9]+]], [[B2:[0-9]+]], [[C0]]
; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[C0]]
-; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]], [[C16]]
-; CHECK-DAG: lvx [[LD3:[0-9]+]], [[B3]], [[C31]]
-; CHECK-DAG: vperm [[R1:[0-9]+]], [[LD1]], [[LD2]], [[PC]]
-; CHECK-DAG: vperm [[R2:[0-9]+]], [[LD2]], [[LD3]], [[PC]]
+; CHECK-DAG: add [[B4:[0-9]+]], [[B2]], [[C0]]
+; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[C0]]
+; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]], [[C15]]
+; CHECK-DAG: lvx [[LD3:[0-9]+]], [[B2]], [[C0]]
+; CHECK-DAG: lvx [[LD4:[0-9]+]], [[B4]], [[C15]]
+; CHECK-DAG: vperm [[R1:[0-9]+]], [[LD1]], [[LD2]], [[MASK1]]
+; CHECK-DAG: vperm [[R2:[0-9]+]], [[LD3]], [[LD4]], [[MASK2]]
; CHECK-DAG: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
; CHECK-DAG: vaddfp {{[0-9]+}}, [[R2]], [[CNST]]
; CHECK: blr
diff --git a/test/CodeGen/PowerPC/unal4-std.ll b/test/CodeGen/PowerPC/unal4-std.ll
index e91109911161..de2a68c813b8 100644
--- a/test/CodeGen/PowerPC/unal4-std.ll
+++ b/test/CodeGen/PowerPC/unal4-std.ll
@@ -3,17 +3,10 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
-define fastcc void @copy_to_conceal() #0 {
+define void @copy_to_conceal(<8 x i16>* %inp) #0 {
entry:
- br i1 undef, label %if.then, label %if.end210
-
-if.then: ; preds = %entry
- br label %vector.body.i
-
-vector.body.i: ; preds = %vector.body.i, %if.then
- %index.i = phi i64 [ 0, %vector.body.i ], [ 0, %if.then ]
- store <8 x i16> zeroinitializer, <8 x i16>* undef, align 2
- br label %vector.body.i
+ store <8 x i16> zeroinitializer, <8 x i16>* %inp, align 2
+ br label %if.end210
if.end210: ; preds = %entry
ret void
diff --git a/test/CodeGen/PowerPC/unwind-dw2-g.ll b/test/CodeGen/PowerPC/unwind-dw2-g.ll
index e44da85f5b36..59b68342bd9a 100644
--- a/test/CodeGen/PowerPC/unwind-dw2-g.ll
+++ b/test/CodeGen/PowerPC/unwind-dw2-g.ll
@@ -21,11 +21,10 @@ attributes #0 = { nounwind }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !11}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4", isOptimized: false, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "/tmp/unwind-dw2.c", directory: "/tmp")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2)
!5 = !DIFile(filename: "/tmp/unwind-dw2.c", directory: "/tmp")
!6 = !DISubroutineType(types: !7)
!7 = !{null}
diff --git a/test/CodeGen/PowerPC/vec_abs.ll b/test/CodeGen/PowerPC/vec_abs.ll
new file mode 100644
index 000000000000..8fa26a614b96
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_abs.ll
@@ -0,0 +1,80 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64le \
+; RUN: -mattr=+altivec -mattr=+vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64le \
+; RUN: -mattr=+altivec -mattr=-vsx | FileCheck %s \
+; RUN: -check-prefix=CHECK-NOVSX
+
+define <4 x float> @test_float(<4 x float> %aa) #0 {
+
+; CHECK-LABEL: test_float
+; CHECK-NOVSX-LABEL: test_float
+
+ entry:
+ %0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %aa) #2
+ ret <4 x float> %0
+}
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1
+
+; CHECK: xvabssp
+; CHECK: blr
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: blr
+
+define <4 x float> @test2_float(<4 x float> %aa) #0 {
+
+; CHECK-LABEL: test2_float
+; CHECK-NOVSX-LABEL: test2_float
+
+ entry:
+ %0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %aa) #2
+ %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00,
+ float -0.000000e+00, float -0.000000e+00>, %0
+ ret <4 x float> %sub
+}
+
+; CHECK: xvnabssp
+; CHECK: blr
+; CHECK-NOVSX: vspltisb
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: vsubfp
+; CHECK-NOVSX: blr
+
+define <2 x double> @test_double(<2 x double> %aa) #0 {
+
+; CHECK-LABEL: test_double
+; CHECK-NOVSX-LABEL: test_double
+
+ entry:
+ %0 = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %aa) #2
+ ret <2 x double> %0
+}
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #1
+
+; CHECK: xvabsdp
+; CHECK: blr
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: fabs
+; CHECK-NOVSX: blr
+
+define <2 x double> @foo(<2 x double> %aa) #0 {
+ entry:
+ %0 = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %aa) #2
+ %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %0
+ ret <2 x double> %sub
+}
+
+; CHECK: xvnabsdp
+; CHECK: blr
+; CHECK-NOVSX: fnabs
+; CHECK-NOVSX: fnabs
+; CHECK-NOVSX: blr
diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
index 516b2dd58b99..7b228bffc386 100644
--- a/test/CodeGen/PowerPC/vec_cmp.ll
+++ b/test/CodeGen/PowerPC/vec_cmp.ll
@@ -24,7 +24,7 @@ define <4 x i8> @v4si8_cmp(<4 x i8> %x, <4 x i8> %y) nounwind readnone {
ret <4 x i8> %sext
}
; CHECK-LABEL: v4si8_cmp:
-; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone {
@@ -33,7 +33,7 @@ define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone {
ret <8 x i8> %sext
}
; CHECK-LABEL: v8si8_cmp:
-; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
; Additional tests for v16i8 since it is a altivec native type
@@ -158,7 +158,7 @@ define <4 x i16> @v4si16_cmp(<4 x i16> %x, <4 x i16> %y) nounwind readnone {
ret <4 x i16> %sext
}
; CHECK-LABEL: v4si16_cmp:
-; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
; Additional tests for v8i16 since it is an altivec native type
diff --git a/test/CodeGen/PowerPC/vec_fneg.ll b/test/CodeGen/PowerPC/vec_fneg.ll
index d6f6def64ea2..117336dc6a46 100644
--- a/test/CodeGen/PowerPC/vec_fneg.ll
+++ b/test/CodeGen/PowerPC/vec_fneg.ll
@@ -1,8 +1,37 @@
-; RUN: llc < %s -march=ppc32 -mcpu=g5 | grep vsubfp
+; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s -check-prefix=CHECK-NOVSX
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64le \
+; RUN: -mattr=+altivec -mattr=+vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64le \
+; RUN: -mattr=+altivec -mattr=-vsx | FileCheck %s \
+; RUN: -check-prefix=CHECK-NOVSX
-define void @t(<4 x float>* %A) {
+define void @test_float(<4 x float>* %A) {
+; CHECK-LABEL: test_float
+; CHECK-NOVSX-LABEL: test_float
%tmp2 = load <4 x float>, <4 x float>* %A
%tmp3 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %tmp2
store <4 x float> %tmp3, <4 x float>* %A
ret void
+
+; CHECK: xvnegsp
+; CHECK: blr
+; CHECK-NOVSX: vsubfp
+; CHECK-NOVSX: blr
+
+}
+
+define void @test_double(<2 x double>* %A) {
+; CHECK-LABEL: test_double
+; CHECK-NOVSX-LABEL: test_double
+ %tmp2 = load <2 x double>, <2 x double>* %A
+ %tmp3 = fsub <2 x double> < double -0.000000e+00, double -0.000000e+00 >, %tmp2
+ store <2 x double> %tmp3, <2 x double>* %A
+ ret void
+
+; CHECK: xvnegdp
+; CHECK: blr
+; CHECK-NOVSX: fneg
+; CHECK-NOVSX: fneg
+; CHECK-NOVSX: blr
+
}
diff --git a/test/CodeGen/PowerPC/vrsave-spill.ll b/test/CodeGen/PowerPC/vrsave-spill.ll
index c73206d8fc86..ceb787d05c9f 100644
--- a/test/CodeGen/PowerPC/vrsave-spill.ll
+++ b/test/CodeGen/PowerPC/vrsave-spill.ll
@@ -10,8 +10,8 @@ entry:
br label %return
; CHECK: @foo
-; CHECK: mfspr r{{[0-9]+}}, 256
-; CHECK: mtspr 256, r{{[0-9]+}}
+; CHECK: mfvrsave r{{[0-9]+}}
+; CHECK: mtvrsave r{{[0-9]+}}
return: ; preds = %entry
ret <4 x float> %d
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index 4f556b6b79c2..a94d955d35d4 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck %s
; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 | FileCheck -check-prefix=CHECK-FISL %s
+; XFAIL: *
; Also run with -schedule-ppc-vsx-fma-mutation-early as a stress test for the
; live-interval-updating logic.
diff --git a/test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll b/test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll
index e3f4001aa1d3..06636f24f97c 100644
--- a/test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-mutate-undef.ll
@@ -3,15 +3,15 @@ target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
; Function Attrs: nounwind
-define void @acosh_float8() #0 {
+define void @acosh_float8(<4 x i32> %v1, <4 x i32> %v2) #0 {
entry:
br i1 undef, label %if.then, label %if.end
if.then: ; preds = %entry
%0 = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> <float 0x3FE62E4200000000, float 0x3FE62E4200000000, float 0x3FE62E4200000000, float 0x3FE62E4200000000>, <4 x float> undef) #0
%astype.i.i.74.i = bitcast <4 x float> %0 to <4 x i32>
- %and.i.i.76.i = and <4 x i32> %astype.i.i.74.i, undef
- %or.i.i.79.i = or <4 x i32> %and.i.i.76.i, undef
+ %and.i.i.76.i = and <4 x i32> %astype.i.i.74.i, %v1
+ %or.i.i.79.i = or <4 x i32> %and.i.i.76.i, %v2
%astype5.i.i.80.i = bitcast <4 x i32> %or.i.i.79.i to <4 x float>
%1 = shufflevector <4 x float> %astype5.i.i.80.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%2 = shufflevector <8 x float> undef, <8 x float> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
diff --git a/test/CodeGen/PowerPC/vsx-fma-sp.ll b/test/CodeGen/PowerPC/vsx-fma-sp.ll
index b4dd2e1627c4..1a1f54ec30a5 100644
--- a/test/CodeGen/PowerPC/vsx-fma-sp.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-sp.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx -fast-isel -O0 | FileCheck -check-prefix=CHECK-FISL %s
+; XFAIL: *
+
define void @test1sp(float %a, float %b, float %c, float %e, float* nocapture %d) #0 {
entry:
%0 = tail call float @llvm.fma.f32(float %b, float %c, float %a)
diff --git a/test/CodeGen/PowerPC/vsx-infl-copy1.ll b/test/CodeGen/PowerPC/vsx-infl-copy1.ll
index 531e3ad2d87c..a518e1b890fb 100644
--- a/test/CodeGen/PowerPC/vsx-infl-copy1.ll
+++ b/test/CodeGen/PowerPC/vsx-infl-copy1.ll
@@ -6,7 +6,7 @@ target triple = "powerpc64-unknown-linux-gnu"
@uc = external global [1024 x i32], align 4
; Function Attrs: noinline nounwind
-define void @_Z8example9Pj() #0 {
+define <4 x i32> @_Z8example9Pj(<4 x i32>* %addr1, i64 %input1, i64 %input2) #0 {
entry:
br label %vector.body
@@ -31,7 +31,7 @@ vector.body: ; preds = %vector.body, %entry
%0 = getelementptr [1024 x i32], [1024 x i32]* @ub, i64 0, i64 %.sum82
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load36 = load <4 x i32>, <4 x i32>* %1, align 4
- %wide.load37 = load <4 x i32>, <4 x i32>* undef, align 4
+ %wide.load37 = load <4 x i32>, <4 x i32>* %addr1, align 4
%.sum84 = add i64 %index, 32
%2 = getelementptr [1024 x i32], [1024 x i32]* @ub, i64 0, i64 %.sum84
%3 = bitcast i32* %2 to <4 x i32>*
@@ -40,7 +40,7 @@ vector.body: ; preds = %vector.body, %entry
%4 = getelementptr [1024 x i32], [1024 x i32]* @ub, i64 0, i64 %.sum85
%5 = bitcast i32* %4 to <4 x i32>*
%wide.load39 = load <4 x i32>, <4 x i32>* %5, align 4
- %6 = getelementptr [1024 x i32], [1024 x i32]* @ub, i64 0, i64 undef
+ %6 = getelementptr [1024 x i32], [1024 x i32]* @ub, i64 0, i64 %input1
%7 = bitcast i32* %6 to <4 x i32>*
%wide.load40 = load <4 x i32>, <4 x i32>* %7, align 4
%.sum87 = add i64 %index, 44
@@ -66,7 +66,7 @@ vector.body: ; preds = %vector.body, %entry
%18 = getelementptr [1024 x i32], [1024 x i32]* @uc, i64 0, i64 %.sum95
%19 = bitcast i32* %18 to <4 x i32>*
%wide.load47 = load <4 x i32>, <4 x i32>* %19, align 4
- %20 = getelementptr [1024 x i32], [1024 x i32]* @uc, i64 0, i64 undef
+ %20 = getelementptr [1024 x i32], [1024 x i32]* @uc, i64 0, i64 %input2
%21 = bitcast i32* %20 to <4 x i32>*
%wide.load48 = load <4 x i32>, <4 x i32>* %21, align 4
%.sum97 = add i64 %index, 28
@@ -126,7 +126,16 @@ middle.block: ; preds = %vector.body
%.lcssa103 = phi <4 x i32> [ %45, %vector.body ]
%.lcssa102 = phi <4 x i32> [ %44, %vector.body ]
%.lcssa = phi <4 x i32> [ %43, %vector.body ]
- ret void
+ %54 = add <4 x i32> %.lcssa112, %.lcssa111
+ %55 = add <4 x i32> %.lcssa110, %54
+ %56 = add <4 x i32> %.lcssa109, %55
+ %57 = add <4 x i32> %.lcssa108, %56
+ %58 = add <4 x i32> %.lcssa107, %57
+ %59 = add <4 x i32> %.lcssa106, %58
+ %60 = add <4 x i32> %.lcssa105, %59
+ %61 = add <4 x i32> %.lcssa103, %60
+ %62 = add <4 x i32> %.lcssa102, %61
+ ret <4 x i32> %62
}
attributes #0 = { noinline nounwind }
diff --git a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
index d6940e46df37..ce8a9bb4e3fa 100644
--- a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
+++ b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
@@ -1,6 +1,4 @@
-; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
-; RUN: grep lxvd2x < %t | count 18
-; RUN: grep stxvd2x < %t | count 18
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
@@ -18,150 +16,60 @@
define void @test1() {
entry:
; CHECK-LABEL: test1
- %__a.addr.i31 = alloca i32, align 4
- %__b.addr.i32 = alloca <4 x i32>*, align 8
- %__a.addr.i29 = alloca i32, align 4
- %__b.addr.i30 = alloca <4 x float>*, align 8
- %__a.addr.i27 = alloca i32, align 4
- %__b.addr.i28 = alloca <2 x i64>*, align 8
- %__a.addr.i25 = alloca i32, align 4
- %__b.addr.i26 = alloca <2 x i64>*, align 8
- %__a.addr.i23 = alloca i32, align 4
- %__b.addr.i24 = alloca <2 x double>*, align 8
- %__a.addr.i20 = alloca <4 x i32>, align 16
- %__b.addr.i21 = alloca i32, align 4
- %__c.addr.i22 = alloca <4 x i32>*, align 8
- %__a.addr.i17 = alloca <4 x i32>, align 16
- %__b.addr.i18 = alloca i32, align 4
- %__c.addr.i19 = alloca <4 x i32>*, align 8
- %__a.addr.i14 = alloca <4 x float>, align 16
- %__b.addr.i15 = alloca i32, align 4
- %__c.addr.i16 = alloca <4 x float>*, align 8
- %__a.addr.i11 = alloca <2 x i64>, align 16
- %__b.addr.i12 = alloca i32, align 4
- %__c.addr.i13 = alloca <2 x i64>*, align 8
- %__a.addr.i8 = alloca <2 x i64>, align 16
- %__b.addr.i9 = alloca i32, align 4
- %__c.addr.i10 = alloca <2 x i64>*, align 8
- %__a.addr.i6 = alloca <2 x double>, align 16
- %__b.addr.i7 = alloca i32, align 4
- %__c.addr.i = alloca <2 x double>*, align 8
- %__a.addr.i = alloca i32, align 4
- %__b.addr.i = alloca <4 x i32>*, align 8
- store i32 0, i32* %__a.addr.i, align 4
- store <4 x i32>* @vsi, <4 x i32>** %__b.addr.i, align 8
- %0 = load i32, i32* %__a.addr.i, align 4
- %1 = load <4 x i32>*, <4 x i32>** %__b.addr.i, align 8
- %2 = bitcast <4 x i32>* %1 to i8*
- %3 = getelementptr i8, i8* %2, i32 %0
- %4 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %3)
- store <4 x i32> %4, <4 x i32>* @res_vsi, align 16
- store i32 0, i32* %__a.addr.i31, align 4
- store <4 x i32>* @vui, <4 x i32>** %__b.addr.i32, align 8
- %5 = load i32, i32* %__a.addr.i31, align 4
- %6 = load <4 x i32>*, <4 x i32>** %__b.addr.i32, align 8
- %7 = bitcast <4 x i32>* %6 to i8*
- %8 = getelementptr i8, i8* %7, i32 %5
- %9 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %8)
- store <4 x i32> %9, <4 x i32>* @res_vui, align 16
- store i32 0, i32* %__a.addr.i29, align 4
- store <4 x float>* @vf, <4 x float>** %__b.addr.i30, align 8
- %10 = load i32, i32* %__a.addr.i29, align 4
- %11 = load <4 x float>*, <4 x float>** %__b.addr.i30, align 8
- %12 = bitcast <4 x float>* %11 to i8*
- %13 = getelementptr i8, i8* %12, i32 %10
- %14 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %13)
- %15 = bitcast <4 x i32> %14 to <4 x float>
- store <4 x float> %15, <4 x float>* @res_vf, align 16
- store i32 0, i32* %__a.addr.i27, align 4
- store <2 x i64>* @vsll, <2 x i64>** %__b.addr.i28, align 8
- %16 = load i32, i32* %__a.addr.i27, align 4
- %17 = load <2 x i64>*, <2 x i64>** %__b.addr.i28, align 8
- %18 = bitcast <2 x i64>* %17 to i8*
- %19 = getelementptr i8, i8* %18, i32 %16
- %20 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %19)
- %21 = bitcast <2 x double> %20 to <2 x i64>
- store <2 x i64> %21, <2 x i64>* @res_vsll, align 16
- store i32 0, i32* %__a.addr.i25, align 4
- store <2 x i64>* @vull, <2 x i64>** %__b.addr.i26, align 8
- %22 = load i32, i32* %__a.addr.i25, align 4
- %23 = load <2 x i64>*, <2 x i64>** %__b.addr.i26, align 8
- %24 = bitcast <2 x i64>* %23 to i8*
- %25 = getelementptr i8, i8* %24, i32 %22
- %26 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %25)
- %27 = bitcast <2 x double> %26 to <2 x i64>
- store <2 x i64> %27, <2 x i64>* @res_vull, align 16
- store i32 0, i32* %__a.addr.i23, align 4
- store <2 x double>* @vd, <2 x double>** %__b.addr.i24, align 8
- %28 = load i32, i32* %__a.addr.i23, align 4
- %29 = load <2 x double>*, <2 x double>** %__b.addr.i24, align 8
- %30 = bitcast <2 x double>* %29 to i8*
- %31 = getelementptr i8, i8* %30, i32 %28
- %32 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %31)
- store <2 x double> %32, <2 x double>* @res_vd, align 16
- %33 = load <4 x i32>, <4 x i32>* @vsi, align 16
- store <4 x i32> %33, <4 x i32>* %__a.addr.i20, align 16
- store i32 0, i32* %__b.addr.i21, align 4
- store <4 x i32>* @res_vsi, <4 x i32>** %__c.addr.i22, align 8
- %34 = load <4 x i32>, <4 x i32>* %__a.addr.i20, align 16
- %35 = load i32, i32* %__b.addr.i21, align 4
- %36 = load <4 x i32>*, <4 x i32>** %__c.addr.i22, align 8
- %37 = bitcast <4 x i32>* %36 to i8*
- %38 = getelementptr i8, i8* %37, i32 %35
- call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %34, i8* %38)
- %39 = load <4 x i32>, <4 x i32>* @vui, align 16
- store <4 x i32> %39, <4 x i32>* %__a.addr.i17, align 16
- store i32 0, i32* %__b.addr.i18, align 4
- store <4 x i32>* @res_vui, <4 x i32>** %__c.addr.i19, align 8
- %40 = load <4 x i32>, <4 x i32>* %__a.addr.i17, align 16
- %41 = load i32, i32* %__b.addr.i18, align 4
- %42 = load <4 x i32>*, <4 x i32>** %__c.addr.i19, align 8
- %43 = bitcast <4 x i32>* %42 to i8*
- %44 = getelementptr i8, i8* %43, i32 %41
- call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %40, i8* %44)
- %45 = load <4 x float>, <4 x float>* @vf, align 16
- store <4 x float> %45, <4 x float>* %__a.addr.i14, align 16
- store i32 0, i32* %__b.addr.i15, align 4
- store <4 x float>* @res_vf, <4 x float>** %__c.addr.i16, align 8
- %46 = load <4 x float>, <4 x float>* %__a.addr.i14, align 16
- %47 = bitcast <4 x float> %46 to <4 x i32>
- %48 = load i32, i32* %__b.addr.i15, align 4
- %49 = load <4 x float>*, <4 x float>** %__c.addr.i16, align 8
- %50 = bitcast <4 x float>* %49 to i8*
- %51 = getelementptr i8, i8* %50, i32 %48
- call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %47, i8* %51) #1
- %52 = load <2 x i64>, <2 x i64>* @vsll, align 16
- store <2 x i64> %52, <2 x i64>* %__a.addr.i11, align 16
- store i32 0, i32* %__b.addr.i12, align 4
- store <2 x i64>* @res_vsll, <2 x i64>** %__c.addr.i13, align 8
- %53 = load <2 x i64>, <2 x i64>* %__a.addr.i11, align 16
- %54 = bitcast <2 x i64> %53 to <2 x double>
- %55 = load i32, i32* %__b.addr.i12, align 4
- %56 = load <2 x i64>*, <2 x i64>** %__c.addr.i13, align 8
- %57 = bitcast <2 x i64>* %56 to i8*
- %58 = getelementptr i8, i8* %57, i32 %55
- call void @llvm.ppc.vsx.stxvd2x(<2 x double> %54, i8* %58)
- %59 = load <2 x i64>, <2 x i64>* @vull, align 16
- store <2 x i64> %59, <2 x i64>* %__a.addr.i8, align 16
- store i32 0, i32* %__b.addr.i9, align 4
- store <2 x i64>* @res_vull, <2 x i64>** %__c.addr.i10, align 8
- %60 = load <2 x i64>, <2 x i64>* %__a.addr.i8, align 16
- %61 = bitcast <2 x i64> %60 to <2 x double>
- %62 = load i32, i32* %__b.addr.i9, align 4
- %63 = load <2 x i64>*, <2 x i64>** %__c.addr.i10, align 8
- %64 = bitcast <2 x i64>* %63 to i8*
- %65 = getelementptr i8, i8* %64, i32 %62
- call void @llvm.ppc.vsx.stxvd2x(<2 x double> %61, i8* %65)
- %66 = load <2 x double>, <2 x double>* @vd, align 16
- store <2 x double> %66, <2 x double>* %__a.addr.i6, align 16
- store i32 0, i32* %__b.addr.i7, align 4
- store <2 x double>* @res_vd, <2 x double>** %__c.addr.i, align 8
- %67 = load <2 x double>, <2 x double>* %__a.addr.i6, align 16
- %68 = load i32, i32* %__b.addr.i7, align 4
- %69 = load <2 x double>*, <2 x double>** %__c.addr.i, align 8
- %70 = bitcast <2 x double>* %69 to i8*
- %71 = getelementptr i8, i8* %70, i32 %68
- call void @llvm.ppc.vsx.stxvd2x(<2 x double> %67, i8* %71)
+; CHECK: lxvd2x
+ %0 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x i32>* @vsi to i8*))
+; CHECK: stxvd2x
+ store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
+; CHECK: lxvd2x
+ %1 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x i32>* @vui to i8*))
+; CHECK: stxvd2x
+ store <4 x i32> %1, <4 x i32>* @res_vui, align 16
+; CHECK: lxvd2x
+ %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x float>* @vf to i8*))
+ %3 = bitcast <4 x i32> %2 to <4 x float>
+; CHECK: stxvd2x
+ store <4 x float> %3, <4 x float>* @res_vf, align 16
+; CHECK: lxvd2x
+ %4 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x i64>* @vsll to i8*))
+ %5 = bitcast <2 x double> %4 to <2 x i64>
+; CHECK: stxvd2x
+ store <2 x i64> %5, <2 x i64>* @res_vsll, align 16
+; CHECK: lxvd2x
+ %6 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x i64>* @vull to i8*))
+ %7 = bitcast <2 x double> %6 to <2 x i64>
+; CHECK: stxvd2x
+ store <2 x i64> %7, <2 x i64>* @res_vull, align 16
+; CHECK: lxvd2x
+ %8 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x double>* @vd to i8*))
+; CHECK: stxvd2x
+ store <2 x double> %8, <2 x double>* @res_vd, align 16
+; CHECK: lxvd2x
+ %9 = load <4 x i32>, <4 x i32>* @vsi, align 16
+; CHECK: stxvd2x
+ call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %9, i8* bitcast (<4 x i32>* @res_vsi to i8*))
+; CHECK: lxvd2x
+ %10 = load <4 x i32>, <4 x i32>* @vui, align 16
+; CHECK: stxvd2x
+ call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %10, i8* bitcast (<4 x i32>* @res_vui to i8*))
+; CHECK: lxvd2x
+ %11 = load <4 x float>, <4 x float>* @vf, align 16
+ %12 = bitcast <4 x float> %11 to <4 x i32>
+; CHECK: stxvd2x
+ call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %12, i8* bitcast (<4 x float>* @res_vf to i8*))
+; CHECK: lxvd2x
+ %13 = load <2 x i64>, <2 x i64>* @vsll, align 16
+ %14 = bitcast <2 x i64> %13 to <2 x double>
+; CHECK: stxvd2x
+ call void @llvm.ppc.vsx.stxvd2x(<2 x double> %14, i8* bitcast (<2 x i64>* @res_vsll to i8*))
+; CHECK: lxvd2x
+ %15 = load <2 x i64>, <2 x i64>* @vull, align 16
+ %16 = bitcast <2 x i64> %15 to <2 x double>
+; CHECK: stxvd2x
+ call void @llvm.ppc.vsx.stxvd2x(<2 x double> %16, i8* bitcast (<2 x i64>* @res_vull to i8*))
+; CHECK: lxvd2x
+ %17 = load <2 x double>, <2 x double>* @vd, align 16
+; CHECK: stxvd2x
+ call void @llvm.ppc.vsx.stxvd2x(<2 x double> %17, i8* bitcast (<2 x double>* @res_vd to i8*))
ret void
}
diff --git a/test/CodeGen/PowerPC/vsx-word-splats.ll b/test/CodeGen/PowerPC/vsx-word-splats.ll
new file mode 100644
index 000000000000..5632011da35d
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-word-splats.ll
@@ -0,0 +1,147 @@
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s \
+; RUN: --check-prefix=CHECK-BE
+
+define <4 x float> @test0f(<4 x float> %a) {
+entry:
+ %0 = bitcast <4 x float> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ ret <4 x float> %2
+; CHECK-LABEL: test0f
+; CHECK: xxspltw 34, 34, 3
+; CHECK-BE-LABEL: test0f
+; CHECK-BE: xxspltw 34, 34, 0
+}
+
+define <4 x float> @test1f(<4 x float> %a) {
+entry:
+ %0 = bitcast <4 x float> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ ret <4 x float> %2
+; CHECK-LABEL: test1f
+; CHECK: xxspltw 34, 34, 2
+; CHECK-BE-LABEL: test1f
+; CHECK-BE: xxspltw 34, 34, 1
+}
+
+define <4 x float> @test2f(<4 x float> %a) {
+entry:
+ %0 = bitcast <4 x float> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ ret <4 x float> %2
+; CHECK-LABEL: test2f
+; CHECK: xxspltw 34, 34, 1
+; CHECK-BE-LABEL: test2f
+; CHECK-BE: xxspltw 34, 34, 2
+}
+
+define <4 x float> @test3f(<4 x float> %a) {
+entry:
+ %0 = bitcast <4 x float> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15>
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ ret <4 x float> %2
+; CHECK-LABEL: test3f
+; CHECK: xxspltw 34, 34, 0
+; CHECK-BE-LABEL: test3f
+; CHECK-BE: xxspltw 34, 34, 3
+}
+
+define <4 x i32> @test0si(<4 x i32> %a) {
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+; CHECK-LABEL: test0si
+; CHECK: xxspltw 34, 34, 3
+; CHECK-BE-LABEL: test0si
+; CHECK-BE: xxspltw 34, 34, 0
+}
+
+define <4 x i32> @test1si(<4 x i32> %a) {
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+; CHECK-LABEL: test1si
+; CHECK: xxspltw 34, 34, 2
+; CHECK-BE-LABEL: test1si
+; CHECK-BE: xxspltw 34, 34, 1
+}
+
+define <4 x i32> @test2si(<4 x i32> %a) {
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+; CHECK-LABEL: test2si
+; CHECK: xxspltw 34, 34, 1
+; CHECK-BE-LABEL: test2si
+; CHECK-BE: xxspltw 34, 34, 2
+}
+
+define <4 x i32> @test3si(<4 x i32> %a) {
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+; CHECK-LABEL: test3si
+; CHECK: xxspltw 34, 34, 0
+; CHECK-BE-LABEL: test3si
+; CHECK-BE: xxspltw 34, 34, 3
+}
+
+define <4 x i32> @test0ui(<4 x i32> %a) {
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+; CHECK-LABEL: test0ui
+; CHECK: xxspltw 34, 34, 3
+; CHECK-BE-LABEL: test0ui
+; CHECK-BE: xxspltw 34, 34, 0
+}
+
+define <4 x i32> @test1ui(<4 x i32> %a) {
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+; CHECK-LABEL: test1ui
+; CHECK: xxspltw 34, 34, 2
+; CHECK-BE-LABEL: test1ui
+; CHECK-BE: xxspltw 34, 34, 1
+}
+
+define <4 x i32> @test2ui(<4 x i32> %a) {
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+; CHECK-LABEL: test2ui
+; CHECK: xxspltw 34, 34, 1
+; CHECK-BE-LABEL: test2ui
+; CHECK-BE: xxspltw 34, 34, 2
+}
+
+define <4 x i32> @test3ui(<4 x i32> %a) {
+entry:
+ %0 = bitcast <4 x i32> %a to <16 x i8>
+ %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15>
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+; CHECK-LABEL: test3ui
+; CHECK: xxspltw 34, 34, 0
+; CHECK-BE-LABEL: test3ui
+; CHECK-BE: xxspltw 34, 34, 3
+}
diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll
index b2eefb666760..9b65649978d3 100644
--- a/test/CodeGen/PowerPC/vsx.ll
+++ b/test/CodeGen/PowerPC/vsx.ll
@@ -1144,62 +1144,67 @@ define <2 x double> @test68(<2 x i32> %a) {
ret <2 x double> %w
; CHECK-LABEL: @test68
-; CHECK: xxsldwi [[V1:[0-9]+]], 34, 34, 1
+; CHECK: xxmrghw [[V1:[0-9]+]]
; CHECK: xvcvsxwdp 34, [[V1]]
; CHECK: blr
; CHECK-LE-LABEL: @test68
-; CHECK-LE: xxsldwi [[V1:[0-9]+]], 34, 34, 1
+; CHECK-LE: xxmrglw [[V1:[0-9]+]], 34, 34
; CHECK-LE: xvcvsxwdp 34, [[V1]]
; CHECK-LE: blr
}
+; This gets scalarized so the code isn't great
define <2 x double> @test69(<2 x i16> %a) {
%w = sitofp <2 x i16> %a to <2 x double>
ret <2 x double> %w
; CHECK-LABEL: @test69
-; CHECK: vspltisw [[V1:[0-9]+]], 8
-; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
-; CHECK: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
-; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]]
-; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
-; CHECK: xvcvsxwdp 34, [[V4]]
+; CHECK-DAG: lfiwax
+; CHECK-DAG: lfiwax
+; CHECK-DAG: xscvsxddp
+; CHECK-DAG: xscvsxddp
+; CHECK: xxmrghd
; CHECK: blr
; CHECK-LE-LABEL: @test69
-; CHECK-LE: vspltisw [[V1:[0-9]+]], 8
-; CHECK-LE: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
-; CHECK-LE: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
-; CHECK-LE: vsraw {{[0-9]+}}, [[V3]], [[V2]]
-; CHECK-LE: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
-; CHECK-LE: xvcvsxwdp 34, [[V4]]
+; CHECK-LE: mfvsrd
+; CHECK-LE: mtvsrwa
+; CHECK-LE: mtvsrwa
+; CHECK-LE: xscvsxddp
+; CHECK-LE: xscvsxddp
+; CHECK-LE: xxspltd
+; CHECK-LE: xxspltd
+; CHECK-LE: xxmrgld
; CHECK-LE: blr
}
+; This gets scalarized so the code isn't great
define <2 x double> @test70(<2 x i8> %a) {
%w = sitofp <2 x i8> %a to <2 x double>
ret <2 x double> %w
; CHECK-LABEL: @test70
-; CHECK: vspltisw [[V1:[0-9]+]], 12
-; CHECK: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
-; CHECK: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
-; CHECK: vsraw {{[0-9]+}}, [[V3]], [[V2]]
-; CHECK: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
-; CHECK: xvcvsxwdp 34, [[V4]]
+; CHECK-DAG: lfiwax
+; CHECK-DAG: lfiwax
+; CHECK-DAG: xscvsxddp
+; CHECK-DAG: xscvsxddp
+; CHECK: xxmrghd
; CHECK: blr
; CHECK-LE-LABEL: @test70
-; CHECK-LE: vspltisw [[V1:[0-9]+]], 12
-; CHECK-LE: vadduwm [[V2:[0-9]+]], [[V1]], [[V1]]
-; CHECK-LE: vslw [[V3:[0-9]+]], {{[0-9]+}}, [[V2]]
-; CHECK-LE: vsraw {{[0-9]+}}, [[V3]], [[V2]]
-; CHECK-LE: xxsldwi [[V4:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}, 1
-; CHECK-LE: xvcvsxwdp 34, [[V4]]
+; CHECK-LE: mfvsrd
+; CHECK-LE: mtvsrwa
+; CHECK-LE: mtvsrwa
+; CHECK-LE: xscvsxddp
+; CHECK-LE: xscvsxddp
+; CHECK-LE: xxspltd
+; CHECK-LE: xxspltd
+; CHECK-LE: xxmrgld
; CHECK-LE: blr
}
+; This gets scalarized so the code isn't great
define <2 x i32> @test80(i32 %v) {
%b1 = insertelement <2 x i32> undef, i32 %v, i32 0
%b2 = shufflevector <2 x i32> %b1, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -1207,31 +1212,38 @@ define <2 x i32> @test80(i32 %v) {
ret <2 x i32> %i
; CHECK-REG-LABEL: @test80
-; CHECK-REG-DAG: addi [[R1:[0-9]+]], 3, 3
-; CHECK-REG-DAG: addi [[R2:[0-9]+]], 1, -16
-; CHECK-REG-DAG: addi [[R3:[0-9]+]], 3, 2
-; CHECK-REG: std [[R1]], -8(1)
-; CHECK-REG: std [[R3]], -16(1)
-; CHECK-REG: lxvd2x 34, 0, [[R2]]
-; CHECK-REG-NOT: stxvd2x
+; CHECK-REG: stw 3, -16(1)
+; CHECK-REG: addi [[R1:[0-9]+]], 1, -16
+; CHECK-REG: addis [[R2:[0-9]+]]
+; CHECK-REG: addi [[R2]], [[R2]]
+; CHECK-REG-DAG: lxvw4x [[VS1:[0-9]+]], 0, [[R1]]
+; CHECK-REG-DAG: lxvw4x 35, 0, [[R2]]
+; CHECK-REG: xxspltw 34, [[VS1]], 0
+; CHECK-REG: vadduwm 2, 2, 3
+; CHECK-REG-NOT: stxvw4x
; CHECK-REG: blr
; CHECK-FISL-LABEL: @test80
-; CHECK-FISL-DAG: addi [[R1:[0-9]+]], 3, 3
-; CHECK-FISL-DAG: addi [[R2:[0-9]+]], 1, -16
-; CHECK-FISL-DAG: addi [[R3:[0-9]+]], 3, 2
-; CHECK-FISL-DAG: std [[R1]], -8(1)
-; CHECK-FISL-DAG: std [[R3]], -16(1)
-; CHECK-FISL-DAG: lxvd2x 0, 0, [[R2]]
+; CHECK-FISL: mr 4, 3
+; CHECK-FISL: stw 4, -16(1)
+; CHECK-FISL: addi [[R1:[0-9]+]], 1, -16
+; CHECK-FISL-DAG: lxvw4x [[VS1:[0-9]+]], 0, [[R1]]
+; CHECK-FISL-DAG: xxspltw {{[0-9]+}}, [[VS1]], 0
+; CHECK-FISL: addis [[R2:[0-9]+]]
+; CHECK-FISL: addi [[R2]], [[R2]]
+; CHECK-FISL-DAG: lxvw4x {{[0-9]+}}, 0, [[R2]]
+; CHECK-FISL: vadduwm
+; CHECK-FISL-NOT: stxvw4x
; CHECK-FISL: blr
; CHECK-LE-LABEL: @test80
; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3
+; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]]
; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI
; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]]
-; CHECK-LE-DAG: xxspltd 34, [[R1]]
+; CHECK-LE-DAG: xxspltw 34, [[V1]]
; CHECK-LE-DAG: xxswapd 35, [[V2]]
-; CHECK-LE: vaddudm 2, 2, 3
+; CHECK-LE: vadduwm 2, 2, 3
; CHECK-LE: blr
}
diff --git a/test/CodeGen/PowerPC/weak_def_can_be_hidden.ll b/test/CodeGen/PowerPC/weak_def_can_be_hidden.ll
index 0b87613bb4d8..3760f1bb1657 100644
--- a/test/CodeGen/PowerPC/weak_def_can_be_hidden.ll
+++ b/test/CodeGen/PowerPC/weak_def_can_be_hidden.ll
@@ -3,7 +3,7 @@
; RUN: llc -mtriple=powerpc-apple-darwin9 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
; RUN: llc -mtriple=powerpc-apple-darwin8 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
-@v1 = linkonce_odr constant i32 32
+@v1 = linkonce_odr local_unnamed_addr constant i32 32
; CHECK: .globl _v1
; CHECK: .weak_def_can_be_hidden _v1
@@ -26,7 +26,7 @@ define i32* @f2() {
ret i32* @v2
}
-@v3 = linkonce_odr unnamed_addr global i32 32
+@v3 = linkonce_odr unnamed_addr constant i32 32
; CHECK: .globl _v3
; CHECK: .weak_def_can_be_hidden _v3
@@ -37,9 +37,9 @@ define i32* @f3() {
ret i32* @v3
}
-@v4 = linkonce_odr global i32 32
+@v4 = linkonce_odr unnamed_addr global i32 32
; CHECK: .globl _v4
-; CHECK: .weak_definition _v4
+; CHECK: .weak_def_can_be_hidden _v4
; CHECK-D89: .globl _v4
; CHECK-D89: .weak_definition _v4
diff --git a/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll b/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll
index ef63233e746b..fd8adff5a1d7 100644
--- a/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll
+++ b/test/CodeGen/PowerPC/xvcmpeqdp-v2f64.ll
@@ -3,22 +3,24 @@ target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
; Function Attrs: nounwind
-define void @__fmax_double3_3D_exec() #0 {
+define void @__fmax_double3_3D_exec(<3 x double> %input1, <3 x i64> %input2,
+ <3 x i1> %input3, <3 x i64> %input4,
+ <3 x i64> %input5, <4 x double>* %input6) #0 {
entry:
br i1 undef, label %if.then.i, label %fmax_double3.exit
if.then.i: ; preds = %entry
- %cmp24.i.i = fcmp ord <3 x double> undef, zeroinitializer
+ %cmp24.i.i = fcmp ord <3 x double> %input1, zeroinitializer
%sext25.i.i = sext <3 x i1> %cmp24.i.i to <3 x i64>
%neg.i.i = xor <3 x i64> %sext25.i.i, <i64 -1, i64 -1, i64 -1>
- %or.i.i = or <3 x i64> undef, %neg.i.i
- %neg.i.i.i = select <3 x i1> undef, <3 x i64> zeroinitializer, <3 x i64> %sext25.i.i
- %and.i.i.i = and <3 x i64> undef, %neg.i.i.i
- %and26.i.i.i = and <3 x i64> undef, %or.i.i
+ %or.i.i = or <3 x i64> %input2, %neg.i.i
+ %neg.i.i.i = select <3 x i1> %input3, <3 x i64> zeroinitializer, <3 x i64> %sext25.i.i
+ %and.i.i.i = and <3 x i64> %input4, %neg.i.i.i
+ %and26.i.i.i = and <3 x i64> %input5, %or.i.i
%or.i.i.i = or <3 x i64> %and.i.i.i, %and26.i.i.i
%astype32.i.i.i = bitcast <3 x i64> %or.i.i.i to <3 x double>
%extractVec33.i.i.i = shufflevector <3 x double> %astype32.i.i.i, <3 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
- store <4 x double> %extractVec33.i.i.i, <4 x double>* undef, align 32
+ store <4 x double> %extractVec33.i.i.i, <4 x double>* %input6, align 32
br label %fmax_double3.exit
; CHECK-LABEL: @__fmax_double3_3D_exec
diff --git a/test/CodeGen/SPARC/2011-01-11-CC.ll b/test/CodeGen/SPARC/2011-01-11-CC.ll
index 6ea78dd7e169..6b738e386c3a 100755
--- a/test/CodeGen/SPARC/2011-01-11-CC.ll
+++ b/test/CodeGen/SPARC/2011-01-11-CC.ll
@@ -70,7 +70,7 @@ entry:
;V8: {{fbe|fbne}}
;V9-LABEL: test_select_int_fcc:
;V9: fcmps
-;V9-NEXT-NOT: nop
+;V9-NOT: nop
;V9-NOT: {{fbe|fbne}}
;V9: mov{{e|ne}} %fcc0
%0 = fcmp une float %f, 0.000000e+00
@@ -101,7 +101,7 @@ entry:
;V8: {{fbne|fbe}}
;V9-LABEL: test_select_dfp_fcc:
;V9: fcmpd
-;V9-NEXT-NOT: nop
+;V9-NOT: nop
;V9-NOT: {{fbne|fbe}}
;V9: fmovd{{e|ne}} %fcc0
%0 = fcmp une double %f, 0.000000e+00
diff --git a/test/CodeGen/SPARC/32abi.ll b/test/CodeGen/SPARC/32abi.ll
index 7ac1de5c0904..09e7a3a09d86 100644
--- a/test/CodeGen/SPARC/32abi.ll
+++ b/test/CodeGen/SPARC/32abi.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=sparc -disable-sparc-delay-filler -disable-sparc-leaf-proc | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
-; RUN: llc < %s -march=sparcel -disable-sparc-delay-filler -disable-sparc-leaf-proc | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc < %s -march=sparc -disable-sparc-delay-filler -disable-sparc-leaf-proc | FileCheck %s --check-prefix=CHECK --check-prefix=HARD --check-prefix=CHECK-BE
+; RUN: llc < %s -march=sparcel -disable-sparc-delay-filler -disable-sparc-leaf-proc | FileCheck %s --check-prefix=CHECK --check-prefix=HARD --check-prefix=CHECK-LE
+; RUN: llc < %s -march=sparc -disable-sparc-delay-filler -disable-sparc-leaf-proc -mattr=soft-float | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT --check-prefix=CHECK-BE
; CHECK-LABEL: intarg:
; The save/restore frame is not strictly necessary here, but we would need to
@@ -55,29 +56,82 @@ define void @call_intarg(i32 %i0, i8* %i1) {
;; straddling the boundary of regs and mem, and floats in regs and mem.
;
; CHECK-LABEL: floatarg:
-; CHECK: save %sp, -120, %sp
-; CHECK: mov %i5, %g2
-; CHECK-NEXT: ld [%fp+92], %g3
-; CHECK-NEXT: mov %i4, %i5
-; CHECK-NEXT: std %g2, [%fp+-24]
-; CHECK-NEXT: mov %i3, %i4
-; CHECK-NEXT: std %i4, [%fp+-16]
-; CHECK-NEXT: std %i0, [%fp+-8]
-; CHECK-NEXT: st %i2, [%fp+-28]
-; CHECK-NEXT: ld [%fp+104], %f0
-; CHECK-NEXT: ldd [%fp+96], %f2
-; CHECK-NEXT: ld [%fp+-28], %f1
-; CHECK-NEXT: ldd [%fp+-8], %f4
-; CHECK-NEXT: ldd [%fp+-16], %f6
-; CHECK-NEXT: ldd [%fp+-24], %f8
-; CHECK-NEXT: fstod %f1, %f10
-; CHECK-NEXT: faddd %f4, %f10, %f4
-; CHECK-NEXT: faddd %f6, %f4, %f4
-; CHECK-NEXT: faddd %f8, %f4, %f4
-; CHECK-NEXT: faddd %f2, %f4, %f2
-; CHECK-NEXT: fstod %f0, %f0
-; CHECK-NEXT: faddd %f0, %f2, %f0
-; CHECK-NEXT: restore
+; HARD: save %sp, -120, %sp
+; HARD: mov %i5, %g2
+; HARD-NEXT: ld [%fp+92], %g3
+; HARD-NEXT: mov %i4, %i5
+; HARD-NEXT: ! kill
+; HARD-NEXT: std %g2, [%fp+-24]
+; HARD-NEXT: mov %i3, %i4
+; HARD-NEXT: std %i4, [%fp+-16]
+; HARD-NEXT: ! kill
+; HARD-NEXT: std %i0, [%fp+-8]
+; HARD-NEXT: st %i2, [%fp+-28]
+; HARD-NEXT: ld [%fp+104], %f0
+; HARD-NEXT: ldd [%fp+96], %f2
+; HARD-NEXT: ld [%fp+-28], %f1
+; HARD-NEXT: ldd [%fp+-8], %f4
+; HARD-NEXT: ldd [%fp+-16], %f6
+; HARD-NEXT: ldd [%fp+-24], %f8
+; HARD-NEXT: fstod %f1, %f10
+; HARD-NEXT: faddd %f4, %f10, %f4
+; HARD-NEXT: faddd %f6, %f4, %f4
+; HARD-NEXT: faddd %f8, %f4, %f4
+; HARD-NEXT: faddd %f2, %f4, %f2
+; HARD-NEXT: fstod %f0, %f0
+; HARD-NEXT: faddd %f0, %f2, %f0
+; SOFT: save %sp, -96, %sp
+; SOFT: ld [%fp+104], %l0
+; SOFT-NEXT: ld [%fp+96], %l1
+; SOFT-NEXT: ld [%fp+100], %l2
+; SOFT-NEXT: ld [%fp+92], %l3
+; SOFT-NEXT: mov %i2, %o0
+; SOFT-NEXT: call __extendsfdf2
+; SOFT-NEXT: nop
+; SOFT-NEXT: mov %o0, %i2
+; SOFT-NEXT: mov %o1, %g2
+; SOFT-NEXT: mov %i0, %o0
+; SOFT-NEXT: mov %i1, %o1
+; SOFT-NEXT: mov %i2, %o2
+; SOFT-NEXT: mov %g2, %o3
+; SOFT-NEXT: call __adddf3
+; SOFT-NEXT: nop
+; SOFT-NEXT: mov %o0, %i0
+; SOFT-NEXT: mov %o1, %i1
+; SOFT-NEXT: mov %i3, %o0
+; SOFT-NEXT: mov %i4, %o1
+; SOFT-NEXT: mov %i0, %o2
+; SOFT-NEXT: mov %i1, %o3
+; SOFT-NEXT: call __adddf3
+; SOFT-NEXT: nop
+; SOFT-NEXT: mov %o0, %i0
+; SOFT-NEXT: mov %o1, %i1
+; SOFT-NEXT: mov %i5, %o0
+; SOFT-NEXT: mov %l3, %o1
+; SOFT-NEXT: mov %i0, %o2
+; SOFT-NEXT: mov %i1, %o3
+; SOFT-NEXT: call __adddf3
+; SOFT-NEXT: nop
+; SOFT-NEXT: mov %o0, %i0
+; SOFT-NEXT: mov %o1, %i1
+; SOFT-NEXT: mov %l1, %o0
+; SOFT-NEXT: mov %l2, %o1
+; SOFT-NEXT: mov %i0, %o2
+; SOFT-NEXT: mov %i1, %o3
+; SOFT-NEXT: call __adddf3
+; SOFT-NEXT: nop
+; SOFT-NEXT: mov %o0, %i0
+; SOFT-NEXT: mov %o1, %i1
+; SOFT-NEXT: mov %l0, %o0
+; SOFT-NEXT: call __extendsfdf2
+; SOFT-NEXT: nop
+; SOFT-NEXT: mov %i0, %o2
+; SOFT-NEXT: mov %i1, %o3
+; SOFT-NEXT: call __adddf3
+; SOFT-NEXT: nop
+; SOFT-NEXT: mov %o0, %i0
+; SOFT-NEXT: mov %o1, %i1
+; CHECK: restore
define double @floatarg(double %a0, ; %i0,%i1
float %a1, ; %i2
double %a2, ; %i3, %i4
@@ -95,18 +149,30 @@ define double @floatarg(double %a0, ; %i0,%i1
}
; CHECK-LABEL: call_floatarg:
-; CHECK: save %sp, -112, %sp
-; CHECK: mov %i2, %o1
-; CHECK-NEXT: mov %i1, %o0
-; CHECK-NEXT: st %i0, [%sp+104]
-; CHECK-NEXT: std %o0, [%sp+96]
-; CHECK-NEXT: st %o1, [%sp+92]
-; CHECK-NEXT: mov %i0, %o2
-; CHECK-NEXT: mov %o0, %o3
-; CHECK-NEXT: mov %o1, %o4
-; CHECK-NEXT: mov %o0, %o5
-; CHECK-NEXT: call floatarg
-; CHECK: std %f0, [%i4]
+; HARD: save %sp, -112, %sp
+; HARD: mov %i2, %o1
+; HARD-NEXT: mov %i1, %o0
+; HARD-NEXT: st %i0, [%sp+104]
+; HARD-NEXT: std %o0, [%sp+96]
+; HARD-NEXT: st %o1, [%sp+92]
+; HARD-NEXT: mov %i0, %o2
+; HARD-NEXT: mov %o0, %o3
+; HARD-NEXT: mov %o1, %o4
+; HARD-NEXT: mov %o0, %o5
+; HARD-NEXT: call floatarg
+; HARD: std %f0, [%i4]
+; SOFT: st %i0, [%sp+104]
+; SOFT-NEXT: st %i2, [%sp+100]
+; SOFT-NEXT: st %i1, [%sp+96]
+; SOFT-NEXT: st %i2, [%sp+92]
+; SOFT-NEXT: mov %i1, %o0
+; SOFT-NEXT: mov %i2, %o1
+; SOFT-NEXT: mov %i0, %o2
+; SOFT-NEXT: mov %i1, %o3
+; SOFT-NEXT: mov %i2, %o4
+; SOFT-NEXT: mov %i1, %o5
+; SOFT-NEXT: call floatarg
+; SOFT: std %o0, [%i4]
; CHECK: restore
define void @call_floatarg(float %f1, double %d2, float %f5, double *%p) {
%r = call double @floatarg(double %d2, float %f1, double %d2, double %d2,
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index 96104ecc3c68..b963be2e9853 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=sparcv9 -disable-sparc-delay-filler -disable-sparc-leaf-proc | FileCheck %s
+; RUN: llc < %s -march=sparcv9 -disable-sparc-delay-filler -disable-sparc-leaf-proc | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
+; RUN: llc < %s -march=sparcv9 -disable-sparc-delay-filler -disable-sparc-leaf-proc -mattr=soft-float | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
; CHECK-LABEL: intarg:
; The save/restore frame is not strictly necessary here, but we would need to
@@ -54,13 +55,22 @@ define void @call_intarg(i32 %i0, i8* %i1) {
}
; CHECK-LABEL: floatarg:
-; CHECK: save %sp, -128, %sp
-; CHECK: ld [%fp+2307], [[F:%f[0-9]+]]
-; CHECK: fstod %f1,
-; CHECK: faddd %f2,
-; CHECK: faddd %f4,
-; CHECK: faddd %f6,
-; CHECK: fadds %f31, [[F]]
+; HARD: save %sp, -128, %sp
+; HARD: ld [%fp+2307], [[F:%f[0-9]+]]
+; HARD: fstod %f1,
+; HARD: faddd %f2,
+; HARD: faddd %f4,
+; HARD: faddd %f6,
+; HARD: fadds %f31, [[F]]
+; SOFT: save %sp, -176, %sp
+; SOFT: srl %i0, 0, %o0
+; SOFT-NEXT: call __extendsfdf2
+; SOFT: mov %o0, %i0
+; SOFT: mov %i1, %o0
+; SOFT: mov %i2, %o0
+; SOFT: mov %i3, %o0
+; SOFT: ld [%fp+2299], %o0
+; SOFT: ld [%fp+2307], %o1
define double @floatarg(float %a0, ; %f1
double %a1, ; %d2
double %a2, ; %d4
@@ -92,13 +102,32 @@ define double @floatarg(float %a0, ; %f1
; CHECK-LABEL: call_floatarg:
; CHECK: save %sp, -272, %sp
; Store 8 bytes in full slot.
-; CHECK: std %f2, [%sp+2311]
+; HARD: std %f2, [%sp+2311]
; Store 4 bytes, right-aligned in slot.
-; CHECK: st %f1, [%sp+2307]
-; CHECK: fmovd %f2, %f4
+; HARD: st %f1, [%sp+2307]
+; HARD: fmovd %f2, %f4
+; SOFT: stx %i1, [%sp+2311]
+; SOFT: stx %i0, [%sp+2303]
+; SOFT: stx %i2, [%sp+2295]
+; SOFT: stx %i2, [%sp+2287]
+; SOFT: stx %i2, [%sp+2279]
+; SOFT: stx %i2, [%sp+2271]
+; SOFT: stx %i2, [%sp+2263]
+; SOFT: stx %i2, [%sp+2255]
+; SOFT: stx %i2, [%sp+2247]
+; SOFT: stx %i2, [%sp+2239]
+; SOFT: stx %i2, [%sp+2231]
+; SOFT: stx %i2, [%sp+2223]
+; SOFT: mov %i2, %o0
+; SOFT: mov %i1, %o1
+; SOFT: mov %i1, %o2
+; SOFT: mov %i1, %o3
+; SOFT: mov %i2, %o4
+; SOFT: mov %i2, %o5
; CHECK: call floatarg
; CHECK-NOT: add %sp
; CHECK: restore
+
define void @call_floatarg(float %f1, double %d2, float %f5, double *%p) {
%r = call double @floatarg(float %f5, double %d2, double %d2, double %d2,
float %f5, float %f5, float %f5, float %f5,
@@ -112,9 +141,21 @@ define void @call_floatarg(float %f1, double %d2, float %f5, double *%p) {
; CHECK-LABEL: mixedarg:
; CHECK: ldx [%fp+2247]
; CHECK: ldx [%fp+2231]
-; CHECK: fstod %f3
-; CHECK: faddd %f6
-; CHECK: faddd %f16
+; SOFT: ldx [%fp+2239], %i0
+; HARD: fstod %f3
+; HARD: faddd %f6
+; HARD: faddd %f16
+; SOFT: mov %o0, %i1
+; SOFT-NEXT: mov %i3, %o0
+; SOFT-NEXT: mov %i1, %o1
+; SOFT-NEXT: call __adddf3
+; SOFT: mov %o0, %i1
+; SOFT-NEXT: mov %i0, %o0
+; SOFT-NEXT: mov %i1, %o1
+; SOFT-NEXT: call __adddf3
+; HARD: std %f0, [%i1]
+; SOFT: stx %o0, [%i5]
+
define void @mixedarg(i8 %a0, ; %i0
float %a1, ; %f3
i16 %a2, ; %i2
@@ -135,12 +176,15 @@ define void @mixedarg(i8 %a0, ; %i0
; CHECK-LABEL: call_mixedarg:
; CHECK: stx %i2, [%sp+2247]
+; SOFT: stx %i1, [%sp+2239]
; CHECK: stx %i0, [%sp+2223]
-; CHECK: fmovd %f2, %f6
-; CHECK: fmovd %f2, %f16
+; HARD: fmovd %f2, %f6
+; HARD: fmovd %f2, %f16
+; SOFT: mov %i1, %o3
; CHECK: call mixedarg
; CHECK-NOT: add %sp
; CHECK: restore
+
define void @call_mixedarg(i64 %i0, double %f2, i16* %i2) {
call void @mixedarg(i8 undef,
float undef,
@@ -158,8 +202,10 @@ define void @call_mixedarg(i64 %i0, double %f2, i16* %i2) {
; The inreg attribute is used to indicate 32-bit sized struct elements that
; share an 8-byte slot.
; CHECK-LABEL: inreg_fi:
-; CHECK: fstoi %f1
-; CHECK: srlx %i0, 32, [[R:%[gilo][0-7]]]
+; SOFT: srlx %i0, 32, [[R:%[gilo][0-7]]]
+; HARD: fstoi %f1
+; SOFT: call __fixsfsi
+; HARD: srlx %i0, 32, [[R:%[gilo][0-7]]]
; CHECK: sub [[R]],
define i32 @inreg_fi(i32 inreg %a0, ; high bits of %i0
float inreg %a1) { ; %f1
@@ -171,8 +217,11 @@ define i32 @inreg_fi(i32 inreg %a0, ; high bits of %i0
; CHECK-LABEL: call_inreg_fi:
; Allocate space for 6 arguments, even when only 2 are used.
; CHECK: save %sp, -176, %sp
-; CHECK: sllx %i1, 32, %o0
-; CHECK: fmovs %f5, %f1
+; HARD: sllx %i1, 32, %o0
+; HARD: fmovs %f5, %f1
+; SOFT: srl %i2, 0, %i0
+; SOFT: sllx %i1, 32, %i1
+; SOFT: or %i1, %i0, %o0
; CHECK: call inreg_fi
define void @call_inreg_fi(i32* %p, i32 %i1, float %f5) {
%x = call i32 @inreg_fi(i32 %i1, float %f5)
@@ -180,7 +229,10 @@ define void @call_inreg_fi(i32* %p, i32 %i1, float %f5) {
}
; CHECK-LABEL: inreg_ff:
-; CHECK: fsubs %f0, %f1, %f0
+; HARD: fsubs %f0, %f1, %f0
+; SOFT: srlx %i0, 32, %o0
+; SOFT: srl %i0, 0, %o1
+; SOFT: call __subsf3
define float @inreg_ff(float inreg %a0, ; %f0
float inreg %a1) { ; %f1
%rv = fsub float %a0, %a1
@@ -188,8 +240,11 @@ define float @inreg_ff(float inreg %a0, ; %f0
}
; CHECK-LABEL: call_inreg_ff:
-; CHECK: fmovs %f3, %f0
-; CHECK: fmovs %f5, %f1
+; HARD: fmovs %f3, %f0
+; HARD: fmovs %f5, %f1
+; SOFT: srl %i2, 0, %i0
+; SOFT: sllx %i1, 32, %i1
+; SOFT: or %i1, %i0, %o0
; CHECK: call inreg_ff
define void @call_inreg_ff(i32* %p, float %f3, float %f5) {
%x = call float @inreg_ff(float %f3, float %f5)
@@ -197,7 +252,9 @@ define void @call_inreg_ff(i32* %p, float %f3, float %f5) {
}
; CHECK-LABEL: inreg_if:
-; CHECK: fstoi %f0
+; HARD: fstoi %f0
+; SOFT: srlx %i0, 32, %o0
+; SOFT: call __fixsfsi
; CHECK: sub %i0
define i32 @inreg_if(float inreg %a0, ; %f0
i32 inreg %a1) { ; low bits of %i0
@@ -207,8 +264,11 @@ define i32 @inreg_if(float inreg %a0, ; %f0
}
; CHECK-LABEL: call_inreg_if:
-; CHECK: fmovs %f3, %f0
-; CHECK: mov %i2, %o0
+; HARD: fmovs %f3, %f0
+; HARD: mov %i2, %o0
+; SOFT: srl %i2, 0, %i0
+; SOFT: sllx %i1, 32, %i1
+; SOFT: or %i1, %i0, %o0
; CHECK: call inreg_if
define void @call_inreg_if(i32* %p, float %f3, i32 %i2) {
%x = call i32 @inreg_if(float %f3, i32 %i2)
@@ -265,7 +325,8 @@ define void @call_ret_i64_pair(i64* %i0) {
; This is not a C struct, the i32 member uses 8 bytes, but the float only 4.
; CHECK-LABEL: ret_i32_float_pair:
; CHECK: ld [%i2], %i0
-; CHECK: ld [%i3], %f2
+; HARD: ld [%i3], %f2
+; SOFT: ld [%i3], %i1
define { i32, float } @ret_i32_float_pair(i32 %a0, i32 %a1,
i32* %p, float* %q) {
%r1 = load i32, i32* %p
@@ -279,7 +340,8 @@ define { i32, float } @ret_i32_float_pair(i32 %a0, i32 %a1,
; CHECK-LABEL: call_ret_i32_float_pair:
; CHECK: call ret_i32_float_pair
; CHECK: st %o0, [%i0]
-; CHECK: st %f2, [%i1]
+; HARD: st %f2, [%i1]
+; SOFT: st %o1, [%i1]
define void @call_ret_i32_float_pair(i32* %i0, float* %i1) {
%rv = call { i32, float } @ret_i32_float_pair(i32 undef, i32 undef,
i32* undef, float* undef)
@@ -293,7 +355,8 @@ define void @call_ret_i32_float_pair(i32* %i0, float* %i1) {
; This is a C struct, each member uses 4 bytes.
; CHECK-LABEL: ret_i32_float_packed:
; CHECK: ld [%i2], [[R:%[gilo][0-7]]]
-; CHECK: ld [%i3], %f1
+; HARD: ld [%i3], %f1
+; SOFT: ld [%i3], %i1
; CHECK: sllx [[R]], 32, %i0
define inreg { i32, float } @ret_i32_float_packed(i32 %a0, i32 %a1,
i32* %p, float* %q) {
@@ -309,7 +372,8 @@ define inreg { i32, float } @ret_i32_float_packed(i32 %a0, i32 %a1,
; CHECK: call ret_i32_float_packed
; CHECK: srlx %o0, 32, [[R:%[gilo][0-7]]]
; CHECK: st [[R]], [%i0]
-; CHECK: st %f1, [%i1]
+; HARD: st %f1, [%i1]
+; SOFT: st %o0, [%i1]
define void @call_ret_i32_float_packed(i32* %i0, float* %i1) {
%rv = call { i32, float } @ret_i32_float_packed(i32 undef, i32 undef,
i32* undef, float* undef)
@@ -413,13 +477,21 @@ entry:
declare i32 @use_buf(i32, i8*)
; CHECK-LABEL: test_fp128_args:
-; CHECK-DAG: std %f0, [%fp+{{.+}}]
-; CHECK-DAG: std %f2, [%fp+{{.+}}]
-; CHECK-DAG: std %f6, [%fp+{{.+}}]
-; CHECK-DAG: std %f4, [%fp+{{.+}}]
-; CHECK: add %fp, [[Offset:[0-9]+]], %o0
-; CHECK: call _Qp_add
-; CHECK: ldd [%fp+[[Offset]]], %f0
+; HARD-DAG: std %f0, [%fp+{{.+}}]
+; HARD-DAG: std %f2, [%fp+{{.+}}]
+; HARD-DAG: std %f6, [%fp+{{.+}}]
+; HARD-DAG: std %f4, [%fp+{{.+}}]
+; HARD: add %fp, [[Offset:[0-9]+]], %o0
+; HARD: call _Qp_add
+; HARD: ldd [%fp+[[Offset]]], %f0
+; SOFT-DAG: mov %i0, %o0
+; SOFT-DAG: mov %i1, %o1
+; SOFT-DAG: mov %i2, %o2
+; SOFT-DAG: mov %i3, %o3
+; SOFT: call __addtf3
+; SOFT: mov %o0, %i0
+; SOFT: mov %o1, %i1
+
define fp128 @test_fp128_args(fp128 %a, fp128 %b) {
entry:
%0 = fadd fp128 %a, %b
@@ -429,11 +501,14 @@ entry:
declare i64 @receive_fp128(i64 %a, ...)
; CHECK-LABEL: test_fp128_variable_args:
-; CHECK-DAG: std %f4, [%sp+[[Offset0:[0-9]+]]]
-; CHECK-DAG: std %f6, [%sp+[[Offset1:[0-9]+]]]
-; CHECK-DAG: ldx [%sp+[[Offset0]]], %o2
-; CHECK-DAG: ldx [%sp+[[Offset1]]], %o3
-; CHECK: call receive_fp128
+; HARD-DAG: std %f4, [%sp+[[Offset0:[0-9]+]]]
+; HARD-DAG: std %f6, [%sp+[[Offset1:[0-9]+]]]
+; HARD-DAG: ldx [%sp+[[Offset0]]], %o2
+; HARD-DAG: ldx [%sp+[[Offset1]]], %o3
+; SOFT-DAG: mov %i0, %o0
+; SOFT-DAG: mov %i1, %o1
+; SOFT-DAG: mov %i2, %o2
+; CHECK: call receive_fp128
define i64 @test_fp128_variable_args(i64 %a, fp128 %b) {
entry:
%0 = call i64 (i64, ...) @receive_fp128(i64 %a, fp128 %b)
@@ -441,14 +516,22 @@ entry:
}
; CHECK-LABEL: test_call_libfunc:
-; CHECK: st %f1, [%fp+[[Offset0:[0-9]+]]]
-; CHECK: fmovs %f3, %f1
-; CHECK: call cosf
-; CHECK: st %f0, [%fp+[[Offset1:[0-9]+]]]
-; CHECK: ld [%fp+[[Offset0]]], %f1
-; CHECK: call sinf
-; CHECK: ld [%fp+[[Offset1]]], %f1
-; CHECK: fmuls %f1, %f0, %f0
+; HARD: st %f1, [%fp+[[Offset0:[0-9]+]]]
+; HARD: fmovs %f3, %f1
+; SOFT: srl %i1, 0, %o0
+; CHECK: call cosf
+; HARD: st %f0, [%fp+[[Offset1:[0-9]+]]]
+; HARD: ld [%fp+[[Offset0]]], %f1
+; SOFT: mov %o0, %i1
+; SOFT: srl %i0, 0, %o0
+; CHECK: call sinf
+; HARD: ld [%fp+[[Offset1]]], %f1
+; HARD: fmuls %f1, %f0, %f0
+; SOFT: mov %o0, %i0
+; SOFT: mov %i1, %o0
+; SOFT: mov %i0, %o1
+; SOFT: call __mulsf3
+; SOFT: sllx %o0, 32, %i0
define inreg float @test_call_libfunc(float %arg0, float %arg1) {
entry:
@@ -460,5 +543,3 @@ entry:
declare inreg float @cosf(float %arg) readnone nounwind
declare inreg float @sinf(float %arg) readnone nounwind
-
-
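
The HARD/SOFT prefixes introduced above for 32abi.ll and 64abi.ll all check the same underlying property: with -mattr=soft-float, floating-point operations are lowered to compiler-rt libcalls instead of FPU instructions. A minimal sketch of that pattern, in the style of the soft-float.ll test added later in this patch (the RUN line and function name here are illustrative, not part of the patch):

; RUN: llc -march=sparc -mattr=soft-float -O0 < %s | FileCheck %s

; CHECK-LABEL: sketch_fadd:
; CHECK: call __adddf3
define double @sketch_fadd(double %a, double %b) {
  ; Under soft-float the fadd is lowered to a call to the __adddf3 runtime routine.
  %sum = fadd double %a, %b
  ret double %sum
}
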
diff --git a/test/CodeGen/SPARC/LeonFixCALLPassUT.ll b/test/CodeGen/SPARC/LeonFixCALLPassUT.ll
new file mode 100644
index 000000000000..697590be4066
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonFixCALLPassUT.ll
@@ -0,0 +1,20 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=at697e -o - | FileCheck %s -check-prefix=FIXCALL
+; RUN: llc %s -O0 -march=sparc -mcpu=leon2 -mattr=+fixcall -o - | FileCheck %s -check-prefix=FIXCALL
+
+; RUN: llc %s -O0 -march=sparc -mcpu=at697e -mattr=-fixcall -o - | FileCheck %s -check-prefix=NO_FIXCALL
+; RUN: llc %s -O0 -march=sparc -mcpu=leon2 -o - | FileCheck %s -check-prefix=NO_FIXCALL
+
+
+; FIXCALL-LABEL: immediate_call_test
+; FIXCALL: call 763288
+
+; NO_FIXCALL-LABEL: immediate_call_test
+; NO_FIXCALL: call 2047583640
+define void @immediate_call_test() nounwind {
+entry:
+ call void asm sideeffect "call $0", "i"(i32 2047583640) nounwind
+ ret void
+}
+
+
+
diff --git a/test/CodeGen/SPARC/LeonFixFSMULDPassUT.ll b/test/CodeGen/SPARC/LeonFixFSMULDPassUT.ll
new file mode 100755
index 000000000000..e2f2323a049c
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonFixFSMULDPassUT.ll
@@ -0,0 +1,31 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=ut699 -o - | FileCheck %s
+
+; CHECK-LABEL: test_fix_fsmuld_1
+; CHECK: fstod %f20, %f2
+; CHECK: fstod %f21, %f3
+; CHECK: fmuld %f2, %f3, %f8
+; CHECK: fstod %f20, %f0
+define double @test_fix_fsmuld_1() {
+entry:
+ %a = alloca float, align 4
+ %b = alloca float, align 4
+ store float 0x402ECCCCC0000000, float* %a, align 4
+ store float 0x4022333340000000, float* %b, align 4
+ %0 = load float, float* %b, align 4
+ %1 = load float, float* %a, align 4
+ %mul = tail call double asm sideeffect "fsmuld $0, $1, $2", "={f20},{f21},{f8}"(float* %a, float* %b)
+
+ ret double %mul
+}
+
+; CHECK-LABEL: test_fix_fsmuld_2
+; CHECK: fstod %f20, %f2
+; CHECK: fstod %f21, %f3
+; CHECK: fmuld %f2, %f3, %f8
+; CHECK: fstod %f20, %f0
+define double @test_fix_fsmuld_2(float* %a, float* %b) {
+entry:
+ %mul = tail call double asm sideeffect "fsmuld $0, $1, $2", "={f20},{f21},{f8}"(float* %a, float* %b)
+
+ ret double %mul
+}
diff --git a/test/CodeGen/SPARC/LeonInsertNOPLoad.ll b/test/CodeGen/SPARC/LeonInsertNOPLoad.ll
new file mode 100644
index 000000000000..315fc85fca38
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonInsertNOPLoad.ll
@@ -0,0 +1,13 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=ut699 -o - | FileCheck %s
+
+; CHECK: ld [%o0+%lo(.LCPI0_0)], %f0
+; CHECK-NEXT: nop
+
+
+define float @X() #0 {
+entry:
+ %f = alloca float, align 4
+ store float 0x3FF3C08320000000, float* %f, align 4
+ %0 = load float, float* %f, align 4
+ ret float %0
+}
diff --git a/test/CodeGen/SPARC/LeonInsertNOPLoadPassUT.ll b/test/CodeGen/SPARC/LeonInsertNOPLoadPassUT.ll
new file mode 100755
index 000000000000..57ae16227e7d
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonInsertNOPLoadPassUT.ll
@@ -0,0 +1,43 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=ut699 -o - | FileCheck %s
+; RUN: llc %s -O0 -march=sparc -mcpu=leon3 -mattr=+insertnopload -o - | FileCheck %s
+
+; CHECK-LABEL: ld_float_test
+; CHECK: ld [%o0+%lo(.LCPI0_0)], %f0
+; CHECK-NEXT: nop
+define float @ld_float_test() #0 {
+entry:
+ %f = alloca float, align 4
+ store float 0x3FF3C08320000000, float* %f, align 4
+ %0 = load float, float* %f, align 4
+ ret float %0
+}
+
+; CHECK-LABEL: ld_i32_test
+; CHECK: ld [%o0], %o0
+; CHECK-NEXT: nop
+define i32 @ld_i32_test(i32 *%p) {
+ %res = load i32, i32* %p
+ ret i32 %res
+}
+
+; CHECK-LABEL: ld_inlineasm_test_1
+; CHECK: ld [%o0], %o0
+; CHECK-NEXT: !NO_APP
+; CHECK-NEXT: nop
+define float @ld_inlineasm_test_1(float* %a) {
+entry:
+ %res = tail call float asm sideeffect "ld [$1], $0", "=r,r"(float* %a)
+
+ ret float %res
+}
+
+; CHECK-LABEL: ld_inlineasm_test_2
+; CHECK: ld [%o0], %o0
+; CHECK-NEXT: !NO_APP
+; CHECK-NEXT: nop
+define i32 @ld_inlineasm_test_2(i32* %a) {
+entry:
+ %res = tail call i32 asm sideeffect "ld [$1], $0", "=r,r"(i32* %a)
+
+ ret i32 %res
+}
\ No newline at end of file
diff --git a/test/CodeGen/SPARC/LeonInsertNOPsDoublePrecision.ll b/test/CodeGen/SPARC/LeonInsertNOPsDoublePrecision.ll
new file mode 100644
index 000000000000..0ee3d9071a99
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonInsertNOPsDoublePrecision.ll
@@ -0,0 +1,17 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=at697f -o - | FileCheck %s
+
+; CHECK: ldd
+; CHECK: ldd
+; CHECK-NEXT: nop
+
+define double @mult() #0 {
+entry:
+ %x = alloca double, align 8
+ %y = alloca double, align 8
+ store double 3.141590e+00, double* %x, align 8
+ store double 1.234560e+00, double* %y, align 8
+ %0 = load double, double* %x, align 8
+ %1 = load double, double* %y, align 8
+ %mul = fmul double %0, %1
+ ret double %mul
+}
diff --git a/test/CodeGen/SPARC/LeonItinerariesUT.ll b/test/CodeGen/SPARC/LeonItinerariesUT.ll
new file mode 100644
index 000000000000..87e0c4621c08
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonItinerariesUT.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -O1 -march=sparc | FileCheck %s -check-prefix=NO_ITIN
+; RUN: llc < %s -O1 -march=sparc -mcpu=leon2 | FileCheck %s -check-prefix=LEON2_ITIN
+; RUN: llc < %s -O1 -march=sparc -mcpu=leon3 | FileCheck %s -check-prefix=LEON3_4_ITIN
+; RUN: llc < %s -O1 -march=sparc -mcpu=leon4 | FileCheck %s -check-prefix=LEON3_4_ITIN
+
+; NO_ITIN-LABEL: f32_ops:
+; NO_ITIN: ld
+; NO_ITIN-NEXT: ld
+; NO_ITIN-NEXT: ld
+; NO_ITIN-NEXT: ld
+; NO_ITIN-NEXT: fadds
+; NO_ITIN-NEXT: fsubs
+; NO_ITIN-NEXT: fmuls
+; NO_ITIN-NEXT: retl
+; NO_ITIN-NEXT: fdivs
+
+; LEON2_ITIN-LABEL: f32_ops:
+; LEON2_ITIN: ld
+; LEON2_ITIN-NEXT: ld
+; LEON2_ITIN-NEXT: fadds
+; LEON2_ITIN-NEXT: ld
+; LEON2_ITIN-NEXT: fsubs
+; LEON2_ITIN-NEXT: ld
+; LEON2_ITIN-NEXT: fmuls
+; LEON2_ITIN-NEXT: retl
+; LEON2_ITIN-NEXT: fdivs
+
+; LEON3_4_ITIN-LABEL: f32_ops:
+; LEON3_4_ITIN: ld
+; LEON3_4_ITIN-NEXT: ld
+; LEON3_4_ITIN-NEXT: ld
+; LEON3_4_ITIN-NEXT: fadds
+; LEON3_4_ITIN-NEXT: ld
+; LEON3_4_ITIN-NEXT: fsubs
+; LEON3_4_ITIN-NEXT: fmuls
+; LEON3_4_ITIN-NEXT: retl
+; LEON3_4_ITIN-NEXT: fdivs
+
+define float @f32_ops(float* byval %a, float* byval %b, float* byval %c, float* byval %d) {
+entry:
+ %0 = load float, float* %a, align 8
+ %1 = load float, float* %b, align 8
+ %2 = load float, float* %c, align 8
+ %3 = load float, float* %d, align 8
+ %4 = fadd float %0, %1
+ %5 = fsub float %4, %2
+ %6 = fmul float %5, %3
+ %7 = fdiv float %6, %4
+ ret float %7
+}
\ No newline at end of file
diff --git a/test/CodeGen/SPARC/LeonPreventRoundChangePassUT.ll b/test/CodeGen/SPARC/LeonPreventRoundChangePassUT.ll
new file mode 100644
index 000000000000..07172fdb9451
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonPreventRoundChangePassUT.ll
@@ -0,0 +1,65 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=ut699 -o - | FileCheck %s -check-prefix=NO_ROUND_FUNC
+; RUN: llc %s -O0 -march=sparc -mcpu=leon3 -mattr=+prvntroundchange -o - | FileCheck %s -check-prefix=NO_ROUND_FUNC
+
+; RUN: llc %s -O0 -march=sparc -mcpu=ut699 -mattr=-prvntroundchange -o - | FileCheck %s -check-prefix=ROUND_FUNC
+; RUN: llc %s -O0 -march=sparc -mcpu=leon3 -o - | FileCheck %s -check-prefix=ROUND_FUNC
+
+
+; NO_ROUND_FUNC-LABEL: test_round_change
+; NO_ROUND_FUNC-NOT: fesetround
+
+; ROUND_FUNC-LABEL: test_round_change
+; ROUND_FUNC: fesetround
+
+; ModuleID = '<stdin>'
+target datalayout = "E-m:e-p:32:32-i64:64-f128:64-n32-S64"
+target triple = "sparc-unknown--eabi"
+
+@.str = private unnamed_addr constant [17 x i8] c"-((-a)*b) != a*b\00", align 1
+@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", align 1
+@__PRETTY_FUNCTION__.mult = private unnamed_addr constant [12 x i8] c"void mult()\00", align 1
+
+; Function Attrs: nounwind
+define void @test_round_change() #0 {
+entry:
+ %a = alloca double, align 8
+ %b = alloca double, align 8
+ %x = alloca float, align 4
+ store double 1.100000e+00, double* %a, align 8
+ store double 1.010000e+01, double* %b, align 8
+ store float 0x400921FA00000000, float* %x, align 4
+ %call = call i32 @fesetround(i32 2048) #2
+ %0 = load double, double* %a, align 8
+ %sub = fsub double -0.000000e+00, %0
+ %1 = load double, double* %b, align 8
+ %mul = fmul double %sub, %1
+ %sub1 = fsub double -0.000000e+00, %mul
+ %2 = load double, double* %a, align 8
+ %3 = load double, double* %b, align 8
+ %mul2 = fmul double %2, %3
+ %cmp = fcmp une double %sub1, %mul2
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ call void @__assert_fail(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str.1, i32 0, i32 0), i32 10, i8* getelementptr inbounds ([12 x i8], [12 x i8]* @__PRETTY_FUNCTION__.mult, i32 0, i32 0)) #3
+ unreachable
+ ; No predecessors!
+ br label %cond.end
+
+cond.end: ; preds = %4, %cond.true
+ ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @fesetround(i32) #0
+
+; Function Attrs: noreturn nounwind
+declare void @__assert_fail(i8*, i8*, i32, i8*) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+attributes #3 = { noreturn nounwind }
\ No newline at end of file
diff --git a/test/CodeGen/SPARC/LeonReplaceFMULSPassUT.ll b/test/CodeGen/SPARC/LeonReplaceFMULSPassUT.ll
new file mode 100755
index 000000000000..7d0950cb1c88
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonReplaceFMULSPassUT.ll
@@ -0,0 +1,19 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=ut699 -o - | FileCheck %s
+
+; CHECK-LABEL: fmuls_fix_test
+; CHECK: fstod %f20, %f2
+; CHECK: fstod %f21, %f3
+; CHECK: fmuld %f2, %f3, %f8
+; CHECK: fstod %f20, %f0
+define double @fmuls_fix_test() {
+entry:
+ %a = alloca float, align 4
+ %b = alloca float, align 4
+ store float 0x402ECCCCC0000000, float* %a, align 4
+ store float 0x4022333340000000, float* %b, align 4
+ %0 = load float, float* %b, align 4
+ %1 = load float, float* %a, align 4
+ %mul = tail call double asm sideeffect "fmuls $0, $1, $2", "={f20},{f21},{f8}"(float* %a, float* %b)
+
+ ret double %mul
+}
diff --git a/test/CodeGen/SPARC/LeonReplaceSDIVPassUT.ll b/test/CodeGen/SPARC/LeonReplaceSDIVPassUT.ll
new file mode 100644
index 000000000000..67232d777039
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonReplaceSDIVPassUT.ll
@@ -0,0 +1,9 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=at697e -o - | FileCheck %s
+
+; CHECK: sdivcc %o0, %o1, %o0
+
+define i32 @lbr59(i32 %a, i32 %b)
+{
+ %r = sdiv i32 %a, %b
+ ret i32 %r
+}
diff --git a/test/CodeGen/SPARC/LeonSMACUMACInstructionUT.ll b/test/CodeGen/SPARC/LeonSMACUMACInstructionUT.ll
new file mode 100755
index 000000000000..281113b58a05
--- /dev/null
+++ b/test/CodeGen/SPARC/LeonSMACUMACInstructionUT.ll
@@ -0,0 +1,20 @@
+; RUN: llc %s -O0 -march=sparc -mcpu=leon2 -o - | FileCheck %s
+; RUN: llc %s -O0 -march=sparc -mcpu=leon3 -o - | FileCheck %s
+; RUN: llc %s -O0 -march=sparc -mcpu=leon4 -o - | FileCheck %s
+
+; CHECK-LABEL: smac_test:
+; CHECK: smac %o1, %o0, %o0
+define i32 @smac_test(i16* %a, i16* %b) {
+entry:
+; %0 = tail call i32 asm sideeffect "smac $2, $1, $0", "={r2},{r3},{r4}"(i16* %a, i16* %b)
+ %0 = tail call i32 asm sideeffect "smac $2, $1, $0", "=r,rI,r"(i16* %a, i16* %b)
+ ret i32 %0
+}
+
+; CHECK-LABEL: umac_test:
+; CHECK: umac %o1, %o0, %o0
+define i32 @umac_test(i16* %a, i16* %b) {
+entry:
+ %0 = tail call i32 asm sideeffect "umac $2, $1, $0", "=r,rI,r"(i16* %a, i16* %b)
+ ret i32 %0
+}
diff --git a/test/CodeGen/SPARC/atomics.ll b/test/CodeGen/SPARC/atomics.ll
index bea9a3374696..5e608e728c37 100644
--- a/test/CodeGen/SPARC/atomics.ll
+++ b/test/CodeGen/SPARC/atomics.ll
@@ -1,5 +1,37 @@
; RUN: llc < %s -march=sparcv9 -verify-machineinstrs | FileCheck %s
+; CHECK-LABEL: test_atomic_i8
+; CHECK: ldub [%o0]
+; CHECK: membar
+; CHECK: ldub [%o1]
+; CHECK: membar
+; CHECK: membar
+; CHECK: stb {{.+}}, [%o2]
+define i8 @test_atomic_i8(i8* %ptr1, i8* %ptr2, i8* %ptr3) {
+entry:
+ %0 = load atomic i8, i8* %ptr1 acquire, align 1
+ %1 = load atomic i8, i8* %ptr2 acquire, align 1
+ %2 = add i8 %0, %1
+ store atomic i8 %2, i8* %ptr3 release, align 1
+ ret i8 %2
+}
+
+; CHECK-LABEL: test_atomic_i16
+; CHECK: lduh [%o0]
+; CHECK: membar
+; CHECK: lduh [%o1]
+; CHECK: membar
+; CHECK: membar
+; CHECK: sth {{.+}}, [%o2]
+define i16 @test_atomic_i16(i16* %ptr1, i16* %ptr2, i16* %ptr3) {
+entry:
+ %0 = load atomic i16, i16* %ptr1 acquire, align 2
+ %1 = load atomic i16, i16* %ptr2 acquire, align 2
+ %2 = add i16 %0, %1
+ store atomic i16 %2, i16* %ptr3 release, align 2
+ ret i16 %2
+}
+
; CHECK-LABEL: test_atomic_i32
; CHECK: ld [%o0]
; CHECK: membar
@@ -9,10 +41,10 @@
; CHECK: st {{.+}}, [%o2]
define i32 @test_atomic_i32(i32* %ptr1, i32* %ptr2, i32* %ptr3) {
entry:
- %0 = load atomic i32, i32* %ptr1 acquire, align 8
- %1 = load atomic i32, i32* %ptr2 acquire, align 8
+ %0 = load atomic i32, i32* %ptr1 acquire, align 4
+ %1 = load atomic i32, i32* %ptr2 acquire, align 4
%2 = add i32 %0, %1
- store atomic i32 %2, i32* %ptr3 release, align 8
+ store atomic i32 %2, i32* %ptr3 release, align 4
ret i32 %2
}
@@ -32,6 +64,90 @@ entry:
ret i64 %2
}
+;; TODO: the "move %icc" and related instructions are totally
+;; redundant here. There's something weird happening in optimization
+;; of the success value of cmpxchg.
+
+; CHECK-LABEL: test_cmpxchg_i8
+; CHECK: and %o1, -4, %o2
+; CHECK: mov 3, %o3
+; CHECK: andn %o3, %o1, %o1
+; CHECK: sll %o1, 3, %o1
+; CHECK: mov 255, %o3
+; CHECK: sll %o3, %o1, %o5
+; CHECK: xor %o5, -1, %o3
+; CHECK: mov 123, %o4
+; CHECK: ld [%o2], %g2
+; CHECK: sll %o4, %o1, %o4
+; CHECK: and %o0, 255, %o0
+; CHECK: sll %o0, %o1, %o0
+; CHECK: andn %g2, %o5, %g2
+; CHECK: sethi 0, %o5
+; CHECK: [[LABEL1:\.L.*]]:
+; CHECK: or %g2, %o4, %g3
+; CHECK: or %g2, %o0, %g4
+; CHECK: cas [%o2], %g4, %g3
+; CHECK: cmp %g3, %g4
+; CHECK: mov %o5, %g4
+; CHECK: move %icc, 1, %g4
+; CHECK: cmp %g4, 0
+; CHECK: bne [[LABEL2:\.L.*]]
+; CHECK: nop
+; CHECK: and %g3, %o3, %g4
+; CHECK: cmp %g2, %g4
+; CHECK: bne [[LABEL1]]
+; CHECK: mov %g4, %g2
+; CHECK: [[LABEL2]]:
+; CHECK: retl
+; CHECK: srl %g3, %o1, %o0
+define i8 @test_cmpxchg_i8(i8 %a, i8* %ptr) {
+entry:
+ %pair = cmpxchg i8* %ptr, i8 %a, i8 123 monotonic monotonic
+ %b = extractvalue { i8, i1 } %pair, 0
+ ret i8 %b
+}
+
+; CHECK-LABEL: test_cmpxchg_i16
+
+; CHECK: and %o1, -4, %o2
+; CHECK: and %o1, 3, %o1
+; CHECK: xor %o1, 2, %o1
+; CHECK: sll %o1, 3, %o1
+; CHECK: sethi 63, %o3
+; CHECK: or %o3, 1023, %o4
+; CHECK: sll %o4, %o1, %o5
+; CHECK: xor %o5, -1, %o3
+; CHECK: and %o0, %o4, %o4
+; CHECK: ld [%o2], %g2
+; CHECK: mov 123, %o0
+; CHECK: sll %o0, %o1, %o0
+; CHECK: sll %o4, %o1, %o4
+; CHECK: andn %g2, %o5, %g2
+; CHECK: sethi 0, %o5
+; CHECK: [[LABEL1:\.L.*]]:
+; CHECK: or %g2, %o0, %g3
+; CHECK: or %g2, %o4, %g4
+; CHECK: cas [%o2], %g4, %g3
+; CHECK: cmp %g3, %g4
+; CHECK: mov %o5, %g4
+; CHECK: move %icc, 1, %g4
+; CHECK: cmp %g4, 0
+; CHECK: bne [[LABEL2:\.L.*]]
+; CHECK: nop
+; CHECK: and %g3, %o3, %g4
+; CHECK: cmp %g2, %g4
+; CHECK: bne [[LABEL1]]
+; CHECK: mov %g4, %g2
+; CHECK: [[LABEL2]]:
+; CHECK: retl
+; CHECK: srl %g3, %o1, %o0
+define i16 @test_cmpxchg_i16(i16 %a, i16* %ptr) {
+entry:
+ %pair = cmpxchg i16* %ptr, i16 %a, i16 123 monotonic monotonic
+ %b = extractvalue { i16, i1 } %pair, 0
+ ret i16 %b
+}
+
; CHECK-LABEL: test_cmpxchg_i32
; CHECK: mov 123, [[R:%[gilo][0-7]]]
; CHECK: cas [%o1], %o0, [[R]]
@@ -54,6 +170,26 @@ entry:
ret i64 %b
}
+; CHECK-LABEL: test_swap_i8
+; CHECK: mov 42, [[R:%[gilo][0-7]]]
+; CHECK: cas
+
+define i8 @test_swap_i8(i8 %a, i8* %ptr) {
+entry:
+ %b = atomicrmw xchg i8* %ptr, i8 42 monotonic
+ ret i8 %b
+}
+
+; CHECK-LABEL: test_swap_i16
+; CHECK: mov 42, [[R:%[gilo][0-7]]]
+; CHECK: cas
+
+define i16 @test_swap_i16(i16 %a, i16* %ptr) {
+entry:
+ %b = atomicrmw xchg i16* %ptr, i16 42 monotonic
+ ret i16 %b
+}
+
; CHECK-LABEL: test_swap_i32
; CHECK: mov 42, [[R:%[gilo][0-7]]]
; CHECK: swap [%o1], [[R]]
@@ -73,12 +209,36 @@ entry:
ret i64 %b
}
-; CHECK-LABEL: test_load_add_32
+; CHECK-LABEL: test_load_sub_i8
+; CHECK: membar
+; CHECK: .L{{.*}}:
+; CHECK: sub
+; CHECK: cas [{{%[gilo][0-7]}}]
+; CHECK: membar
+define zeroext i8 @test_load_sub_i8(i8* %p, i8 zeroext %v) {
+entry:
+ %0 = atomicrmw sub i8* %p, i8 %v seq_cst
+ ret i8 %0
+}
+
+; CHECK-LABEL: test_load_sub_i16
+; CHECK: membar
+; CHECK: .L{{.*}}:
+; CHECK: sub
+; CHECK: cas [{{%[gilo][0-7]}}]
+; CHECK: membar
+define zeroext i16 @test_load_sub_i16(i16* %p, i16 zeroext %v) {
+entry:
+ %0 = atomicrmw sub i16* %p, i16 %v seq_cst
+ ret i16 %0
+}
+
+; CHECK-LABEL: test_load_add_i32
; CHECK: membar
; CHECK: add [[V:%[gilo][0-7]]], %o1, [[U:%[gilo][0-7]]]
; CHECK: cas [%o0], [[V]], [[U]]
; CHECK: membar
-define zeroext i32 @test_load_add_32(i32* %p, i32 zeroext %v) {
+define zeroext i32 @test_load_add_i32(i32* %p, i32 zeroext %v) {
entry:
%0 = atomicrmw add i32* %p, i32 %v seq_cst
ret i32 %0
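
The TODO above about the redundant "move %icc" sequence concerns the i1 success result of cmpxchg: the expansion materializes it even though test_cmpxchg_i8 and test_cmpxchg_i16 only extract element 0 of the result pair. A minimal sketch of the two possible uses of that pair (function names are illustrative, not part of the patch):

define i8 @sketch_cmpxchg_old(i8 %a, i8* %ptr) {
  %pair = cmpxchg i8* %ptr, i8 %a, i8 123 monotonic monotonic
  ; Only the old value is used, so the success flag is dead here.
  %old = extractvalue { i8, i1 } %pair, 0
  ret i8 %old
}

define i1 @sketch_cmpxchg_success(i8 %a, i8* %ptr) {
  %pair = cmpxchg i8* %ptr, i8 %a, i8 123 monotonic monotonic
  ; Here the success flag really is needed, so materializing it is not redundant.
  %ok = extractvalue { i8, i1 } %pair, 1
  ret i1 %ok
}
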
diff --git a/test/CodeGen/SPARC/float.ll b/test/CodeGen/SPARC/float.ll
index c4cc04420ad7..248e98549c9f 100644
--- a/test/CodeGen/SPARC/float.ll
+++ b/test/CodeGen/SPARC/float.ll
@@ -1,11 +1,13 @@
-; RUN: llc -march=sparc < %s | FileCheck %s -check-prefix=V8
+; RUN: llc -march=sparc < %s | FileCheck %s -check-prefix=V8 -check-prefix=V8-BE
+; RUN: llc -march=sparcel < %s | FileCheck %s -check-prefix=V8 -check-prefix=V8-EL
; RUN: llc -march=sparc -O0 < %s | FileCheck %s -check-prefix=V8-UNOPT
; RUN: llc -march=sparc -mattr=v9 < %s | FileCheck %s -check-prefix=V9
; RUN: llc -mtriple=sparc64-unknown-linux < %s | FileCheck %s -check-prefix=SPARC64
; V8-LABEL: test_neg:
; V8: call get_double
-; V8: fnegs %f0, %f0
+; V8-BE: fnegs %f0, %f0
+; V8-EL: fnegs %f1, %f1
; V8-UNOPT-LABEL: test_neg:
; V8-UNOPT: fnegs
@@ -27,7 +29,8 @@ entry:
}
; V8-LABEL: test_abs:
-; V8: fabss %f0, %f0
+; V8-BE: fabss %f0, %f0
+; V8-EL: fabss %f1, %f1
; V8-UNOPT-LABEL: test_abs:
; V8-UNOPT: fabss
diff --git a/test/CodeGen/SPARC/fp128.ll b/test/CodeGen/SPARC/fp128.ll
index e0eaf93a733e..bcc013b73575 100644
--- a/test/CodeGen/SPARC/fp128.ll
+++ b/test/CodeGen/SPARC/fp128.ll
@@ -1,30 +1,24 @@
-; RUN: llc < %s -march=sparc -mattr=hard-quad-float | FileCheck %s --check-prefix=HARD
-; RUN: llc < %s -march=sparc -mattr=-hard-quad-float | FileCheck %s --check-prefix=SOFT
+; RUN: llc < %s -march=sparc -mattr=hard-quad-float | FileCheck %s --check-prefix=CHECK --check-prefix=HARD --check-prefix=BE
+; RUN: llc < %s -march=sparcel -mattr=hard-quad-float | FileCheck %s --check-prefix=CHECK --check-prefix=HARD --check-prefix=EL
+; RUN: llc < %s -march=sparc -mattr=-hard-quad-float | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT --check-prefix=BE
+; RUN: llc < %s -march=sparcel -mattr=-hard-quad-float | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT --check-prefix=EL
-; HARD-LABEL: f128_ops
-; HARD: ldd
-; HARD: ldd
-; HARD: ldd
-; HARD: ldd
+; CHECK-LABEL: f128_ops:
+; CHECK: ldd
+; CHECK: ldd
+; CHECK: ldd
+; CHECK: ldd
; HARD: faddq [[R0:.+]], [[R1:.+]], [[R2:.+]]
; HARD: fsubq [[R2]], [[R3:.+]], [[R4:.+]]
; HARD: fmulq [[R4]], [[R5:.+]], [[R6:.+]]
; HARD: fdivq [[R6]], [[R2]]
-; HARD: std
-; HARD: std
-
-; SOFT-LABEL: f128_ops
-; SOFT: ldd
-; SOFT: ldd
-; SOFT: ldd
-; SOFT: ldd
; SOFT: call _Q_add
; SOFT: call _Q_sub
; SOFT: call _Q_mul
; SOFT: call _Q_div
-; SOFT: std
-; SOFT: std
+; CHECK: std
+; CHECK: std
define void @f128_ops(fp128* noalias sret %scalar.result, fp128* byval %a, fp128* byval %b, fp128* byval %c, fp128* byval %d) {
entry:
@@ -40,19 +34,12 @@ entry:
ret void
}
-; HARD-LABEL: f128_spill
-; HARD: std %f{{.+}}, [%[[S0:.+]]]
-; HARD: std %f{{.+}}, [%[[S1:.+]]]
-; HARD-DAG: ldd [%[[S0]]], %f{{.+}}
-; HARD-DAG: ldd [%[[S1]]], %f{{.+}}
-; HARD: jmp {{%[oi]7}}+12
-
-; SOFT-LABEL: f128_spill
-; SOFT: std %f{{.+}}, [%[[S0:.+]]]
-; SOFT: std %f{{.+}}, [%[[S1:.+]]]
-; SOFT-DAG: ldd [%[[S0]]], %f{{.+}}
-; SOFT-DAG: ldd [%[[S1]]], %f{{.+}}
-; SOFT: jmp {{%[oi]7}}+12
+; CHECK-LABEL: f128_spill:
+; CHECK: std %f{{.+}}, [%[[S0:.+]]]
+; CHECK: std %f{{.+}}, [%[[S1:.+]]]
+; CHECK-DAG: ldd [%[[S0]]], %f{{.+}}
+; CHECK-DAG: ldd [%[[S1]]], %f{{.+}}
+; CHECK: jmp {{%[oi]7}}+12
define void @f128_spill(fp128* noalias sret %scalar.result, fp128* byval %a) {
entry:
@@ -62,11 +49,9 @@ entry:
ret void
}
-; HARD-LABEL: f128_compare
+; CHECK-LABEL: f128_compare:
; HARD: fcmpq
; HARD-NEXT: nop
-
-; SOFT-LABEL: f128_compare
; SOFT: _Q_cmp
define i32 @f128_compare(fp128* byval %f0, fp128* byval %f1, i32 %a, i32 %b) {
@@ -78,11 +63,9 @@ entry:
ret i32 %ret
}
-; HARD-LABEL: f128_compare2
-; HARD: fcmpq
-; HARD: fb{{ule|g}}
-
-; SOFT-LABEL: f128_compare2
+; CHECK-LABEL: f128_compare2:
+; HARD: fcmpq
+; HARD: fb{{ule|g}}
; SOFT: _Q_cmp
; SOFT: cmp
@@ -99,11 +82,11 @@ entry:
}
-; HARD-LABEL: f128_abs
-; HARD: fabss
-
-; SOFT-LABEL: f128_abs
-; SOFT: fabss
+; CHECK-LABEL: f128_abs:
+; CHECK: ldd [%o0], %f0
+; CHECK: ldd [%o0+8], %f2
+; BE: fabss %f0, %f0
+; EL: fabss %f3, %f3
define void @f128_abs(fp128* noalias sret %scalar.result, fp128* byval %a) {
entry:
@@ -115,10 +98,8 @@ entry:
declare fp128 @llvm.fabs.f128(fp128) nounwind readonly
-; HARD-LABEL: int_to_f128
+; CHECK-LABEL: int_to_f128:
; HARD: fitoq
-
-; SOFT-LABEL: int_to_f128
; SOFT: _Q_itoq
define void @int_to_f128(fp128* noalias sret %scalar.result, i32 %i) {
@@ -128,17 +109,12 @@ entry:
ret void
}
-; HARD-LABEL: fp128_unaligned
-; HARD: ldub
-; HARD: faddq
-; HARD: stb
-; HARD: ret
-
-; SOFT-LABEL: fp128_unaligned
-; SOFT: ldub
+; CHECK-LABEL: fp128_unaligned:
+; CHECK: ldub
+; HARD: faddq
; SOFT: call _Q_add
-; SOFT: stb
-; SOFT: ret
+; CHECK: stb
+; CHECK: ret
define void @fp128_unaligned(fp128* %a, fp128* %b, fp128* %c) {
entry:
@@ -149,10 +125,8 @@ entry:
ret void
}
-; HARD-LABEL: uint_to_f128
+; CHECK-LABEL: uint_to_f128:
; HARD: fdtoq
-
-; SOFT-LABEL: uint_to_f128
; SOFT: _Q_utoq
define void @uint_to_f128(fp128* noalias sret %scalar.result, i32 %i) {
@@ -162,11 +136,9 @@ entry:
ret void
}
-; HARD-LABEL: f128_to_i32
+; CHECK-LABEL: f128_to_i32:
; HARD: fqtoi
; HARD: fqtoi
-
-; SOFT-LABEL: f128_to_i32
; SOFT: call _Q_qtou
; SOFT: call _Q_qtoi
@@ -181,13 +153,11 @@ entry:
ret i32 %4
}
-; HARD-LABEL: test_itoq_qtoi
+; CHECK-LABEL: test_itoq_qtoi
; HARD-DAG: call _Q_lltoq
; HARD-DAG: call _Q_qtoll
; HARD-DAG: fitoq
; HARD-DAG: fqtoi
-
-; SOFT-LABEL: test_itoq_qtoi
; SOFT-DAG: call _Q_lltoq
; SOFT-DAG: call _Q_qtoll
; SOFT-DAG: call _Q_itoq
@@ -209,15 +179,11 @@ entry:
ret void
}
-; HARD-LABEL: test_utoq_qtou
-; HARD-DAG: call _Q_ulltoq
-; HARD-DAG: call _Q_qtoull
+; CHECK-LABEL: test_utoq_qtou:
+; CHECK-DAG: call _Q_ulltoq
+; CHECK-DAG: call _Q_qtoull
; HARD-DAG: fdtoq
; HARD-DAG: fqtoi
-
-; SOFT-LABEL: test_utoq_qtou
-; SOFT-DAG: call _Q_ulltoq
-; SOFT-DAG: call _Q_qtoull
; SOFT-DAG: call _Q_utoq
; SOFT-DAG: call _Q_qtou
@@ -237,8 +203,11 @@ entry:
ret void
}
-; SOFT-LABEL: f128_neg
-; SOFT: fnegs
+; CHECK-LABEL: f128_neg:
+; CHECK: ldd [%o0], %f0
+; CHECK: ldd [%o0+8], %f2
+; BE: fnegs %f0, %f0
+; EL: fnegs %f3, %f3
define void @f128_neg(fp128* noalias sret %scalar.result, fp128* byval %a) {
entry:
diff --git a/test/CodeGen/SPARC/func-addr.ll b/test/CodeGen/SPARC/func-addr.ll
new file mode 100644
index 000000000000..3d1cd9c30690
--- /dev/null
+++ b/test/CodeGen/SPARC/func-addr.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -march=sparc -relocation-model=static -code-model=small | FileCheck --check-prefix=abs32 %s
+; RUN: llc < %s -march=sparcv9 -relocation-model=static -code-model=small | FileCheck --check-prefix=abs32 %s
+; RUN: llc < %s -march=sparcv9 -relocation-model=static -code-model=medium | FileCheck --check-prefix=abs44 %s
+; RUN: llc < %s -march=sparcv9 -relocation-model=static -code-model=large | FileCheck --check-prefix=abs64 %s
+; RUN: llc < %s -march=sparc -relocation-model=pic -code-model=medium | FileCheck --check-prefix=v8pic32 %s
+; RUN: llc < %s -march=sparcv9 -relocation-model=pic -code-model=medium | FileCheck --check-prefix=v9pic32 %s
+
+define void @func1() #0 {
+entry:
+ ret void
+}
+
+define void @test() #0 {
+entry:
+ %pFunc = alloca void (...)*, align 4
+ store void (...)* bitcast (void ()* @func1 to void (...)*), void (...)** %pFunc, align 4
+ %0 = load void (...)*, void (...)** %pFunc, align 4
+ %callee.knr.cast = bitcast void (...)* %0 to void ()*
+ call void %callee.knr.cast()
+
+; abs32-LABEL: test
+; abs32: sethi %hi(func1), %i0
+; abs32: add %i0, %lo(func1), %i1
+; abs32: call %i0+%lo(func1)
+
+; abs44-LABEL: test
+; abs44: sethi %h44(func1), %i0
+; abs44: add %i0, %m44(func1), %i0
+; abs44: sllx %i0, 12, %i0
+; abs44: add %i0, %l44(func1), %i1
+; abs44: call %i0+%l44(func1)
+
+; abs64-LABEL: test
+; abs64: sethi %hi(func1), %i0
+; abs64: add %i0, %lo(func1), %i0
+; abs64: sethi %hh(func1), %i1
+; abs64: add %i1, %hm(func1), %i1
+
+; v8pic32-LABEL: test
+; v8pic32: sethi %hi(func1), %i1
+; v8pic32: add %i1, %lo(func1), %i1
+; v8pic32: ld [%i0+%i1], %i0
+
+; v9pic32-LABEL: test
+; v9pic32: sethi %hi(func1), %i1
+; v9pic32: add %i1, %lo(func1), %i1
+; v9pic32: ldx [%i0+%i1], %i0
+; v9pic32: call %i0
+
+ ret void
+}
diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll
index d54c5c6bc780..af631f0d29f5 100644
--- a/test/CodeGen/SPARC/inlineasm.ll
+++ b/test/CodeGen/SPARC/inlineasm.ll
@@ -8,6 +8,18 @@ entry:
ret i32 %0
}
+;; Check only that the constraints are accepted without a compiler failure.
+; CHECK-LABEL: test_constraints_nro:
+%struct.anon = type { i32, i32 }
+@v = external global %struct.anon, align 4
+define void @test_constraints_nro() {
+entry:
+ %0 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @v, i32 0, i32 0);
+ %1 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @v, i32 0, i32 1);
+ tail call void asm sideeffect "", "nro,nro"(i32 %0, i32 %1)
+ ret void
+}
+
; CHECK-LABEL: test_constraint_I:
; CHECK: add %o0, 1023, %o0
define i32 @test_constraint_I(i32 %a) {
diff --git a/test/CodeGen/SPARC/missinglabel.ll b/test/CodeGen/SPARC/missinglabel.ll
index bcf384b7ad29..3626feee4c38 100644
--- a/test/CodeGen/SPARC/missinglabel.ll
+++ b/test/CodeGen/SPARC/missinglabel.ll
@@ -4,14 +4,14 @@ target triple = "sparc64-unknown-linux-gnu"
define void @f() align 2 {
entry:
-; CHECK: %xcc, .LBB0_1
+; CHECK: %xcc, .LBB0_2
%cmp = icmp eq i64 undef, 0
br i1 %cmp, label %targetblock, label %cond.false
cond.false:
unreachable
-; CHECK: .LBB0_1: ! %targetblock
+; CHECK: .LBB0_2: ! %targetblock
targetblock:
br i1 undef, label %cond.false.i83, label %exit.i85
diff --git a/test/CodeGen/SPARC/sjlj.ll b/test/CodeGen/SPARC/sjlj.ll
new file mode 100755
index 000000000000..3bf583aa4754
--- /dev/null
+++ b/test/CodeGen/SPARC/sjlj.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=sparc | FileCheck %s
+; RUN: llc < %s -march=sparc -mcpu=leon2 | FileCheck %s
+; RUN: llc < %s -march=sparc -mcpu=leon3 | FileCheck %s
+; RUN: llc < %s -march=sparc -mcpu=leon4 | FileCheck %s
+
+%struct.__jmp_buf_tag = type { [64 x i64], i32, %struct.__sigset_t, [8 x i8] }
+%struct.__sigset_t = type { [16 x i64] }
+
+@env_sigill = internal global [1 x %struct.__jmp_buf_tag] zeroinitializer, align 16
+
+define void @foo() #0 {
+entry:
+ call void @llvm.eh.sjlj.longjmp(i8* bitcast ([1 x %struct.__jmp_buf_tag]* @env_sigill to i8*))
+ unreachable
+
+; CHECK: @foo
+; CHECK: ta 3
+; CHECK: ld [%i0], %fp
+; CHECK: ld [%i0+4], %i1
+; CHECK: ld [%i0+8], %sp
+; CHECK: jmp %i1
+; CHECK: ld [%i0+12], %i7
+
+return: ; No predecessors!
+ ret void
+}
+
+declare void @llvm.eh.sjlj.longjmp(i8*) #1
+
+define signext i32 @main() #0 {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ store i8* %0, i8** bitcast ([1 x %struct.__jmp_buf_tag]* @env_sigill to i8**)
+ %1 = call i8* @llvm.stacksave()
+ store i8* %1, i8** getelementptr (i8*, i8** bitcast ([1 x %struct.__jmp_buf_tag]* @env_sigill to i8**), i32 2)
+ %2 = call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([1 x %struct.__jmp_buf_tag]* @env_sigill to i8*))
+ %tobool = icmp ne i32 %2, 0
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32* %retval
+ br label %return
+
+if.else: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.else
+ store i32 0, i32* %retval
+ br label %return
+
+return: ; preds = %if.end, %if.then
+ %3 = load i32, i32* %retval
+ ret i32 %3
+
+; CHECK: @main
+; CHECK: st %fp, [%i0]
+; CHECK: sethi %hi(.LBB1_2), %i1
+; CHECK: or %i1, %lo(.LBB1_2), %i1
+; CHECK: st %i1, [%i0+4]
+; CHECK: st %sp, [%i0+8]
+; CHECK: bn .LBB1_2
+; CHECK: st %i7, [%i0+12]
+; CHECK: ba .LBB1_1
+; CHECK: nop
+; CHECK:.LBB1_1: ! %entry
+; CHECK: ba .LBB1_3
+; CHECK: mov %g0, %i0
+; CHECK:.LBB1_2: ! Block address taken
+; CHECK: mov 1, %i0
+; CHECK:.LBB1_3: ! %entry
+; CHECK: cmp %i0, 0
+; CHECK: be .LBB1_5
+; CHECK: nop
+}
+declare i8* @llvm.frameaddress(i32) #2
+
+declare i8* @llvm.stacksave() #3
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*) #3
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noreturn nounwind }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
diff --git a/test/CodeGen/SPARC/soft-float.ll b/test/CodeGen/SPARC/soft-float.ll
new file mode 100644
index 000000000000..53ca1974659e
--- /dev/null
+++ b/test/CodeGen/SPARC/soft-float.ll
@@ -0,0 +1,235 @@
+; RUN: llc -march=sparc -mattr=soft-float -O0 < %s | FileCheck %s
+
+; Arithmetic functions
+
+define float @test_addsf3(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_addsf3:
+ ; CHECK: call __addsf3
+ %add = fadd float %a, %b
+ ret float %add
+}
+
+define double @test_adddf3(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_adddf3:
+ ; CHECK: call __adddf3
+ %add = fadd double %a, %b
+ ret double %add
+}
+
+define fp128 @test_addtf3(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_addtf3:
+ ; CHECK: call __addtf3
+ %add = fadd fp128 %a, %b
+ ret fp128 %add
+}
+
+define float @test_mulsf3(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_mulsf3:
+ ; CHECK: call __mulsf3
+ %mul = fmul float %a, %b
+ ret float %mul
+}
+
+define double @test_muldf3(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_muldf3:
+ ; CHECK: call __muldf3
+ %mul = fmul double %a, %b
+ ret double %mul
+}
+
+define fp128 @test_multf3(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_multf3:
+ ; CHECK: call __multf3
+ %mul = fmul fp128 %a, %b
+ ret fp128 %mul
+}
+
+define float @test_subsf3(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_subsf3:
+ ; CHECK: call __subsf3
+ %sub = fsub float %a, %b
+ ret float %sub
+}
+
+define double @test_subdf3(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_subdf3:
+ ; CHECK: call __subdf3
+ %sub = fsub double %a, %b
+ ret double %sub
+}
+
+define fp128 @test_subtf3(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_subtf3:
+ ; CHECK: call __subtf3
+ %sub = fsub fp128 %a, %b
+ ret fp128 %sub
+}
+
+define float @test_divsf3(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_divsf3:
+ ; CHECK: call __divsf3
+ %div = fdiv float %a, %b
+ ret float %div
+}
+
+define double @test_divdf3(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_divdf3:
+ ; CHECK: call __divdf3
+ %div = fdiv double %a, %b
+ ret double %div
+}
+
+define fp128 @test_divtf3(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_divtf3:
+ ; CHECK: call __divtf3
+ %div = fdiv fp128 %a, %b
+ ret fp128 %div
+}
+
+; Comparison functions
+define i1 @test_unordsf2(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_unordsf2:
+ ; CHECK: call __unordsf2
+ %cmp = fcmp uno float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_unorddf2(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_unorddf2:
+ ; CHECK: call __unorddf2
+ %cmp = fcmp uno double %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_unordtf2(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_unordtf2:
+ ; CHECK: call __unordtf2
+ %cmp = fcmp uno fp128 %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_eqsf2(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_eqsf2:
+ ; CHECK: call __eqsf2
+ %cmp = fcmp oeq float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_eqdf2(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_eqdf2:
+ ; CHECK: call __eqdf2
+ %cmp = fcmp oeq double %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_eqtf2(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_eqtf2:
+ ; CHECK: call __eqtf2
+ %cmp = fcmp oeq fp128 %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_nesf2(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_nesf2:
+ ; CHECK: call __nesf2
+ %cmp = fcmp une float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_nedf2(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_nedf2:
+ ; CHECK: call __nedf2
+ %cmp = fcmp une double %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_netf2(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_netf2:
+ ; CHECK: call __netf2
+ %cmp = fcmp une fp128 %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_gesf2(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_gesf2:
+ ; CHECK: call __gesf2
+ %cmp = fcmp oge float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_gedf2(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_gedf2:
+ ; CHECK: call __gedf2
+ %cmp = fcmp oge double %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_getf2(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_getf2:
+ ; CHECK: call __getf2
+ %cmp = fcmp oge fp128 %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_ltsf2(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_ltsf2:
+ ; CHECK: call __ltsf2
+ %cmp = fcmp olt float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_ltdf2(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_ltdf2:
+ ; CHECK: call __ltdf2
+ %cmp = fcmp olt double %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_lttf2(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_lttf2:
+ ; CHECK: call __lttf2
+ %cmp = fcmp olt fp128 %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_lesf2(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_lesf2:
+ ; CHECK: call __lesf2
+ %cmp = fcmp ole float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_ledf2(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_ledf2:
+ ; CHECK: call __ledf2
+ %cmp = fcmp ole double %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_letf2(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_letf2:
+ ; CHECK: call __letf2
+ %cmp = fcmp ole fp128 %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_gtsf2(float %a, float %b) #0 {
+ ; CHECK-LABEL: test_gtsf2:
+ ; CHECK: call __gtsf2
+ %cmp = fcmp ogt float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_gtdf2(double %a, double %b) #0 {
+ ; CHECK-LABEL: test_gtdf2:
+ ; CHECK: call __gtdf2
+ %cmp = fcmp ogt double %a, %b
+ ret i1 %cmp
+}
+
+define i1 @test_gttf2(fp128 %a, fp128 %b) #0 {
+ ; CHECK-LABEL: test_gttf2:
+ ; CHECK: call __gttf2
+ %cmp = fcmp ogt fp128 %a, %b
+ ret i1 %cmp
+}
diff --git a/test/CodeGen/SPARC/stack-protector.ll b/test/CodeGen/SPARC/stack-protector.ll
new file mode 100644
index 000000000000..70a73664aa16
--- /dev/null
+++ b/test/CodeGen/SPARC/stack-protector.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=sparc-unknown-linux < %s | FileCheck %s --check-prefix=LINUX-32
+; RUN: llc -mtriple=sparc64-unknown-linux < %s | FileCheck %s --check-prefix=LINUX-64
+; RUN: llc -mtriple=sparc-unknown-solaris < %s | FileCheck %s --check-prefix=GENERIC
+; RUN: llc -mtriple=sparc64-unknown-solaris < %s | FileCheck %s --check-prefix=GENERIC
+
+; LINUX-32: ld [%g7+20], [[REG1:%[ilo][0-9]*]]
+; LINUX-64: ldx [%g7+40], [[REG1:%[ilo][0-9]*]]
+; LINUX-32-NOT: __stack_chk_guard
+; LINUX-64-NOT: __stack_chk_guard
+; GENERIC: __stack_chk_guard
+
+@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1]
+
+define void @test(i8* %a) nounwind ssp {
+entry:
+ %a_addr = alloca i8* ; <i8**> [#uses=2]
+ %buf = alloca [8 x i8] ; <[8 x i8]*> [#uses=2]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ store i8* %a, i8** %a_addr
+ %buf1 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1]
+ %0 = load i8*, i8** %a_addr, align 4 ; <i8*> [#uses=1]
+ %1 = call i8* @strcpy(i8* %buf1, i8* %0) nounwind ; <i8*> [#uses=0]
+ %buf2 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1]
+ %2 = call i32 (i8*, ...) @printf(i8* getelementptr ([11 x i8], [11 x i8]* @"\01LC", i32 0, i32 0), i8* %buf2) nounwind ; <i32> [#uses=0]
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+declare i8* @strcpy(i8*, i8*) nounwind
+
+declare i32 @printf(i8*, ...) nounwind
diff --git a/test/CodeGen/SPARC/thread-pointer.ll b/test/CodeGen/SPARC/thread-pointer.ll
new file mode 100644
index 000000000000..33e99aa94747
--- /dev/null
+++ b/test/CodeGen/SPARC/thread-pointer.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.thread.pointer() #1
+
+define i8* @thread_pointer() {
+; CHECK: mov %g7, %o0
+ %1 = tail call i8* @llvm.thread.pointer()
+ ret i8* %1
+}
diff --git a/test/CodeGen/SPARC/vector-call.ll b/test/CodeGen/SPARC/vector-call.ll
new file mode 100644
index 000000000000..3b004b6230f8
--- /dev/null
+++ b/test/CodeGen/SPARC/vector-call.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=sparc | FileCheck %s
+
+; Verify that we correctly handle vector types that appear directly
+; during call lowering. These may cause issues, as v2i32 is a legal type
+; used in the implementation of LDD.
+
+; CHECK-LABEL: fun16v:
+; CHECK: foo1_16v
+; CHECK: foo2_16v
+
+define <2 x i16> @fun16v() #0 {
+ %1 = tail call <2 x i16> @foo1_16v()
+ %2 = tail call <2 x i16> @foo2_16v()
+ %3 = and <2 x i16> %2, %1
+ ret <2 x i16> %3
+}
+
+declare <2 x i16> @foo1_16v() #0
+declare <2 x i16> @foo2_16v() #0
+
+; CHECK-LABEL: fun32v:
+; CHECK: foo1_32v
+; CHECK: foo2_32v
+
+define <2 x i32> @fun32v() #0 {
+ %1 = tail call <2 x i32> @foo1_32v()
+ %2 = tail call <2 x i32> @foo2_32v()
+ %3 = and <2 x i32> %2, %1
+ ret <2 x i32> %3
+}
+
+declare <2 x i32> @foo1_32v() #0
+declare <2 x i32> @foo2_32v() #0
diff --git a/test/CodeGen/SPARC/zerostructcall.ll b/test/CodeGen/SPARC/zerostructcall.ll
new file mode 100644
index 000000000000..0a8ff65e1585
--- /dev/null
+++ b/test/CodeGen/SPARC/zerostructcall.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -march=sparc | FileCheck %s
+
+; CHECK-LABEL: struct_ptr_test
+; CHECK: call struct_ptr_fn
+; CHECK-NEXT: st %i0, [%fp+-4]
+; CHECK-NEXT: ret
+
+%struct.S = type {}
+
+define void @struct_ptr_test(i32 %i) {
+entry:
+ %i.addr = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ %0 = bitcast i32* %i.addr to %struct.S*
+ call void @struct_ptr_fn(%struct.S* byval align 1 %0)
+ ret void
+}
+
+declare void @struct_ptr_fn(%struct.S* byval align 1)
+
+; CHECK-LABEL: struct_test
+; CHECK: call struct_fn
+; CHECK-NEXT: nop
+; CHECK-NEXT: ret
+
+%struct.U = type {}
+
+@a = internal global [1 x %struct.U] zeroinitializer, align 1
+
+define void @struct_test() {
+entry:
+ tail call void @struct_fn(%struct.U* byval align 1 getelementptr inbounds ([1 x %struct.U], [1 x %struct.U]* @a, i32 0, i32 0))
+ ret void
+}
+
+; CHECK-LABEL: struct_arg_test
+; CHECK: call struct_arg_fn
+; CHECK-NEXT: nop
+; CHECK-NEXT: ret
+
+declare void @struct_fn(%struct.U* byval align 1)
+
+@b = internal global [1 x %struct.U] zeroinitializer, align 1
+
+define void @struct_arg_test() {
+entry:
+ tail call void @struct_arg_fn(%struct.U* byval align 1 getelementptr inbounds ([1 x %struct.U], [1 x %struct.U]* @b, i32 0, i32 0))
+ ret void
+}
+
+declare void @struct_arg_fn(%struct.U* byval align 1)
diff --git a/test/CodeGen/SystemZ/Large/branch-range-01.py b/test/CodeGen/SystemZ/Large/branch-range-01.py
index 365d7e420818..344d26121afb 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-01.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-01.py
@@ -70,6 +70,8 @@
branch_blocks = 10
main_size = 0xffd8
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i32 *%stop, i32 %limit) {'
print 'entry:'
print ' br label %before0'
@@ -101,5 +103,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-03.py b/test/CodeGen/SystemZ/Large/branch-range-03.py
index 745d733211ff..75c9ea4a0510 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-03.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-03.py
@@ -70,6 +70,8 @@
branch_blocks = 8
main_size = 0xffcc
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i8 *%stop, i32 %limit) {'
print 'entry:'
print ' br label %before0'
@@ -103,5 +105,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-04.py b/test/CodeGen/SystemZ/Large/branch-range-04.py
index a0c9c4426456..d475c9565e41 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-04.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-04.py
@@ -74,6 +74,8 @@
branch_blocks = 8
main_size = 0xffcc
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i8 *%stop, i64 %limit) {'
print 'entry:'
print ' br label %before0'
@@ -107,5 +109,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-05.py b/test/CodeGen/SystemZ/Large/branch-range-05.py
index 69a8112162a0..0a56eff85e1a 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-05.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-05.py
@@ -74,6 +74,8 @@
branch_blocks = 8
main_size = 0xffcc
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i8 *%stop) {'
print 'entry:'
print ' br label %before0'
@@ -105,5 +107,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-06.py b/test/CodeGen/SystemZ/Large/branch-range-06.py
index b08bc119c454..5b054345b083 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-06.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-06.py
@@ -74,6 +74,8 @@
branch_blocks = 8
main_size = 0xffcc
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i8 *%stop) {'
print 'entry:'
print ' br label %before0'
@@ -105,5 +107,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-09.py b/test/CodeGen/SystemZ/Large/branch-range-09.py
index d4693358f502..6b568a6e6409 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-09.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-09.py
@@ -70,6 +70,8 @@
branch_blocks = 8
main_size = 0xffcc
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i8 *%stop, i32 %limit) {'
print 'entry:'
print ' br label %before0'
@@ -103,5 +105,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-10.py b/test/CodeGen/SystemZ/Large/branch-range-10.py
index c928081f5544..c6f8945e2940 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-10.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-10.py
@@ -74,6 +74,8 @@
branch_blocks = 8
main_size = 0xffcc
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i8 *%stop, i64 %limit) {'
print 'entry:'
print ' br label %before0'
@@ -107,5 +109,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-11.py b/test/CodeGen/SystemZ/Large/branch-range-11.py
index 85166bc15656..10466df8baa7 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-11.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-11.py
@@ -90,6 +90,8 @@
branch_blocks = 8
main_size = 0xffc6
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i32 *%stopa, i32 *%stopb) {'
print 'entry:'
print ' br label %before0'
@@ -123,5 +125,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-12.py b/test/CodeGen/SystemZ/Large/branch-range-12.py
index e1d9e2977d41..809483a9fcd2 100644
--- a/test/CodeGen/SystemZ/Large/branch-range-12.py
+++ b/test/CodeGen/SystemZ/Large/branch-range-12.py
@@ -90,6 +90,8 @@
branch_blocks = 8
main_size = 0xffb4
+print '@global = global i32 0'
+
print 'define void @f1(i8 *%base, i64 *%stopa, i64 *%stopb) {'
print 'entry:'
print ' br label %before0'
@@ -123,5 +125,6 @@ for i in xrange(branch_blocks):
print ''
print 'after%d:' % i
+print ' %dummy = load volatile i32, i32 *@global'
print ' ret void'
print '}'
diff --git a/test/CodeGen/SystemZ/alloca-01.ll b/test/CodeGen/SystemZ/alloca-01.ll
index 06c336a331d8..9ffe59567caf 100644
--- a/test/CodeGen/SystemZ/alloca-01.ll
+++ b/test/CodeGen/SystemZ/alloca-01.ll
@@ -1,7 +1,7 @@
; Test variable-sized allocas and addresses based on them in cases where
; stack arguments are needed.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-A
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-B
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-C
diff --git a/test/CodeGen/SystemZ/and-xor-01.ll b/test/CodeGen/SystemZ/and-xor-01.ll
new file mode 100644
index 000000000000..f29c7d576d9b
--- /dev/null
+++ b/test/CodeGen/SystemZ/and-xor-01.ll
@@ -0,0 +1,14 @@
+; Test the peephole for generating shorter code for (and (xor b, -1), a)
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define i64 @f1(i64 %a, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK: ngr %r3, %r2
+; CHECK: xgr %r2, %r3
+; CHECK: br %r14
+ %neg = xor i64 %b, -1
+ %and = and i64 %neg, %a
+ ret i64 %and
+}
+
diff --git a/test/CodeGen/SystemZ/args-09.ll b/test/CodeGen/SystemZ/args-09.ll
new file mode 100644
index 000000000000..333b1daec2ad
--- /dev/null
+++ b/test/CodeGen/SystemZ/args-09.ll
@@ -0,0 +1,53 @@
+; Test the handling of i128 argument values
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-INT
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-I128-1
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-I128-2
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-STACK
+
+declare void @bar(i64, i64, i64, i64, i128,
+ i64, i64, i64, i64, i128)
+
+; There are two indirect i128 slots, one at offset 200 (the first available
+; byte after the outgoing arguments) and one immediately after it at 216.
+; These slots should be set up outside the glued call sequence, so would
+; normally use %f0/%f2 as the first available 128-bit pair. This choice
+; is hard-coded in the I128 tests.
+;
+; The order of the CHECK-STACK stores doesn't matter. It would be OK to reorder
+; them in response to future code changes.
+define void @foo() {
+; CHECK-INT-LABEL: foo:
+; CHECK-INT-DAG: lghi %r2, 1
+; CHECK-INT-DAG: lghi %r3, 2
+; CHECK-INT-DAG: lghi %r4, 3
+; CHECK-INT-DAG: lghi %r5, 4
+; CHECK-INT-DAG: la %r6, {{200|216}}(%r15)
+; CHECK-INT: brasl %r14, bar@PLT
+;
+; CHECK-I128-1-LABEL: foo:
+; CHECK-I128-1: aghi %r15, -232
+; CHECK-I128-1-DAG: mvghi 200(%r15), 0
+; CHECK-I128-1-DAG: mvghi 208(%r15), 0
+; CHECK-I128-1: brasl %r14, bar@PLT
+;
+; CHECK-I128-2-LABEL: foo:
+; CHECK-I128-2: aghi %r15, -232
+; CHECK-I128-2-DAG: mvghi 216(%r15), 0
+; CHECK-I128-2-DAG: mvghi 224(%r15), 0
+; CHECK-I128-2: brasl %r14, bar@PLT
+;
+; CHECK-STACK-LABEL: foo:
+; CHECK-STACK: aghi %r15, -232
+; CHECK-STACK: la [[REGISTER:%r[0-5]+]], {{200|216}}(%r15)
+; CHECK-STACK: stg [[REGISTER]], 192(%r15)
+; CHECK-STACK: mvghi 184(%r15), 8
+; CHECK-STACK: mvghi 176(%r15), 7
+; CHECK-STACK: mvghi 168(%r15), 6
+; CHECK-STACK: mvghi 160(%r15), 5
+; CHECK-STACK: brasl %r14, bar@PLT
+
+ call void @bar (i64 1, i64 2, i64 3, i64 4, i128 0,
+ i64 5, i64 6, i64 7, i64 8, i128 0)
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/args-10.ll b/test/CodeGen/SystemZ/args-10.ll
new file mode 100644
index 000000000000..6083c4415b33
--- /dev/null
+++ b/test/CodeGen/SystemZ/args-10.ll
@@ -0,0 +1,50 @@
+; Test incoming i128 arguments.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Do some arithmetic so that we can see the register being used.
+define void @f1(i128 *%r2, i16 %r3, i32 %r4, i64 %r5, i128 %r6) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lg [[REGL:%r[0-5]+]], 8(%r6)
+; CHECK-DAG: lg [[REGH:%r[0-5]+]], 0(%r6)
+; CHECK: algr [[REGL]], [[REGL]]
+; CHECK-NEXT: alcgr [[REGH]], [[REGH]]
+; CHECK-DAG: stg [[REGL]], 8(%r2)
+; CHECK-DAG: stg [[REGH]], 0(%r2)
+; CHECK: br %r14
+ %y = add i128 %r6, %r6
+ store i128 %y, i128 *%r2
+ ret void
+}
+
+; Test a case where the i128 address is passed on the stack.
+define void @f2(i128 *%r2, i16 %r3, i32 %r4, i64 %r5,
+ i128 %r6, i64 %s1, i64 %s2, i128 %s4) {
+; CHECK-LABEL: f2:
+; CHECK: lg [[ADDR:%r[1-5]+]], 176(%r15)
+; CHECK-DAG: lg [[REGL:%r[0-5]+]], 8([[ADDR]])
+; CHECK-DAG: lg [[REGH:%r[0-5]+]], 0([[ADDR]])
+; CHECK: algr [[REGL]], [[REGL]]
+; CHECK-NEXT: alcgr [[REGH]], [[REGH]]
+; CHECK-DAG: stg [[REGL]], 8(%r2)
+; CHECK-DAG: stg [[REGH]], 0(%r2)
+; CHECK: br %r14
+ %y = add i128 %s4, %s4
+ store i128 %y, i128 *%r2
+ ret void
+}
+
+; Explicit i128 return values are likewise passed indirectly.
+define i128 @f14(i128 %r3) {
+; CHECK-LABEL: f14:
+; CHECK-DAG: lg [[REGL:%r[0-5]+]], 8(%r3)
+; CHECK-DAG: lg [[REGH:%r[0-5]+]], 0(%r3)
+; CHECK: algr [[REGL]], [[REGL]]
+; CHECK-NEXT: alcgr [[REGH]], [[REGH]]
+; CHECK-DAG: stg [[REGL]], 8(%r2)
+; CHECK-DAG: stg [[REGH]], 0(%r2)
+; CHECK: br %r14
+ %y = add i128 %r3, %r3
+ ret i128 %y
+}
+
diff --git a/test/CodeGen/SystemZ/asm-02.ll b/test/CodeGen/SystemZ/asm-02.ll
index 458bfeb49753..426d84882900 100644
--- a/test/CodeGen/SystemZ/asm-02.ll
+++ b/test/CodeGen/SystemZ/asm-02.ll
@@ -48,5 +48,38 @@ define void @f4(i64 %base) {
ret void
}
-; FIXME: at the moment the precise constraint is not passed down to
-; target code, so we must conservatively treat "R" as "Q".
+; Check that indices are allowed
+define void @f5(i64 %base, i64 %index) {
+; CHECK-LABEL: f5:
+; CHECK: blah 0(%r3,%r2)
+; CHECK: br %r14
+ %add = add i64 %base, %index
+ %addr = inttoptr i64 %add to i64 *
+ call void asm "blah $0", "=*R" (i64 *%addr)
+ ret void
+}
+
+; Check that indices and displacements are allowed simultaneously
+define void @f6(i64 %base, i64 %index) {
+; CHECK-LABEL: f6:
+; CHECK: blah 4095(%r3,%r2)
+; CHECK: br %r14
+ %add = add i64 %base, 4095
+ %addi = add i64 %add, %index
+ %addr = inttoptr i64 %addi to i64 *
+ call void asm "blah $0", "=*R" (i64 *%addr)
+ ret void
+}
+
+; Check that LAY is used if there is an index but the displacement is too large
+define void @f7(i64 %base, i64 %index) {
+; CHECK-LABEL: f7:
+; CHECK: lay %r0, 4096(%r3,%r2)
+; CHECK: blah 0(%r0)
+; CHECK: br %r14
+ %add = add i64 %base, 4096
+ %addi = add i64 %add, %index
+ %addr = inttoptr i64 %addi to i64 *
+ call void asm "blah $0", "=*R" (i64 *%addr)
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/asm-03.ll b/test/CodeGen/SystemZ/asm-03.ll
index 2e60ad61ef40..d4fd564ce193 100644
--- a/test/CodeGen/SystemZ/asm-03.ll
+++ b/test/CodeGen/SystemZ/asm-03.ll
@@ -3,14 +3,48 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
+; Check the lowest range.
define void @f1(i64 %base) {
; CHECK-LABEL: f1:
+; CHECK: blah -524288(%r2)
+; CHECK: br %r14
+ %add = add i64 %base, -524288
+ %addr = inttoptr i64 %add to i64 *
+ call void asm "blah $0", "=*S" (i64 *%addr)
+ ret void
+}
+
+; Check the next lowest byte.
+define void @f2(i64 %base) {
+; CHECK-LABEL: f2:
+; CHECK: agfi %r2, -524289
; CHECK: blah 0(%r2)
; CHECK: br %r14
- %addr = inttoptr i64 %base to i64 *
+ %add = add i64 %base, -524289
+ %addr = inttoptr i64 %add to i64 *
call void asm "blah $0", "=*S" (i64 *%addr)
ret void
}
-; FIXME: at the moment the precise constraint is not passed down to
-; target code, so we must conservatively treat "S" as "Q".
+; Check the highest range.
+define void @f3(i64 %base) {
+; CHECK-LABEL: f3:
+; CHECK: blah 524287(%r2)
+; CHECK: br %r14
+ %add = add i64 %base, 524287
+ %addr = inttoptr i64 %add to i64 *
+ call void asm "blah $0", "=*S" (i64 *%addr)
+ ret void
+}
+
+; Check the next highest byte.
+define void @f4(i64 %base) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r2, 524288
+; CHECK: blah 0(%r2)
+; CHECK: br %r14
+ %add = add i64 %base, 524288
+ %addr = inttoptr i64 %add to i64 *
+ call void asm "blah $0", "=*S" (i64 *%addr)
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/asm-04.ll b/test/CodeGen/SystemZ/asm-04.ll
index b212253dbd9c..eb91bef83769 100644
--- a/test/CodeGen/SystemZ/asm-04.ll
+++ b/test/CodeGen/SystemZ/asm-04.ll
@@ -3,14 +3,71 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
+; Check the lowest range.
define void @f1(i64 %base) {
; CHECK-LABEL: f1:
+; CHECK: blah -524288(%r2)
+; CHECK: br %r14
+ %add = add i64 %base, -524288
+ %addr = inttoptr i64 %add to i64 *
+ call void asm "blah $0", "=*T" (i64 *%addr)
+ ret void
+}
+
+; Check the next lowest byte.
+define void @f2(i64 %base) {
+; CHECK-LABEL: f2:
+; CHECK: agfi %r2, -524289
; CHECK: blah 0(%r2)
; CHECK: br %r14
- %addr = inttoptr i64 %base to i64 *
+ %add = add i64 %base, -524289
+ %addr = inttoptr i64 %add to i64 *
call void asm "blah $0", "=*T" (i64 *%addr)
ret void
}
-; FIXME: at the moment the precise constraint is not passed down to
-; target code, so we must conservatively treat "T" as "Q".
+; Check the highest range.
+define void @f3(i64 %base) {
+; CHECK-LABEL: f3:
+; CHECK: blah 524287(%r2)
+; CHECK: br %r14
+ %add = add i64 %base, 524287
+ %addr = inttoptr i64 %add to i64 *
+ call void asm "blah $0", "=*T" (i64 *%addr)
+ ret void
+}
+
+; Check the next highest byte.
+define void @f4(i64 %base) {
+; CHECK-LABEL: f4:
+; CHECK: agfi %r2, 524288
+; CHECK: blah 0(%r2)
+; CHECK: br %r14
+ %add = add i64 %base, 524288
+ %addr = inttoptr i64 %add to i64 *
+ call void asm "blah $0", "=*T" (i64 *%addr)
+ ret void
+}
+
+; Check that indices are allowed
+define void @f5(i64 %base, i64 %index) {
+; CHECK-LABEL: f5:
+; CHECK: blah 0(%r3,%r2)
+; CHECK: br %r14
+ %add = add i64 %base, %index
+ %addr = inttoptr i64 %add to i64 *
+ call void asm "blah $0", "=*T" (i64 *%addr)
+ ret void
+}
+
+; Check that indices and displacements are allowed simultaneously
+define void @f6(i64 %base, i64 %index) {
+; CHECK-LABEL: f6:
+; CHECK: blah 524287(%r3,%r2)
+; CHECK: br %r14
+ %add = add i64 %base, 524287
+ %addi = add i64 %add, %index
+ %addr = inttoptr i64 %addi to i64 *
+ call void asm "blah $0", "=*T" (i64 *%addr)
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/asm-05.ll b/test/CodeGen/SystemZ/asm-05.ll
index db99b10853ed..9b23ac781750 100644
--- a/test/CodeGen/SystemZ/asm-05.ll
+++ b/test/CodeGen/SystemZ/asm-05.ll
@@ -10,6 +10,3 @@ define void @f1(i64 %base) {
call void asm "blah $0", "=*m" (i64 *%addr)
ret void
}
-
-; FIXME: at the moment the precise constraint is not passed down to
-; target code, so we must conservatively treat "m" as "Q".
diff --git a/test/CodeGen/SystemZ/atomic-fence-01.ll b/test/CodeGen/SystemZ/atomic-fence-01.ll
new file mode 100644
index 000000000000..25566db9078b
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomic-fence-01.ll
@@ -0,0 +1,16 @@
+; Test (fast) serialization.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=Z10
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s --check-prefix=Z196
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s --check-prefix=ZEC12
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=Z13
+
+define void @test() {
+; Z10: bcr 15, %r0
+; Z196: bcr 14, %r0
+; ZEC12: bcr 14, %r0
+; Z13: bcr 14, %r0
+ fence seq_cst
+ ret void
+}
+
diff --git a/test/CodeGen/SystemZ/atomic-fence-02.ll b/test/CodeGen/SystemZ/atomic-fence-02.ll
new file mode 100644
index 000000000000..4c4375ef6696
--- /dev/null
+++ b/test/CodeGen/SystemZ/atomic-fence-02.ll
@@ -0,0 +1,13 @@
+; Serialization is emitted only for fence seq_cst.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define void @test() {
+; CHECK: #MEMBARRIER
+ fence acquire
+; CHECK: #MEMBARRIER
+ fence release
+; CHECK: #MEMBARRIER
+ fence acq_rel
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-add-01.ll b/test/CodeGen/SystemZ/atomicrmw-add-01.ll
index 25f71f31ef1b..63c28ebb9872 100644
--- a/test/CodeGen/SystemZ/atomicrmw-add-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-add-01.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic additions.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,20 +14,20 @@
; instructions.
define i8 @f1(i8 *%src, i8 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg %r1, %r2, 0, 189, 0{{$}}
+; CHECK: sll [[SHIFT:%r[0-9]+]], 3
+; CHECK: l [[OLD:%r[0-9]+]], 0(%r1)
; CHECK: [[LABEL:\.[^:]*]]:
; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
; CHECK: ar [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0(%r1)
; CHECK: jl [[LABEL]]
; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
+; CHECK-SHIFT1: sll [[SHIFT:%r[1-9]+]], 3
; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
@@ -48,20 +48,20 @@ define i8 @f1(i8 *%src, i8 %b) {
; Check the minimum signed value. We add 0x80000000 to the rotated word.
define i8 @f2(i8 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: afi [[ROT]], -2147483648
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
+; CHECK-SHIFT1: sll [[SHIFT:%r[1-9]+]], 3
; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
diff --git a/test/CodeGen/SystemZ/atomicrmw-add-02.ll b/test/CodeGen/SystemZ/atomicrmw-add-02.ll
index cd4e4784c372..8f5e1b4998e5 100644
--- a/test/CodeGen/SystemZ/atomicrmw-add-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-add-02.ll
@@ -1,6 +1,6 @@
; Test 16-bit atomic additions.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,20 +14,20 @@
; instructions.
define i16 @f1(i16 *%src, i16 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: ar [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
+; CHECK-SHIFT1: sll [[SHIFT:%r[1-9]+]], 3
; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
@@ -48,20 +48,20 @@ define i16 @f1(i16 *%src, i16 %b) {
; Check the minimum signed value. We add 0x80000000 to the rotated word.
define i16 @f2(i16 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: afi [[ROT]], -2147483648
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
+; CHECK-SHIFT1: sll [[SHIFT:%r[1-9]+]], 3
; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
diff --git a/test/CodeGen/SystemZ/atomicrmw-and-01.ll b/test/CodeGen/SystemZ/atomicrmw-and-01.ll
index 6d2f541c3a35..c16071669f40 100644
--- a/test/CodeGen/SystemZ/atomicrmw-and-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-and-01.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic ANDs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,21 +14,20 @@
; independent of the other loop prologue instructions.
define i8 @f1(i8 *%src, i8 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: nr [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -49,21 +48,21 @@ define i8 @f1(i8 *%src, i8 %b) {
; Check the minimum signed value. We AND the rotated word with 0x80ffffff.
define i8 @f2(i8 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: nilh [[ROT]], 33023
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-and-02.ll b/test/CodeGen/SystemZ/atomicrmw-and-02.ll
index 572b22484b28..f827c4409fe5 100644
--- a/test/CodeGen/SystemZ/atomicrmw-and-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-and-02.ll
@@ -1,6 +1,6 @@
; Test 16-bit atomic ANDs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,21 +14,21 @@
; independent of the other loop prologue instructions.
define i16 @f1(i16 *%src, i16 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: nr [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -49,21 +49,21 @@ define i16 @f1(i16 *%src, i16 %b) {
; Check the minimum signed value. We AND the rotated word with 0x8000ffff.
define i16 @f2(i16 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: nilh [[ROT]], 32768
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
index 2b750c46e261..b304335391ee 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic min/max operations.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,23 +14,23 @@
; independent of the other loop prologue instructions.
define i8 @f1(i8 *%src, i8 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: crjle [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 39, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LOOP]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -50,23 +50,23 @@ define i8 @f1(i8 *%src, i8 %b) {
; Check signed maximum.
define i8 @f2(i8 *%src, i8 %b) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: crjhe [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 39, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LOOP]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -86,23 +86,23 @@ define i8 @f2(i8 *%src, i8 %b) {
; Check unsigned minimum.
define i8 @f3(i8 *%src, i8 %b) {
; CHECK-LABEL: f3:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: clrjle [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 39, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LOOP]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f3:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -122,23 +122,23 @@ define i8 @f3(i8 *%src, i8 %b) {
; Check unsigned maximum.
define i8 @f4(i8 *%src, i8 %b) {
; CHECK-LABEL: f4:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: clrjhe [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 39, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LOOP]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f4:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
index 98ffedf28c69..ccb51316552a 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic min/max operations.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,23 +14,23 @@
; independent of the other loop prologue instructions.
define i16 @f1(i16 *%src, i16 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: crjle [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 47, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LOOP]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -50,23 +50,23 @@ define i16 @f1(i16 *%src, i16 %b) {
; Check signed maximum.
define i16 @f2(i16 *%src, i16 %b) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: crjhe [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 47, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LOOP]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -86,23 +86,23 @@ define i16 @f2(i16 *%src, i16 %b) {
; Check unsigned minimum.
define i16 @f3(i16 *%src, i16 %b) {
; CHECK-LABEL: f3:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: clrjle [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 47, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LOOP]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f3:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -122,23 +122,23 @@ define i16 @f3(i16 *%src, i16 %b) {
; Check unsigned maximum.
define i16 @f4(i16 *%src, i16 %b) {
; CHECK-LABEL: f4:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: clrjhe [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 47, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LOOP]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f4:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
index 4ab48e46fc82..b53633a5e063 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
@@ -12,8 +12,8 @@ define i32 @f1(i32 %dummy, i32 *%src, i32 %b) {
; CHECK: crjle %r2, %r4, [[KEEP:\..*]]
; CHECK: lr [[NEW]], %r4
; CHECK: cs %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw min i32 *%src, i32 %b seq_cst
ret i32 %res
}
@@ -27,8 +27,8 @@ define i32 @f2(i32 %dummy, i32 *%src, i32 %b) {
; CHECK: crjhe %r2, %r4, [[KEEP:\..*]]
; CHECK: lr [[NEW]], %r4
; CHECK: cs %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw max i32 *%src, i32 %b seq_cst
ret i32 %res
}
@@ -42,8 +42,8 @@ define i32 @f3(i32 %dummy, i32 *%src, i32 %b) {
; CHECK: clrjle %r2, %r4, [[KEEP:\..*]]
; CHECK: lr [[NEW]], %r4
; CHECK: cs %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw umin i32 *%src, i32 %b seq_cst
ret i32 %res
}
@@ -57,8 +57,8 @@ define i32 @f4(i32 %dummy, i32 *%src, i32 %b) {
; CHECK: clrjhe %r2, %r4, [[KEEP:\..*]]
; CHECK: lr [[NEW]], %r4
; CHECK: cs %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw umax i32 *%src, i32 %b seq_cst
ret i32 %res
}
@@ -68,7 +68,7 @@ define i32 @f5(i32 %dummy, i32 *%src, i32 %b) {
; CHECK-LABEL: f5:
; CHECK: l %r2, 4092(%r3)
; CHECK: cs %r2, {{%r[0-9]+}}, 4092(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i32, i32 *%src, i64 1023
%res = atomicrmw min i32 *%ptr, i32 %b seq_cst
ret i32 %res
@@ -79,7 +79,7 @@ define i32 @f6(i32 %dummy, i32 *%src, i32 %b) {
; CHECK-LABEL: f6:
; CHECK: ly %r2, 4096(%r3)
; CHECK: csy %r2, {{%r[0-9]+}}, 4096(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i32, i32 *%src, i64 1024
%res = atomicrmw min i32 *%ptr, i32 %b seq_cst
ret i32 %res
@@ -90,7 +90,7 @@ define i32 @f7(i32 %dummy, i32 *%src, i32 %b) {
; CHECK-LABEL: f7:
; CHECK: ly %r2, 524284(%r3)
; CHECK: csy %r2, {{%r[0-9]+}}, 524284(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i32, i32 *%src, i64 131071
%res = atomicrmw min i32 *%ptr, i32 %b seq_cst
ret i32 %res
@@ -102,7 +102,7 @@ define i32 @f8(i32 %dummy, i32 *%src, i32 %b) {
; CHECK: agfi %r3, 524288
; CHECK: l %r2, 0(%r3)
; CHECK: cs %r2, {{%r[0-9]+}}, 0(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i32, i32 *%src, i64 131072
%res = atomicrmw min i32 *%ptr, i32 %b seq_cst
ret i32 %res
@@ -113,7 +113,7 @@ define i32 @f9(i32 %dummy, i32 *%src, i32 %b) {
; CHECK-LABEL: f9:
; CHECK: ly %r2, -4(%r3)
; CHECK: csy %r2, {{%r[0-9]+}}, -4(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i32, i32 *%src, i64 -1
%res = atomicrmw min i32 *%ptr, i32 %b seq_cst
ret i32 %res
@@ -124,7 +124,7 @@ define i32 @f10(i32 %dummy, i32 *%src, i32 %b) {
; CHECK-LABEL: f10:
; CHECK: ly %r2, -524288(%r3)
; CHECK: csy %r2, {{%r[0-9]+}}, -524288(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i32, i32 *%src, i64 -131072
%res = atomicrmw min i32 *%ptr, i32 %b seq_cst
ret i32 %res
@@ -136,7 +136,7 @@ define i32 @f11(i32 %dummy, i32 *%src, i32 %b) {
; CHECK: agfi %r3, -524292
; CHECK: l %r2, 0(%r3)
; CHECK: cs %r2, {{%r[0-9]+}}, 0(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i32, i32 *%src, i64 -131073
%res = atomicrmw min i32 *%ptr, i32 %b seq_cst
ret i32 %res
@@ -148,7 +148,7 @@ define i32 @f12(i32 %dummy, i64 %base, i64 %index, i32 %b) {
; CHECK: agr %r3, %r4
; CHECK: l %r2, 0(%r3)
; CHECK: cs %r2, {{%r[0-9]+}}, 0(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to i32 *
%res = atomicrmw min i32 *%ptr, i32 %b seq_cst
@@ -165,8 +165,8 @@ define i32 @f13(i32 %dummy, i32 *%ptr) {
; CHECK: crjle %r2, [[LIMIT]], [[KEEP:\..*]]
; CHECK: lhi [[NEW]], 42
; CHECK: cs %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw min i32 *%ptr, i32 42 seq_cst
ret i32 %res
}
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
index afd88a3dd42d..444dc915c0fe 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
@@ -12,8 +12,8 @@ define i64 @f1(i64 %dummy, i64 *%src, i64 %b) {
; CHECK: cgrjle %r2, %r4, [[KEEP:\..*]]
; CHECK: lgr [[NEW]], %r4
; CHECK: csg %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw min i64 *%src, i64 %b seq_cst
ret i64 %res
}
@@ -27,8 +27,8 @@ define i64 @f2(i64 %dummy, i64 *%src, i64 %b) {
; CHECK: cgrjhe %r2, %r4, [[KEEP:\..*]]
; CHECK: lgr [[NEW]], %r4
; CHECK: csg %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw max i64 *%src, i64 %b seq_cst
ret i64 %res
}
@@ -42,8 +42,8 @@ define i64 @f3(i64 %dummy, i64 *%src, i64 %b) {
; CHECK: clgrjle %r2, %r4, [[KEEP:\..*]]
; CHECK: lgr [[NEW]], %r4
; CHECK: csg %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw umin i64 *%src, i64 %b seq_cst
ret i64 %res
}
@@ -57,8 +57,8 @@ define i64 @f4(i64 %dummy, i64 *%src, i64 %b) {
; CHECK: clgrjhe %r2, %r4, [[KEEP:\..*]]
; CHECK: lgr [[NEW]], %r4
; CHECK: csg %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw umax i64 *%src, i64 %b seq_cst
ret i64 %res
}
@@ -68,7 +68,7 @@ define i64 @f5(i64 %dummy, i64 *%src, i64 %b) {
; CHECK-LABEL: f5:
; CHECK: lg %r2, 524280(%r3)
; CHECK: csg %r2, {{%r[0-9]+}}, 524280(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i64, i64 *%src, i64 65535
%res = atomicrmw min i64 *%ptr, i64 %b seq_cst
ret i64 %res
@@ -80,7 +80,7 @@ define i64 @f6(i64 %dummy, i64 *%src, i64 %b) {
; CHECK: agfi %r3, 524288
; CHECK: lg %r2, 0(%r3)
; CHECK: csg %r2, {{%r[0-9]+}}, 0(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i64, i64 *%src, i64 65536
%res = atomicrmw min i64 *%ptr, i64 %b seq_cst
ret i64 %res
@@ -91,7 +91,7 @@ define i64 @f7(i64 %dummy, i64 *%src, i64 %b) {
; CHECK-LABEL: f7:
; CHECK: lg %r2, -524288(%r3)
; CHECK: csg %r2, {{%r[0-9]+}}, -524288(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i64, i64 *%src, i64 -65536
%res = atomicrmw min i64 *%ptr, i64 %b seq_cst
ret i64 %res
@@ -103,7 +103,7 @@ define i64 @f8(i64 %dummy, i64 *%src, i64 %b) {
; CHECK: agfi %r3, -524296
; CHECK: lg %r2, 0(%r3)
; CHECK: csg %r2, {{%r[0-9]+}}, 0(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%ptr = getelementptr i64, i64 *%src, i64 -65537
%res = atomicrmw min i64 *%ptr, i64 %b seq_cst
ret i64 %res
@@ -115,7 +115,7 @@ define i64 @f9(i64 %dummy, i64 %base, i64 %index, i64 %b) {
; CHECK: agr %r3, %r4
; CHECK: lg %r2, 0(%r3)
; CHECK: csg %r2, {{%r[0-9]+}}, 0(%r3)
-; CHECK: br %r14
+; CHECK: ber %r14
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to i64 *
%res = atomicrmw min i64 *%ptr, i64 %b seq_cst
@@ -132,8 +132,8 @@ define i64 @f10(i64 %dummy, i64 *%ptr) {
; CHECK: cgrjle %r2, [[LIMIT]], [[KEEP:\..*]]
; CHECK: lghi [[NEW]], 42
; CHECK: csg %r2, [[NEW]], 0(%r3)
-; CHECK: jl [[LOOP]]
-; CHECK: br %r14
+; CHECK: ber %r14
+; CHECK: j [[LOOP]]
%res = atomicrmw min i64 *%ptr, i64 42 seq_cst
ret i64 %res
}
diff --git a/test/CodeGen/SystemZ/atomicrmw-nand-01.ll b/test/CodeGen/SystemZ/atomicrmw-nand-01.ll
index db5bb8ff9e79..f0fbd9d59a14 100644
--- a/test/CodeGen/SystemZ/atomicrmw-nand-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-nand-01.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic NANDs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,22 +14,22 @@
; independent of the other loop prologue instructions.
define i8 @f1(i8 *%src, i8 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: nr [[ROT]], %r3
; CHECK: xilf [[ROT]], 4278190080
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -50,22 +50,22 @@ define i8 @f1(i8 *%src, i8 %b) {
; Check the minimum signed value. We AND the rotated word with 0x80ffffff.
define i8 @f2(i8 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: nilh [[ROT]], 33023
; CHECK: xilf [[ROT]], 4278190080
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-nand-02.ll b/test/CodeGen/SystemZ/atomicrmw-nand-02.ll
index 6141543e0db2..45b22d4a6f18 100644
--- a/test/CodeGen/SystemZ/atomicrmw-nand-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-nand-02.ll
@@ -1,6 +1,6 @@
; Test 16-bit atomic NANDs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,22 +14,22 @@
; independent of the other loop prologue instructions.
define i16 @f1(i16 *%src, i16 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: nr [[ROT]], %r3
; CHECK: xilf [[ROT]], 4294901760
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -50,22 +50,22 @@ define i16 @f1(i16 *%src, i16 %b) {
; Check the minimum signed value. We AND the rotated word with 0x8000ffff.
define i16 @f2(i16 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: nilh [[ROT]], 32768
; CHECK: xilf [[ROT]], 4294901760
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-or-01.ll b/test/CodeGen/SystemZ/atomicrmw-or-01.ll
index caba621addc0..e4d790ebfcb7 100644
--- a/test/CodeGen/SystemZ/atomicrmw-or-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-or-01.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic ORs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,21 +14,21 @@
; instructions.
define i8 @f1(i8 *%src, i8 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: or [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -48,21 +48,21 @@ define i8 @f1(i8 *%src, i8 %b) {
; Check the minimum signed value. We OR the rotated word with 0x80000000.
define i8 @f2(i8 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: oilh [[ROT]], 32768
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-or-02.ll b/test/CodeGen/SystemZ/atomicrmw-or-02.ll
index 877c642a35ae..5029e7925bb1 100644
--- a/test/CodeGen/SystemZ/atomicrmw-or-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-or-02.ll
@@ -1,6 +1,6 @@
; Test 16-bit atomic ORs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,21 +14,21 @@
; instructions.
define i16 @f1(i16 *%src, i16 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: or [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -48,21 +48,21 @@ define i16 @f1(i16 *%src, i16 %b) {
; Check the minimum signed value. We OR the rotated word with 0x80000000.
define i16 @f2(i16 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: oilh [[ROT]], 32768
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-sub-01.ll b/test/CodeGen/SystemZ/atomicrmw-sub-01.ll
index 2c08ebd9f5fc..a12203cd7224 100644
--- a/test/CodeGen/SystemZ/atomicrmw-sub-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-sub-01.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic subtractions.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,21 +14,21 @@
; instructions.
define i8 @f1(i8 *%src, i8 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: sr [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -48,21 +48,21 @@ define i8 @f1(i8 *%src, i8 %b) {
; Check the minimum signed value. We add 0x80000000 to the rotated word.
define i8 @f2(i8 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: afi [[ROT]], -2147483648
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-sub-02.ll b/test/CodeGen/SystemZ/atomicrmw-sub-02.ll
index f82ebd9aaaae..1fe1bac18bef 100644
--- a/test/CodeGen/SystemZ/atomicrmw-sub-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-sub-02.ll
@@ -1,6 +1,6 @@
; Test 16-bit atomic subtractions.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,21 +14,21 @@
; instructions.
define i16 @f1(i16 *%src, i16 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: sr [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -48,21 +48,21 @@ define i16 @f1(i16 *%src, i16 %b) {
; Check the minimum signed value. We add 0x80000000 to the rotated word.
define i16 @f2(i16 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: afi [[ROT]], -2147483648
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-xchg-01.ll b/test/CodeGen/SystemZ/atomicrmw-xchg-01.ll
index 52575c634971..e7d47ed9c433 100644
--- a/test/CodeGen/SystemZ/atomicrmw-xchg-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-xchg-01.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic exchange.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT
; Check exchange with a variable.
@@ -12,23 +12,23 @@
; which shift %r3 left so that %b is at the high end of the word).
define i8 @f1(i8 *%src, i8 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: risbg [[ROT]], %r3, 32, 39, 24
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT-LABEL: f1:
; CHECK-SHIFT-NOT: %r3
-; CHECK-SHIFT: sllg [[SHIFT:%r[1-9]+]], %r2, 3
+; CHECK-SHIFT: sll %r2, 3
; CHECK-SHIFT-NOT: %r3
-; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT-NOT: %r3
; CHECK-SHIFT: rll
; CHECK-SHIFT-NOT: %r3
diff --git a/test/CodeGen/SystemZ/atomicrmw-xchg-02.ll b/test/CodeGen/SystemZ/atomicrmw-xchg-02.ll
index 04be623ada89..97d16c072bb6 100644
--- a/test/CodeGen/SystemZ/atomicrmw-xchg-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-xchg-02.ll
@@ -1,6 +1,6 @@
; Test 16-bit atomic exchange.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT
; Check exchange with a variable.
@@ -12,23 +12,23 @@
; which shift %r3 left so that %b is at the high end of the word).
define i16 @f1(i16 *%src, i16 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: risbg [[ROT]], %r3, 32, 47, 16
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT-LABEL: f1:
; CHECK-SHIFT-NOT: %r3
-; CHECK-SHIFT: sllg [[SHIFT:%r[1-9]+]], %r2, 3
+; CHECK-SHIFT: sll %r2, 3
; CHECK-SHIFT-NOT: %r3
-; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT-NOT: %r3
; CHECK-SHIFT: rll
; CHECK-SHIFT-NOT: %r3
diff --git a/test/CodeGen/SystemZ/atomicrmw-xor-01.ll b/test/CodeGen/SystemZ/atomicrmw-xor-01.ll
index e8fef2d31d2c..49bc7d7b0634 100644
--- a/test/CodeGen/SystemZ/atomicrmw-xor-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-xor-01.ll
@@ -1,6 +1,6 @@
; Test 8-bit atomic XORs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,21 +14,21 @@
; instructions.
define i8 @f1(i8 *%src, i8 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: xr [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -48,21 +48,21 @@ define i8 @f1(i8 *%src, i8 %b) {
; Check the minimum signed value. We XOR the rotated word with 0x80000000.
define i8 @f2(i8 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: xilf [[ROT]], 2147483648
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 8(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/atomicrmw-xor-02.ll b/test/CodeGen/SystemZ/atomicrmw-xor-02.ll
index 9405c2ec0c08..ca60e4189bad 100644
--- a/test/CodeGen/SystemZ/atomicrmw-xor-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-xor-02.ll
@@ -1,6 +1,6 @@
; Test 16-bit atomic XORs.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
@@ -14,21 +14,21 @@
; instructions.
define i16 @f1(i16 *%src, i16 %b) {
; CHECK-LABEL: f1:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: xr [[ROT]], %r3
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f1:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
@@ -48,21 +48,21 @@ define i16 @f1(i16 *%src, i16 %b) {
; Check the minimum signed value. We XOR the rotated word with 0x80000000.
define i16 @f2(i16 *%src) {
; CHECK-LABEL: f2:
-; CHECK: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK: nill %r2, 65532
-; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
+; CHECK: risbg [[RISBG:%r[1-9]+]], %r2, 0, 189, 0{{$}}
+; CHECK: sll %r2, 3
+; CHECK: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
+; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0(%r2)
; CHECK: xilf [[ROT]], 2147483648
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0([[NEGSHIFT:%r[1-9]+]])
-; CHECK: cs [[OLD]], [[NEW]], 0(%r2)
+; CHECK: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK: jl [[LABEL]]
-; CHECK: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK: rll %r2, [[OLD]], 16(%r2)
; CHECK: br %r14
;
; CHECK-SHIFT1-LABEL: f2:
-; CHECK-SHIFT1: sllg [[SHIFT:%r[1-9]+]], %r2, 3
-; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT1: sll %r2, 3
+; CHECK-SHIFT1: lcr [[NEGSHIFT:%r[1-9]+]], %r2
; CHECK-SHIFT1: rll
; CHECK-SHIFT1: rll {{%r[0-9]+}}, {{%r[0-9]+}}, 0([[NEGSHIFT]])
; CHECK-SHIFT1: rll
diff --git a/test/CodeGen/SystemZ/backchain.ll b/test/CodeGen/SystemZ/backchain.ll
new file mode 100644
index 000000000000..45775dbf273d
--- /dev/null
+++ b/test/CodeGen/SystemZ/backchain.ll
@@ -0,0 +1,84 @@
+; Test the backchain attribute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@llvm.stacksave()
+declare void @llvm.stackrestore(i8 *)
+declare void @g()
+
+; nothing should happen if no stack frame is needed.
+define void @f1() "backchain" {
+; CHECK-LABEL: f1:
+; CHECK-NOT: stg
+ ret void
+}
+
+; check that backchain is saved if we call someone
+define void @f2() "backchain" {
+; CHECK-LABEL: f2:
+; CHECK: stmg %r14, %r15, 112(%r15)
+; CHECK: lgr %r1, %r15
+; CHECK: aghi %r15, -160
+; CHECK: stg %r1, 0(%r15)
+ call void @g()
+ call void @g()
+ ret void
+}
+
+; check that backchain is saved if we have an alloca
+define void @f3() "backchain" {
+; CHECK-LABEL: f3:
+; CHECK-NOT: stmg
+; CHECK: lgr %r1, %r15
+; CHECK: aghi %r15, -168
+; CHECK: stg %r1, 0(%r15)
+ %ign = alloca i8, i32 4
+ ret void
+}
+
+; check that alloca copies the backchain
+define void @f4(i32 %len) "backchain" {
+; CHECK-LABEL: f4:
+; CHECK: stmg %r11, %r15, 88(%r15)
+; CHECK: lgr %r1, %r15
+; CHECK: aghi %r15, -160
+; CHECK: stg %r1, 0(%r15)
+; CHECK: lgr %r11, %r15
+; CHECK: lg [[BC:%r[0-9]+]], 0(%r15)
+; CHECK: lgr [[NEWSP:%r[0-9]+]], %r15
+; CHECK: lgr %r15, [[NEWSP]]
+; CHECK: stg [[BC]], 0([[NEWSP]])
+ %ign = alloca i8, i32 %len
+ ret void
+}
+
+; check that llvm.stackrestore restores the backchain
+define void @f5(i32 %count1, i32 %count2) "backchain" {
+; CHECK-LABEL: f5:
+; CHECK: stmg %r11, %r15, 88(%r15)
+; CHECK: lgr %r1, %r15
+; CHECK: aghi %r15, -160
+; CHECK: stg %r1, 0(%r15)
+; CHECK: lgr %r11, %r15
+; CHECK: lgr [[SAVESP:%r[0-9]+]], %r15
+; CHECK: lg [[BC:%r[0-9]+]], 0(%r15)
+; CHECK: lgr [[NEWSP:%r[0-9]+]], %r15
+; CHECK: lgr %r15, [[NEWSP]]
+; CHECK: stg [[BC]], 0([[NEWSP]])
+; CHECK: lg [[BC2:%r[0-9]+]], 0(%r15)
+; CHECK: lgr %r15, [[SAVESP]]
+; CHECK: stg [[BC2]], 0([[SAVESP]])
+; CHECK: lg [[BC3:%r[0-9]+]], 0(%r15)
+; CHECK: lgr [[NEWSP2:%r[0-9]+]], %r15
+; CHECK: lgr %r15, [[NEWSP2]]
+; CHECK: stg [[BC3]], 0([[NEWSP2]])
+; CHECK: lmg %r11, %r15, 248(%r11)
+; CHECK: br %r14
+ %src = call i8 *@llvm.stacksave()
+ %array1 = alloca i8, i32 %count1
+ store volatile i8 0, i8 *%array1
+ call void @llvm.stackrestore(i8 *%src)
+ %array2 = alloca i8, i32 %count2
+ store volatile i8 0, i8 *%array2
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/branch-05.ll b/test/CodeGen/SystemZ/branch-05.ll
index b2157b5ac778..4a4aa2a9eb36 100644
--- a/test/CodeGen/SystemZ/branch-05.ll
+++ b/test/CodeGen/SystemZ/branch-05.ll
@@ -5,7 +5,7 @@
define i32 @f1(i32 %x, i32 %y, i32 %op) {
; CHECK-LABEL: f1:
; CHECK: ahi %r4, -1
-; CHECK: clijh %r4, 5,
+; CHECK: clibh %r4, 5, 0(%r14)
; CHECK: llgfr [[OP64:%r[0-5]]], %r4
; CHECK: sllg [[INDEX:%r[1-5]]], [[OP64]], 3
; CHECK: larl [[BASE:%r[1-5]]]
diff --git a/test/CodeGen/SystemZ/bswap-06.ll b/test/CodeGen/SystemZ/bswap-06.ll
new file mode 100644
index 000000000000..19aafe2ca17b
--- /dev/null
+++ b/test/CodeGen/SystemZ/bswap-06.ll
@@ -0,0 +1,99 @@
+; Test 16-bit byteswaps from memory to registers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i16 @llvm.bswap.i16(i16 %a)
+
+; Check LRVH with no displacement.
+define i16 @f1(i16 *%src) {
+; CHECK-LABEL: f1:
+; CHECK: lrvh %r2, 0(%r2)
+; CHECK: br %r14
+ %a = load i16 , i16 *%src
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %swapped
+}
+
+; Check the high end of the aligned LRVH range.
+define i16 @f2(i16 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lrvh %r2, 524286(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%src, i64 262143
+ %a = load i16 , i16 *%ptr
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %swapped
+}
+
+; Check the next word up, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define i16 @f3(i16 *%src) {
+; CHECK-LABEL: f3:
+; CHECK: agfi %r2, 524288
+; CHECK: lrvh %r2, 0(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%src, i64 262144
+ %a = load i16 , i16 *%ptr
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %swapped
+}
+
+; Check the high end of the negative aligned LRVH range.
+define i16 @f4(i16 *%src) {
+; CHECK-LABEL: f4:
+; CHECK: lrvh %r2, -2(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%src, i64 -1
+ %a = load i16 , i16 *%ptr
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %swapped
+}
+
+; Check the low end of the LRVH range.
+define i16 @f5(i16 *%src) {
+; CHECK-LABEL: f5:
+; CHECK: lrvh %r2, -524288(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%src, i64 -262144
+ %a = load i16 , i16 *%ptr
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %swapped
+}
+
+; Check the next word down, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define i16 @f6(i16 *%src) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r2, -524290
+; CHECK: lrvh %r2, 0(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%src, i64 -262145
+ %a = load i16 , i16 *%ptr
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %swapped
+}
+
+; Check that LRVH allows an index.
+define i16 @f7(i64 %src, i64 %index) {
+; CHECK-LABEL: f7:
+; CHECK: lrvh %r2, 524287({{%r3,%r2|%r2,%r3}})
+; CHECK: br %r14
+ %add1 = add i64 %src, %index
+ %add2 = add i64 %add1, 524287
+ %ptr = inttoptr i64 %add2 to i16 *
+ %a = load i16 , i16 *%ptr
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %swapped
+}
+
+; Check that volatile accesses do not use LRVH, which might access the
+; storage multiple times.
+define i16 @f8(i16 *%src) {
+; CHECK-LABEL: f8:
+; CHECK: lh [[REG:%r[0-5]]], 0(%r2)
+; CHECK: lrvr %r2, [[REG]]
+; CHECK: br %r14
+ %a = load volatile i16 , i16 *%src
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %swapped
+}
diff --git a/test/CodeGen/SystemZ/bswap-07.ll b/test/CodeGen/SystemZ/bswap-07.ll
new file mode 100644
index 000000000000..7f0a265de756
--- /dev/null
+++ b/test/CodeGen/SystemZ/bswap-07.ll
@@ -0,0 +1,100 @@
+; Test 16-bit byteswaps from registers to memory.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i16 @llvm.bswap.i16(i16 %a)
+
+; Check STRVH with no displacement.
+define void @f1(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f1:
+; CHECK: strvh %r3, 0(%r2)
+; CHECK: br %r14
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ store i16 %swapped, i16 *%dst
+ ret void
+}
+
+; Check the high end of the aligned STRVH range.
+define void @f2(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f2:
+; CHECK: strvh %r3, 524286(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%dst, i64 262143
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ store i16 %swapped, i16 *%ptr
+ ret void
+}
+
+; Check the next word up, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define void @f3(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f3:
+; CHECK: agfi %r2, 524288
+; CHECK: strvh %r3, 0(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%dst, i64 262144
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ store i16 %swapped, i16 *%ptr
+ ret void
+}
+
+; Check the high end of the negative aligned STRVH range.
+define void @f4(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f4:
+; CHECK: strvh %r3, -2(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%dst, i64 -1
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ store i16 %swapped, i16 *%ptr
+ ret void
+}
+
+; Check the low end of the STRVH range.
+define void @f5(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f5:
+; CHECK: strvh %r3, -524288(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%dst, i64 -262144
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ store i16 %swapped, i16 *%ptr
+ ret void
+}
+
+; Check the next word down, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define void @f6(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r2, -524290
+; CHECK: strvh %r3, 0(%r2)
+; CHECK: br %r14
+ %ptr = getelementptr i16, i16 *%dst, i64 -262145
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ store i16 %swapped, i16 *%ptr
+ ret void
+}
+
+; Check that STRVH allows an index.
+define void @f7(i64 %src, i64 %index, i16 %a) {
+; CHECK-LABEL: f7:
+; CHECK: strvh %r4, 524287({{%r3,%r2|%r2,%r3}})
+; CHECK: br %r14
+ %add1 = add i64 %src, %index
+ %add2 = add i64 %add1, 524287
+ %ptr = inttoptr i64 %add2 to i16 *
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ store i16 %swapped, i16 *%ptr
+ ret void
+}
+
+; Check that volatile stores do not use STRVH, which might access the
+; storage multiple times.
+define void @f8(i16 *%dst, i16 %a) {
+; CHECK-LABEL: f8:
+; CHECK: lrvr [[REG:%r[0-5]]], %r3
+; CHECK: srl [[REG]], 16
+; CHECK: sth [[REG]], 0(%r2)
+; CHECK: br %r14
+ %swapped = call i16 @llvm.bswap.i16(i16 %a)
+ store volatile i16 %swapped, i16 *%dst
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/builtins.ll b/test/CodeGen/SystemZ/builtins.ll
new file mode 100644
index 000000000000..86546c08488a
--- /dev/null
+++ b/test/CodeGen/SystemZ/builtins.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.thread.pointer() #1
+
+define i8* @thread_pointer() {
+; CHECK: thread_pointer:
+; CHECK: ear [[REG1:%r[0-5]]], %a0
+; CHECK: sllg %r2, [[REG1]], 32
+; CHECK: ear %r2, %a1
+; CHECK: br %r14
+ %1 = tail call i8* @llvm.thread.pointer()
+ ret i8* %1
+}
diff --git a/test/CodeGen/SystemZ/call-04.ll b/test/CodeGen/SystemZ/call-04.ll
new file mode 100644
index 000000000000..12e0d5966765
--- /dev/null
+++ b/test/CodeGen/SystemZ/call-04.ll
@@ -0,0 +1,369 @@
+; Test conditional sibling calls.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @fun_a()
+declare void @fun_b()
+declare void @fun_c(i32)
+
+@var = global i32 1;
+
+; Check a conditional sibling call.
+define void @f1(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f1:
+; CHECK: cr %r2, %r3
+; CHECK: jgl fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp slt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call when there are two possibilities.
+define void @f2(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f2:
+; CHECK: cr %r2, %r3
+; CHECK: jghe fun_b@PLT
+; CHECK: jg fun_a@PLT
+ %cond = icmp slt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ tail call void @fun_b()
+ ret void
+}
+
+; Check a conditional sibling call with an argument - not supported.
+define void @f3(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f3:
+; CHECK: crjhe %r2, %r3
+; CHECK: jg fun_c@PLT
+; CHECK: br %r14
+ %cond = icmp slt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_c(i32 1)
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - unsigned compare.
+define void @f4(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f4:
+; CHECK: clr %r2, %r3
+; CHECK: jgl fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp ult i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - 64-bit compare.
+define void @f5(i64 %val1, i64 %val2) {
+; CHECK-LABEL: f5:
+; CHECK: cgr %r2, %r3
+; CHECK: jgl fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp slt i64 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - unsigned 64-bit compare.
+define void @f6(i64 %val1, i64 %val2) {
+; CHECK-LABEL: f6:
+; CHECK: clgr %r2, %r3
+; CHECK: jgl fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp ult i64 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - less-equal compare.
+define void @f7(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f7:
+; CHECK: cr %r2, %r3
+; CHECK: jgle fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp sle i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - high compare.
+define void @f8(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f8:
+; CHECK: cr %r2, %r3
+; CHECK: jgh fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp sgt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - high-equal compare.
+define void @f9(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f9:
+; CHECK: cr %r2, %r3
+; CHECK: jghe fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp sge i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - equal compare.
+define void @f10(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f10:
+; CHECK: cr %r2, %r3
+; CHECK: jge fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp eq i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - unequal compare.
+define void @f11(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f11:
+; CHECK: cr %r2, %r3
+; CHECK: jglh fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp ne i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate slt.
+define void @f12(i32 %val1) {
+; CHECK-LABEL: f12:
+; CHECK: chi %r2, 4
+; CHECK: jgle fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp slt i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate sle.
+define void @f13(i32 %val1) {
+; CHECK-LABEL: f13:
+; CHECK: chi %r2, 5
+; CHECK: jgle fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp sle i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate sgt.
+define void @f14(i32 %val1) {
+; CHECK-LABEL: f14:
+; CHECK: chi %r2, 6
+; CHECK: jghe fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp sgt i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate sge.
+define void @f15(i32 %val1) {
+; CHECK-LABEL: f15:
+; CHECK: chi %r2, 5
+; CHECK: jghe fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp sge i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate eq.
+define void @f16(i32 %val1) {
+; CHECK-LABEL: f16:
+; CHECK: chi %r2, 5
+; CHECK: jge fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp eq i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate ne.
+define void @f17(i32 %val1) {
+; CHECK-LABEL: f17:
+; CHECK: chi %r2, 5
+; CHECK: jglh fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp ne i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate ult.
+define void @f18(i32 %val1) {
+; CHECK-LABEL: f18:
+; CHECK: clfi %r2, 4
+; CHECK: jgle fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp ult i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate 64-bit slt.
+define void @f19(i64 %val1) {
+; CHECK-LABEL: f19:
+; CHECK: cghi %r2, 4
+; CHECK: jgle fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp slt i64 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate 64-bit ult.
+define void @f20(i64 %val1) {
+; CHECK-LABEL: f20:
+; CHECK: clgfi %r2, 4
+; CHECK: jgle fun_a@PLT
+; CHECK: br %r14
+ %cond = icmp ult i64 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void @fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/call-05.ll b/test/CodeGen/SystemZ/call-05.ll
new file mode 100644
index 000000000000..15704531d96a
--- /dev/null
+++ b/test/CodeGen/SystemZ/call-05.ll
@@ -0,0 +1,467 @@
+; Test conditional sibling calls.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+
+@var = global i32 1;
+@fun_a = global void()* null;
+@fun_b = global void()* null;
+@fun_c = global void(i32)* null;
+
+; Check a conditional sibling call.
+define void @f1(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f1:
+; CHECK: crbl %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp slt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call when there are two possibilities.
+define void @f2(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f2:
+; CHECK: crbl %r2, %r3, 0(%r1)
+; CHECK: br %r1
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %fun_b = load volatile void() *, void()** @fun_b;
+ %cond = icmp slt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ tail call void %fun_b()
+ ret void
+}
+
+; Check a conditional sibling call with an argument - not supported.
+define void @f3(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f3:
+; CHECK: crjhe %r2, %r3
+; CHECK: br %r1
+; CHECK: br %r14
+ %fun_c = load volatile void(i32) *, void(i32)** @fun_c;
+ %cond = icmp slt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_c(i32 1)
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - unsigned compare.
+define void @f4(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f4:
+; CHECK: clrbl %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp ult i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - 64-bit compare.
+define void @f5(i64 %val1, i64 %val2) {
+; CHECK-LABEL: f5:
+; CHECK: cgrbl %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp slt i64 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - unsigned 64-bit compare.
+define void @f6(i64 %val1, i64 %val2) {
+; CHECK-LABEL: f6:
+; CHECK: clgrbl %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp ult i64 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - less-equal compare.
+define void @f7(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f7:
+; CHECK: crble %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp sle i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - high compare.
+define void @f8(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f8:
+; CHECK: crbh %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp sgt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - high-equal compare.
+define void @f9(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f9:
+; CHECK: crbhe %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp sge i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - equal compare.
+define void @f10(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f10:
+; CHECK: crbe %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp eq i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - unequal compare.
+define void @f11(i32 %val1, i32 %val2) {
+; CHECK-LABEL: f11:
+; CHECK: crblh %r2, %r3, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp ne i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate slt.
+define void @f12(i32 %val1) {
+; CHECK-LABEL: f12:
+; CHECK: cible %r2, 4, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp slt i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate sle.
+define void @f13(i32 %val1) {
+; CHECK-LABEL: f13:
+; CHECK: cible %r2, 5, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp sle i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate sgt.
+define void @f14(i32 %val1) {
+; CHECK-LABEL: f14:
+; CHECK: cibhe %r2, 6, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp sgt i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate sge.
+define void @f15(i32 %val1) {
+; CHECK-LABEL: f15:
+; CHECK: cibhe %r2, 5, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp sge i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate eq.
+define void @f16(i32 %val1) {
+; CHECK-LABEL: f16:
+; CHECK: cibe %r2, 5, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp eq i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate ne.
+define void @f17(i32 %val1) {
+; CHECK-LABEL: f17:
+; CHECK: ciblh %r2, 5, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp ne i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate ult.
+define void @f18(i32 %val1) {
+; CHECK-LABEL: f18:
+; CHECK: clible %r2, 4, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp ult i32 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate 64-bit slt.
+define void @f19(i64 %val1) {
+; CHECK-LABEL: f19:
+; CHECK: cgible %r2, 4, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp slt i64 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - immediate 64-bit ult.
+define void @f20(i64 %val1) {
+; CHECK-LABEL: f20:
+; CHECK: clgible %r2, 4, 0(%r1)
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = icmp ult i64 %val1, 5;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call to an argument - will fail due to
+; intervening lgr.
+define void @f21(i32 %val1, i32 %val2, void()* %fun) {
+; CHECK-LABEL: f21:
+; CHECK: crjhe %r2, %r3
+; CHECK: lgr %r1, %r4
+; CHECK: br %r1
+; CHECK: br %r14
+ %cond = icmp slt i32 %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - float olt compare.
+define void @f22(float %val1, float %val2) {
+; CHECK-LABEL: f22:
+; CHECK: cebr %f0, %f2
+; CHECK: blr %r1
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = fcmp olt float %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - float ult compare.
+define void @f23(float %val1, float %val2) {
+; CHECK-LABEL: f23:
+; CHECK: cebr %f0, %f2
+; CHECK: bnher %r1
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = fcmp ult float %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - float ord compare.
+define void @f24(float %val1, float %val2) {
+; CHECK-LABEL: f24:
+; CHECK: cebr %f0, %f2
+; CHECK: bnor %r1
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = fcmp ord float %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
+
+; Check a conditional sibling call - float uno compare.
+define void @f25(float %val1, float %val2) {
+; CHECK-LABEL: f25:
+; CHECK: cebr %f0, %f2
+; CHECK: bor %r1
+; CHECK: br %r14
+ %fun_a = load volatile void() *, void()** @fun_a;
+ %cond = fcmp uno float %val1, %val2;
+ br i1 %cond, label %a, label %b;
+
+a:
+ tail call void %fun_a()
+ ret void
+
+b:
+ store i32 1, i32 *@var;
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/cmpxchg-01.ll b/test/CodeGen/SystemZ/cmpxchg-01.ll
index 5118aadcf2ad..a74c2ff878e7 100644
--- a/test/CodeGen/SystemZ/cmpxchg-01.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-01.ll
@@ -12,23 +12,23 @@
; which shift %r3 left so that %b is at the high end of the word).
define i8 @f1(i8 %dummy, i8 *%src, i8 %cmp, i8 %swap) {
; CHECK-MAIN-LABEL: f1:
-; CHECK-MAIN: sllg [[SHIFT:%r[1-9]+]], %r3, 3
-; CHECK-MAIN: nill %r3, 65532
-; CHECK-MAIN: l [[OLD:%r[0-9]+]], 0(%r3)
+; CHECK-MAIN: risbg [[RISBG:%r[1-9]+]], %r3, 0, 189, 0{{$}}
+; CHECK-MAIN: sll %r3, 3
+; CHECK-MAIN: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK-MAIN: [[LOOP:\.[^ ]*]]:
-; CHECK-MAIN: rll %r2, [[OLD]], 8([[SHIFT]])
+; CHECK-MAIN: rll %r2, [[OLD]], 8(%r3)
; CHECK-MAIN: risbg %r4, %r2, 32, 55, 0
; CHECK-MAIN: crjlh %r2, %r4, [[EXIT:\.[^ ]*]]
; CHECK-MAIN: risbg %r5, %r2, 32, 55, 0
; CHECK-MAIN: rll [[NEW:%r[0-9]+]], %r5, -8({{%r[1-9]+}})
-; CHECK-MAIN: cs [[OLD]], [[NEW]], 0(%r3)
+; CHECK-MAIN: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK-MAIN: jl [[LOOP]]
; CHECK-MAIN: [[EXIT]]:
; CHECK-MAIN-NOT: %r2
; CHECK-MAIN: br %r14
;
; CHECK-SHIFT-LABEL: f1:
-; CHECK-SHIFT: sllg [[SHIFT:%r[1-9]+]], %r3, 3
+; CHECK-SHIFT: sll [[SHIFT:%r[1-9]+]], 3
; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT: rll
; CHECK-SHIFT: rll {{%r[0-9]+}}, %r5, -8([[NEGSHIFT]])
diff --git a/test/CodeGen/SystemZ/cmpxchg-02.ll b/test/CodeGen/SystemZ/cmpxchg-02.ll
index 9eb0628b5a30..2445c0deab14 100644
--- a/test/CodeGen/SystemZ/cmpxchg-02.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-02.ll
@@ -12,24 +12,24 @@
; which shift %r3 left so that %b is at the high end of the word).
define i16 @f1(i16 %dummy, i16 *%src, i16 %cmp, i16 %swap) {
; CHECK-MAIN-LABEL: f1:
-; CHECK-MAIN: sllg [[SHIFT:%r[1-9]+]], %r3, 3
-; CHECK-MAIN: nill %r3, 65532
-; CHECK-MAIN: l [[OLD:%r[0-9]+]], 0(%r3)
+; CHECK-MAIN: risbg [[RISBG:%r[1-9]+]], %r3, 0, 189, 0{{$}}
+; CHECK-MAIN: sll %r3, 3
+; CHECK-MAIN: l [[OLD:%r[0-9]+]], 0([[RISBG]])
; CHECK-MAIN: [[LOOP:\.[^ ]*]]:
-; CHECK-MAIN: rll %r2, [[OLD]], 16([[SHIFT]])
+; CHECK-MAIN: rll %r2, [[OLD]], 16(%r3)
; CHECK-MAIN: risbg %r4, %r2, 32, 47, 0
; CHECK-MAIN: crjlh %r2, %r4, [[EXIT:\.[^ ]*]]
; CHECK-MAIN: risbg %r5, %r2, 32, 47, 0
; CHECK-MAIN: rll [[NEW:%r[0-9]+]], %r5, -16({{%r[1-9]+}})
-; CHECK-MAIN: cs [[OLD]], [[NEW]], 0(%r3)
+; CHECK-MAIN: cs [[OLD]], [[NEW]], 0([[RISBG]])
; CHECK-MAIN: jl [[LOOP]]
; CHECK-MAIN: [[EXIT]]:
; CHECK-MAIN-NOT: %r2
; CHECK-MAIN: br %r14
;
; CHECK-SHIFT-LABEL: f1:
-; CHECK-SHIFT: sllg [[SHIFT:%r[1-9]+]], %r3, 3
-; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
+; CHECK-SHIFT: sll %r3, 3
+; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], %r3
; CHECK-SHIFT: rll
; CHECK-SHIFT: rll {{%r[0-9]+}}, %r5, -16([[NEGSHIFT]])
%pair = cmpxchg i16 *%src, i16 %cmp, i16 %swap seq_cst seq_cst
diff --git a/test/CodeGen/SystemZ/cmpxchg-05.ll b/test/CodeGen/SystemZ/cmpxchg-05.ll
new file mode 100644
index 000000000000..68261efa6384
--- /dev/null
+++ b/test/CodeGen/SystemZ/cmpxchg-05.ll
@@ -0,0 +1,81 @@
+; Test proper extension of 8-bit/16-bit cmpxchg.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; CHECK-LABEL: f1
+; CHECK: crjlh
+; CHECK-NOT: llcr
+; CHECK-NOT: cr
+; CHECK: llgcr %r2, [[RES:%r[0-9]+]]
+; CHECK-NOT: llcr
+; CHECK-NOT: cr
+define zeroext i8 @f1(i8* nocapture, i8 zeroext, i8 zeroext) {
+ %cx = cmpxchg i8* %0, i8 %1, i8 %2 seq_cst seq_cst
+ %res = extractvalue { i8, i1 } %cx, 0
+ ret i8 %res
+}
+
+; CHECK-LABEL: f2
+; CHECK: crjlh
+; CHECK-NOT: llhr
+; CHECK-NOT: cr
+; CHECK: llghr %r2, [[RES:%r[0-9]+]]
+; CHECK-NOT: llhr
+; CHECK-NOT: cr
+define zeroext i16 @f2(i16* nocapture, i16 zeroext, i16 zeroext) {
+ %cx = cmpxchg i16* %0, i16 %1, i16 %2 seq_cst seq_cst
+ %res = extractvalue { i16, i1 } %cx, 0
+ ret i16 %res
+}
+
+; CHECK-LABEL: f3
+; CHECK: crjlh
+; CHECK-NOT: llcr
+; CHECK-NOT: cr
+; CHECK: lgbr %r2, [[RES:%r[0-9]+]]
+; CHECK-NOT: llcr
+; CHECK-NOT: cr
+define signext i8 @f3(i8* nocapture, i8 signext, i8 signext) {
+ %cx = cmpxchg i8* %0, i8 %1, i8 %2 seq_cst seq_cst
+ %res = extractvalue { i8, i1 } %cx, 0
+ ret i8 %res
+}
+
+; CHECK-LABEL: f4
+; CHECK: crjlh
+; CHECK-NOT: llhr
+; CHECK-NOT: cr
+; CHECK: lghr %r2, [[RES:%r[0-9]+]]
+; CHECK-NOT: llhr
+; CHECK-NOT: cr
+define signext i16 @f4(i16* nocapture, i16 signext, i16 signext) {
+ %cx = cmpxchg i16* %0, i16 %1, i16 %2 seq_cst seq_cst
+ %res = extractvalue { i16, i1 } %cx, 0
+ ret i16 %res
+}
+
+; Now use the comparison result.
+; CHECK-LABEL: f5
+; CHECK: llcr [[REG:%r[0-9]+]], [[RES:%r[0-9]+]]
+; CHECK: cr [[REG]], %r3
+define zeroext i8 @f5(i8* nocapture, i8 zeroext, i8 zeroext) {
+ %cx = cmpxchg i8* %0, i8 %1, i8 %2 seq_cst seq_cst
+ %res = extractvalue { i8, i1 } %cx, 1
+ %xres = sext i1 %res to i8
+ ret i8 %xres
+}
+
+; Now use the comparison result and zero-extended old value.
+; CHECK-LABEL: f6
+; CHECK: llcr [[REG:%r[0-9]+]], [[RES:%r[0-9]+]]
+; CHECK: st [[REG]], 0(%r5)
+; CHECK: cr [[REG]], %r3
+define zeroext i8 @f6(i8* nocapture, i8 zeroext, i8 zeroext, i32*) {
+ %cx = cmpxchg i8* %0, i8 %1, i8 %2 seq_cst seq_cst
+ %old = extractvalue { i8, i1 } %cx, 0
+ %xold = zext i8 %old to i32
+ store i32 %xold, i32* %3
+ %res = extractvalue { i8, i1 } %cx, 1
+ %xres = sext i1 %res to i8
+ ret i8 %xres
+}
diff --git a/test/CodeGen/SystemZ/cond-li.ll b/test/CodeGen/SystemZ/cond-li.ll
new file mode 100644
index 000000000000..a3e2f3fd1252
--- /dev/null
+++ b/test/CodeGen/SystemZ/cond-li.ll
@@ -0,0 +1,23 @@
+; Test LOCHI/LOCGHI
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; CHECK-LABEL: bar1:
+; CHECK: lhi [[REG:%r[0-5]]], 42
+; CHECK: chi %r2, 0
+; CHECK: lochie [[REG]], 0
+define signext i32 @bar1(i32 signext %x) {
+ %cmp = icmp ne i32 %x, 0
+ %.x = select i1 %cmp, i32 42, i32 0
+ ret i32 %.x
+}
+
+; CHECK-LABEL: bar2:
+; CHECK: ltgr [[REG:%r[0-5]]], %r2
+; CHECK: lghi %r2, 42
+; CHECK: locghie %r2, 0
+define signext i64 @bar2(i64 signext %x) {
+ %cmp = icmp ne i64 %x, 0
+ %.x = select i1 %cmp, i64 42, i64 0
+ ret i64 %.x
+}
diff --git a/test/CodeGen/SystemZ/cond-store-01.ll b/test/CodeGen/SystemZ/cond-store-01.ll
index ec7fc4a31fcd..a682d222add5 100644
--- a/test/CodeGen/SystemZ/cond-store-01.ll
+++ b/test/CodeGen/SystemZ/cond-store-01.ll
@@ -9,10 +9,9 @@ declare void @foo(i8 *)
define void @f1(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK-LABEL: f1:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -25,10 +24,9 @@ define void @f1(i8 *%ptr, i8 %alt, i32 %limit) {
define void @f2(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK-LABEL: f2:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -42,10 +40,9 @@ define void @f2(i8 *%ptr, i8 %alt, i32 %limit) {
define void @f3(i8 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f3:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -60,10 +57,9 @@ define void @f3(i8 *%ptr, i32 %alt, i32 %limit) {
define void @f4(i8 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f4:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -79,10 +75,9 @@ define void @f4(i8 *%ptr, i32 %alt, i32 %limit) {
define void @f5(i8 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f5:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -97,10 +92,9 @@ define void @f5(i8 *%ptr, i32 %alt, i32 %limit) {
define void @f6(i8 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f6:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -116,10 +110,9 @@ define void @f6(i8 *%ptr, i32 %alt, i32 %limit) {
define void @f7(i8 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f7:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -134,10 +127,9 @@ define void @f7(i8 *%ptr, i64 %alt, i32 %limit) {
define void @f8(i8 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f8:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -153,10 +145,9 @@ define void @f8(i8 *%ptr, i64 %alt, i32 %limit) {
define void @f9(i8 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f9:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -171,10 +162,9 @@ define void @f9(i8 *%ptr, i64 %alt, i32 %limit) {
define void @f10(i8 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f10:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i8 , i8 *%ptr
@@ -189,10 +179,9 @@ define void @f10(i8 *%ptr, i64 %alt, i32 %limit) {
define void @f11(i8 *%base, i8 %alt, i32 %limit) {
; CHECK-LABEL: f11:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stc %r3, 4095(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8, i8 *%base, i64 4095
%cond = icmp ult i32 %limit, 420
@@ -206,10 +195,9 @@ define void @f11(i8 *%base, i8 %alt, i32 %limit) {
define void @f12(i8 *%base, i8 %alt, i32 %limit) {
; CHECK-LABEL: f12:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stcy %r3, 4096(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8, i8 *%base, i64 4096
%cond = icmp ult i32 %limit, 420
@@ -223,10 +211,9 @@ define void @f12(i8 *%base, i8 %alt, i32 %limit) {
define void @f13(i8 *%base, i8 %alt, i32 %limit) {
; CHECK-LABEL: f13:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stcy %r3, 524287(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8, i8 *%base, i64 524287
%cond = icmp ult i32 %limit, 420
@@ -241,11 +228,10 @@ define void @f13(i8 *%base, i8 %alt, i32 %limit) {
define void @f14(i8 *%base, i8 %alt, i32 %limit) {
; CHECK-LABEL: f14:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, 524288
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8, i8 *%base, i64 524288
%cond = icmp ult i32 %limit, 420
@@ -259,10 +245,9 @@ define void @f14(i8 *%base, i8 %alt, i32 %limit) {
define void @f15(i8 *%base, i8 %alt, i32 %limit) {
; CHECK-LABEL: f15:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stcy %r3, -524288(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8, i8 *%base, i64 -524288
%cond = icmp ult i32 %limit, 420
@@ -277,11 +262,10 @@ define void @f15(i8 *%base, i8 %alt, i32 %limit) {
define void @f16(i8 *%base, i8 %alt, i32 %limit) {
; CHECK-LABEL: f16:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, -524289
; CHECK: stc %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8, i8 *%base, i64 -524289
%cond = icmp ult i32 %limit, 420
@@ -295,10 +279,9 @@ define void @f16(i8 *%base, i8 %alt, i32 %limit) {
define void @f17(i64 %base, i64 %index, i8 %alt, i32 %limit) {
; CHECK-LABEL: f17:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stcy %r4, 4096(%r3,%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 4096
diff --git a/test/CodeGen/SystemZ/cond-store-02.ll b/test/CodeGen/SystemZ/cond-store-02.ll
index 22bdfa3c27dc..5cb024d8b4e8 100644
--- a/test/CodeGen/SystemZ/cond-store-02.ll
+++ b/test/CodeGen/SystemZ/cond-store-02.ll
@@ -9,10 +9,9 @@ declare void @foo(i16 *)
define void @f1(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK-LABEL: f1:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -25,10 +24,9 @@ define void @f1(i16 *%ptr, i16 %alt, i32 %limit) {
define void @f2(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK-LABEL: f2:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -42,10 +40,9 @@ define void @f2(i16 *%ptr, i16 %alt, i32 %limit) {
define void @f3(i16 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f3:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -60,10 +57,9 @@ define void @f3(i16 *%ptr, i32 %alt, i32 %limit) {
define void @f4(i16 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f4:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -79,10 +75,9 @@ define void @f4(i16 *%ptr, i32 %alt, i32 %limit) {
define void @f5(i16 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f5:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -97,10 +92,9 @@ define void @f5(i16 *%ptr, i32 %alt, i32 %limit) {
define void @f6(i16 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f6:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -116,10 +110,9 @@ define void @f6(i16 *%ptr, i32 %alt, i32 %limit) {
define void @f7(i16 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f7:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -134,10 +127,9 @@ define void @f7(i16 *%ptr, i64 %alt, i32 %limit) {
define void @f8(i16 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f8:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -153,10 +145,9 @@ define void @f8(i16 *%ptr, i64 %alt, i32 %limit) {
define void @f9(i16 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f9:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -171,10 +162,9 @@ define void @f9(i16 *%ptr, i64 %alt, i32 %limit) {
define void @f10(i16 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f10:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i16 , i16 *%ptr
@@ -189,10 +179,9 @@ define void @f10(i16 *%ptr, i64 %alt, i32 %limit) {
define void @f11(i16 *%base, i16 %alt, i32 %limit) {
; CHECK-LABEL: f11:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sth %r3, 4094(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 2047
%cond = icmp ult i32 %limit, 420
@@ -206,10 +195,9 @@ define void @f11(i16 *%base, i16 %alt, i32 %limit) {
define void @f12(i16 *%base, i16 %alt, i32 %limit) {
; CHECK-LABEL: f12:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sthy %r3, 4096(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 2048
%cond = icmp ult i32 %limit, 420
@@ -223,10 +211,9 @@ define void @f12(i16 *%base, i16 %alt, i32 %limit) {
define void @f13(i16 *%base, i16 %alt, i32 %limit) {
; CHECK-LABEL: f13:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sthy %r3, 524286(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 262143
%cond = icmp ult i32 %limit, 420
@@ -241,11 +228,10 @@ define void @f13(i16 *%base, i16 %alt, i32 %limit) {
define void @f14(i16 *%base, i16 %alt, i32 %limit) {
; CHECK-LABEL: f14:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, 524288
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 262144
%cond = icmp ult i32 %limit, 420
@@ -259,10 +245,9 @@ define void @f14(i16 *%base, i16 %alt, i32 %limit) {
define void @f15(i16 *%base, i16 %alt, i32 %limit) {
; CHECK-LABEL: f15:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sthy %r3, -524288(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 -262144
%cond = icmp ult i32 %limit, 420
@@ -277,11 +262,10 @@ define void @f15(i16 *%base, i16 %alt, i32 %limit) {
define void @f16(i16 *%base, i16 %alt, i32 %limit) {
; CHECK-LABEL: f16:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, -524290
; CHECK: sth %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 -262145
%cond = icmp ult i32 %limit, 420
@@ -295,10 +279,9 @@ define void @f16(i16 *%base, i16 %alt, i32 %limit) {
define void @f17(i64 %base, i64 %index, i16 %alt, i32 %limit) {
; CHECK-LABEL: f17:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sthy %r4, 4096(%r3,%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 4096
diff --git a/test/CodeGen/SystemZ/cond-store-03.ll b/test/CodeGen/SystemZ/cond-store-03.ll
index 7207164a6314..46cdbff312c4 100644
--- a/test/CodeGen/SystemZ/cond-store-03.ll
+++ b/test/CodeGen/SystemZ/cond-store-03.ll
@@ -8,10 +8,9 @@ declare void @foo(i32 *)
define void @f1(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f1:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: st %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i32 , i32 *%ptr
@@ -24,10 +23,9 @@ define void @f1(i32 *%ptr, i32 %alt, i32 %limit) {
define void @f2(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK-LABEL: f2:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: st %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i32 , i32 *%ptr
@@ -41,10 +39,9 @@ define void @f2(i32 *%ptr, i32 %alt, i32 %limit) {
define void @f3(i32 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f3:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: st %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i32 , i32 *%ptr
@@ -59,10 +56,9 @@ define void @f3(i32 *%ptr, i64 %alt, i32 %limit) {
define void @f4(i32 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f4:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: st %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i32 , i32 *%ptr
@@ -78,10 +74,9 @@ define void @f4(i32 *%ptr, i64 %alt, i32 %limit) {
define void @f5(i32 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f5:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: st %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i32 , i32 *%ptr
@@ -96,10 +91,9 @@ define void @f5(i32 *%ptr, i64 %alt, i32 %limit) {
define void @f6(i32 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f6:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: st %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i32 , i32 *%ptr
@@ -114,10 +108,9 @@ define void @f6(i32 *%ptr, i64 %alt, i32 %limit) {
define void @f7(i32 *%base, i32 %alt, i32 %limit) {
; CHECK-LABEL: f7:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: st %r3, 4092(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1023
%cond = icmp ult i32 %limit, 420
@@ -131,10 +124,9 @@ define void @f7(i32 *%base, i32 %alt, i32 %limit) {
define void @f8(i32 *%base, i32 %alt, i32 %limit) {
; CHECK-LABEL: f8:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sty %r3, 4096(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1024
%cond = icmp ult i32 %limit, 420
@@ -148,10 +140,9 @@ define void @f8(i32 *%base, i32 %alt, i32 %limit) {
define void @f9(i32 *%base, i32 %alt, i32 %limit) {
; CHECK-LABEL: f9:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sty %r3, 524284(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131071
%cond = icmp ult i32 %limit, 420
@@ -166,11 +157,10 @@ define void @f9(i32 *%base, i32 %alt, i32 %limit) {
define void @f10(i32 *%base, i32 %alt, i32 %limit) {
; CHECK-LABEL: f10:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, 524288
; CHECK: st %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131072
%cond = icmp ult i32 %limit, 420
@@ -184,10 +174,9 @@ define void @f10(i32 *%base, i32 %alt, i32 %limit) {
define void @f11(i32 *%base, i32 %alt, i32 %limit) {
; CHECK-LABEL: f11:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sty %r3, -524288(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131072
%cond = icmp ult i32 %limit, 420
@@ -202,11 +191,10 @@ define void @f11(i32 *%base, i32 %alt, i32 %limit) {
define void @f12(i32 *%base, i32 %alt, i32 %limit) {
; CHECK-LABEL: f12:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, -524292
; CHECK: st %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131073
%cond = icmp ult i32 %limit, 420
@@ -220,10 +208,9 @@ define void @f12(i32 *%base, i32 %alt, i32 %limit) {
define void @f13(i64 %base, i64 %index, i32 %alt, i32 %limit) {
; CHECK-LABEL: f13:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: sty %r4, 4096(%r3,%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 4096
diff --git a/test/CodeGen/SystemZ/cond-store-04.ll b/test/CodeGen/SystemZ/cond-store-04.ll
index 7e25bb5c14a0..70124f9ecee4 100644
--- a/test/CodeGen/SystemZ/cond-store-04.ll
+++ b/test/CodeGen/SystemZ/cond-store-04.ll
@@ -8,10 +8,9 @@ declare void @foo(i64 *)
define void @f1(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f1:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stg %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i64 , i64 *%ptr
@@ -24,10 +23,9 @@ define void @f1(i64 *%ptr, i64 %alt, i32 %limit) {
define void @f2(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK-LABEL: f2:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: stg %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load i64 , i64 *%ptr
@@ -40,10 +38,9 @@ define void @f2(i64 *%ptr, i64 %alt, i32 %limit) {
define void @f3(i64 *%base, i64 %alt, i32 %limit) {
; CHECK-LABEL: f3:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stg %r3, 524280(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 65535
%cond = icmp ult i32 %limit, 420
@@ -58,11 +55,10 @@ define void @f3(i64 *%base, i64 %alt, i32 %limit) {
define void @f4(i64 *%base, i64 %alt, i32 %limit) {
; CHECK-LABEL: f4:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, 524288
; CHECK: stg %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 65536
%cond = icmp ult i32 %limit, 420
@@ -76,10 +72,9 @@ define void @f4(i64 *%base, i64 %alt, i32 %limit) {
define void @f5(i64 *%base, i64 %alt, i32 %limit) {
; CHECK-LABEL: f5:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stg %r3, -524288(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -65536
%cond = icmp ult i32 %limit, 420
@@ -94,11 +89,10 @@ define void @f5(i64 *%base, i64 %alt, i32 %limit) {
define void @f6(i64 *%base, i64 %alt, i32 %limit) {
; CHECK-LABEL: f6:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, -524296
; CHECK: stg %r3, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -65537
%cond = icmp ult i32 %limit, 420
@@ -112,10 +106,9 @@ define void @f6(i64 *%base, i64 %alt, i32 %limit) {
define void @f7(i64 %base, i64 %index, i64 %alt, i32 %limit) {
; CHECK-LABEL: f7:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stg %r4, 524287(%r3,%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 524287
diff --git a/test/CodeGen/SystemZ/cond-store-05.ll b/test/CodeGen/SystemZ/cond-store-05.ll
index 0cc068380e07..51a9f6c42ab0 100644
--- a/test/CodeGen/SystemZ/cond-store-05.ll
+++ b/test/CodeGen/SystemZ/cond-store-05.ll
@@ -8,10 +8,9 @@ declare void @foo(float *)
define void @f1(float *%ptr, float %alt, i32 %limit) {
; CHECK-LABEL: f1:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: ste %f0, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load float , float *%ptr
@@ -24,10 +23,9 @@ define void @f1(float *%ptr, float %alt, i32 %limit) {
define void @f2(float *%ptr, float %alt, i32 %limit) {
; CHECK-LABEL: f2:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: ste %f0, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load float , float *%ptr
@@ -40,10 +38,9 @@ define void @f2(float *%ptr, float %alt, i32 %limit) {
define void @f3(float *%base, float %alt, i32 %limit) {
; CHECK-LABEL: f3:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: ste %f0, 4092(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 1023
%cond = icmp ult i32 %limit, 420
@@ -57,10 +54,9 @@ define void @f3(float *%base, float %alt, i32 %limit) {
define void @f4(float *%base, float %alt, i32 %limit) {
; CHECK-LABEL: f4:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stey %f0, 4096(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 1024
%cond = icmp ult i32 %limit, 420
@@ -74,10 +70,9 @@ define void @f4(float *%base, float %alt, i32 %limit) {
define void @f5(float *%base, float %alt, i32 %limit) {
; CHECK-LABEL: f5:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stey %f0, 524284(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 131071
%cond = icmp ult i32 %limit, 420
@@ -92,11 +87,10 @@ define void @f5(float *%base, float %alt, i32 %limit) {
define void @f6(float *%base, float %alt, i32 %limit) {
; CHECK-LABEL: f6:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, 524288
; CHECK: ste %f0, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 131072
%cond = icmp ult i32 %limit, 420
@@ -110,10 +104,9 @@ define void @f6(float *%base, float %alt, i32 %limit) {
define void @f7(float *%base, float %alt, i32 %limit) {
; CHECK-LABEL: f7:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stey %f0, -524288(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 -131072
%cond = icmp ult i32 %limit, 420
@@ -128,11 +121,10 @@ define void @f7(float *%base, float %alt, i32 %limit) {
define void @f8(float *%base, float %alt, i32 %limit) {
; CHECK-LABEL: f8:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, -524292
; CHECK: ste %f0, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 -131073
%cond = icmp ult i32 %limit, 420
@@ -146,10 +138,9 @@ define void @f8(float *%base, float %alt, i32 %limit) {
define void @f9(i64 %base, i64 %index, float %alt, i32 %limit) {
; CHECK-LABEL: f9:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stey %f0, 4096(%r3,%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 4096
diff --git a/test/CodeGen/SystemZ/cond-store-06.ll b/test/CodeGen/SystemZ/cond-store-06.ll
index 01948b811504..1eac79401bd3 100644
--- a/test/CodeGen/SystemZ/cond-store-06.ll
+++ b/test/CodeGen/SystemZ/cond-store-06.ll
@@ -8,10 +8,9 @@ declare void @foo(double *)
define void @f1(double *%ptr, double %alt, i32 %limit) {
; CHECK-LABEL: f1:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: std %f0, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load double , double *%ptr
@@ -24,10 +23,9 @@ define void @f1(double *%ptr, double %alt, i32 %limit) {
define void @f2(double *%ptr, double %alt, i32 %limit) {
; CHECK-LABEL: f2:
; CHECK-NOT: %r2
-; CHECK: jhe [[LABEL:[^ ]*]]
+; CHECK: bher %r14
; CHECK-NOT: %r2
; CHECK: std %f0, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%cond = icmp ult i32 %limit, 420
%orig = load double , double *%ptr
@@ -40,10 +38,9 @@ define void @f2(double *%ptr, double %alt, i32 %limit) {
define void @f3(double *%base, double %alt, i32 %limit) {
; CHECK-LABEL: f3:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: std %f0, 4088(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 511
%cond = icmp ult i32 %limit, 420
@@ -57,10 +54,9 @@ define void @f3(double *%base, double %alt, i32 %limit) {
define void @f4(double *%base, double %alt, i32 %limit) {
; CHECK-LABEL: f4:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stdy %f0, 4096(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 512
%cond = icmp ult i32 %limit, 420
@@ -74,10 +70,9 @@ define void @f4(double *%base, double %alt, i32 %limit) {
define void @f5(double *%base, double %alt, i32 %limit) {
; CHECK-LABEL: f5:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stdy %f0, 524280(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 65535
%cond = icmp ult i32 %limit, 420
@@ -92,11 +87,10 @@ define void @f5(double *%base, double %alt, i32 %limit) {
define void @f6(double *%base, double %alt, i32 %limit) {
; CHECK-LABEL: f6:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, 524288
; CHECK: std %f0, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 65536
%cond = icmp ult i32 %limit, 420
@@ -110,10 +104,9 @@ define void @f6(double *%base, double %alt, i32 %limit) {
define void @f7(double *%base, double %alt, i32 %limit) {
; CHECK-LABEL: f7:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stdy %f0, -524288(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 -65536
%cond = icmp ult i32 %limit, 420
@@ -128,11 +121,10 @@ define void @f7(double *%base, double %alt, i32 %limit) {
define void @f8(double *%base, double %alt, i32 %limit) {
; CHECK-LABEL: f8:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: agfi %r2, -524296
; CHECK: std %f0, 0(%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double, double *%base, i64 -65537
%cond = icmp ult i32 %limit, 420
@@ -146,10 +138,9 @@ define void @f8(double *%base, double %alt, i32 %limit) {
define void @f9(i64 %base, i64 %index, double %alt, i32 %limit) {
; CHECK-LABEL: f9:
; CHECK-NOT: %r2
-; CHECK: jl [[LABEL:[^ ]*]]
+; CHECK: blr %r14
; CHECK-NOT: %r2
; CHECK: stdy %f0, 524287(%r3,%r2)
-; CHECK: [[LABEL]]:
; CHECK: br %r14
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 524287
diff --git a/test/CodeGen/SystemZ/dyn-alloca-offset.ll b/test/CodeGen/SystemZ/dyn-alloca-offset.ll
new file mode 100644
index 000000000000..b9997ac0ec9e
--- /dev/null
+++ b/test/CodeGen/SystemZ/dyn-alloca-offset.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @llvm.get.dynamic.area.offset.i64()
+
+declare void @use(i64)
+
+define void @f1() {
+; CHECK-LABEL: f1
+; CHECK: la %r2, 160
+; CHECK: brasl %r14, use
+; CHECK: br %r14
+ %tmp = alloca i64, align 32
+ %dynamic_area_offset = call i64 @llvm.get.dynamic.area.offset.i64()
+ call void @use(i64 %dynamic_area_offset)
+ ret void
+}
+
+define void @f2(i64 %arg) {
+; CHECK-LABEL: f2
+; CHECK: la %r2, 160(%r2)
+; CHECK: brasl %r14, use
+; CHECK: br %r14
+ %tmp = alloca i64, align 32
+ %dynamic_area_offset = call i64 @llvm.get.dynamic.area.offset.i64()
+ %param = add i64 %dynamic_area_offset, %arg
+ call void @use(i64 %param)
+ ret void
+}
+
+declare void @eatsalot(i64, i64, i64, i64, i64, i64)
+
+define void @f3() {
+; CHECK-LABEL: f3
+; CHECK: la %r2, 168
+; CHECK: brasl %r14, use
+; CHECK: br %r14
+ %tmp = alloca i64, align 32
+ call void @eatsalot(i64 0, i64 0, i64 0, i64 0, i64 0, i64 0)
+ %dynamic_area_offset = call i64 @llvm.get.dynamic.area.offset.i64()
+ call void @use(i64 %dynamic_area_offset)
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/fp-cmp-01.ll b/test/CodeGen/SystemZ/fp-cmp-01.ll
index ed58103e59a5..075c7aa3dd84 100644
--- a/test/CodeGen/SystemZ/fp-cmp-01.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-01.ll
@@ -9,7 +9,7 @@ declare float @foo()
define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) {
; CHECK-LABEL: f1:
; CHECK: cebr %f0, %f2
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%cond = fcmp oeq float %f1, %f2
@@ -21,7 +21,7 @@ define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) {
define i64 @f2(i64 %a, i64 %b, float %f1, float *%ptr) {
; CHECK-LABEL: f2:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f2 = load float , float *%ptr
@@ -34,7 +34,7 @@ define i64 @f2(i64 %a, i64 %b, float %f1, float *%ptr) {
define i64 @f3(i64 %a, i64 %b, float %f1, float *%base) {
; CHECK-LABEL: f3:
; CHECK: ceb %f0, 4092(%r4)
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 1023
@@ -50,7 +50,7 @@ define i64 @f4(i64 %a, i64 %b, float %f1, float *%base) {
; CHECK-LABEL: f4:
; CHECK: aghi %r4, 4096
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 1024
@@ -65,7 +65,7 @@ define i64 @f5(i64 %a, i64 %b, float %f1, float *%base) {
; CHECK-LABEL: f5:
; CHECK: aghi %r4, -4
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%ptr = getelementptr float, float *%base, i64 -1
@@ -80,7 +80,7 @@ define i64 @f6(i64 %a, i64 %b, float %f1, float *%base, i64 %index) {
; CHECK-LABEL: f6:
; CHECK: sllg %r1, %r5, 2
; CHECK: ceb %f0, 400(%r1,%r4)
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%ptr1 = getelementptr float, float *%base, i64 %index
@@ -153,7 +153,7 @@ define float @f7(float *%ptr0) {
define i64 @f8(i64 %a, i64 %b, float %f) {
; CHECK-LABEL: f8:
; CHECK: ltebr %f0, %f0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%cond = fcmp oeq float %f, 0.0
@@ -166,7 +166,7 @@ define i64 @f8(i64 %a, i64 %b, float %f) {
define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f9:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: je {{\.L.*}}
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -179,7 +179,7 @@ define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f10:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jlh {{\.L.*}}
+; CHECK-NEXT: blhr %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -192,7 +192,7 @@ define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f11:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -205,7 +205,7 @@ define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f12(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f12:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jhe {{\.L.*}}
+; CHECK-NEXT: bher %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -218,7 +218,7 @@ define i64 @f12(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f13:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jle {{\.L.*}}
+; CHECK-NEXT: bler %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -231,7 +231,7 @@ define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f14:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jl {{\.L.*}}
+; CHECK-NEXT: blr %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -244,7 +244,7 @@ define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f15:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jnlh {{\.L.*}}
+; CHECK-NEXT: bnlhr %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -257,7 +257,7 @@ define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f16:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jne {{\.L.*}}
+; CHECK-NEXT: bner %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -270,7 +270,7 @@ define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f17:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jnle {{\.L.*}}
+; CHECK-NEXT: bnler %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -283,7 +283,7 @@ define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f18:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jnl {{\.L.*}}
+; CHECK-NEXT: bnlr %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -296,7 +296,7 @@ define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f19:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jnh {{\.L.*}}
+; CHECK-NEXT: bnhr %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
@@ -309,7 +309,7 @@ define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) {
define i64 @f20(i64 %a, i64 %b, float %f2, float *%ptr) {
; CHECK-LABEL: f20:
; CHECK: ceb %f0, 0(%r4)
-; CHECK-NEXT: jnhe {{\.L.*}}
+; CHECK-NEXT: bnher %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f1 = load float , float *%ptr
diff --git a/test/CodeGen/SystemZ/fp-cmp-02.ll b/test/CodeGen/SystemZ/fp-cmp-02.ll
index 0808ddd8db48..8341f553b895 100644
--- a/test/CodeGen/SystemZ/fp-cmp-02.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-02.ll
@@ -12,7 +12,7 @@ declare double @foo()
define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) {
; CHECK-LABEL: f1:
; CHECK: cdbr %f0, %f2
-; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR-NEXT: ber %r14
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
@@ -25,7 +25,7 @@ define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) {
define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cdb %f0, 0(%r4)
-; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR-NEXT: ber %r14
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
@@ -39,7 +39,7 @@ define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) {
define i64 @f3(i64 %a, i64 %b, double %f1, double *%base) {
; CHECK-LABEL: f3:
; CHECK: cdb %f0, 4088(%r4)
-; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR-NEXT: ber %r14
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
@@ -56,7 +56,7 @@ define i64 @f4(i64 %a, i64 %b, double %f1, double *%base) {
; CHECK-LABEL: f4:
; CHECK: aghi %r4, 4096
; CHECK: cdb %f0, 0(%r4)
-; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR-NEXT: ber %r14
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
@@ -72,7 +72,7 @@ define i64 @f5(i64 %a, i64 %b, double %f1, double *%base) {
; CHECK-LABEL: f5:
; CHECK: aghi %r4, -8
; CHECK: cdb %f0, 0(%r4)
-; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR-NEXT: ber %r14
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
@@ -88,7 +88,7 @@ define i64 @f6(i64 %a, i64 %b, double %f1, double *%base, i64 %index) {
; CHECK-LABEL: f6:
; CHECK: sllg %r1, %r5, 3
; CHECK: cdb %f0, 800(%r1,%r4)
-; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR-NEXT: ber %r14
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
; CHECK: br %r14
@@ -162,7 +162,7 @@ define double @f7(double *%ptr0) {
define i64 @f8(i64 %a, i64 %b, double %f) {
; CHECK-LABEL: f8:
; CHECK-SCALAR: ltdbr %f0, %f0
-; CHECK-SCALAR-NEXT: je
+; CHECK-SCALAR-NEXT: ber %r14
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR: ltdbr %f0, %f0
; CHECK-VECTOR-NEXT: locgrne %r2, %r3
@@ -176,7 +176,7 @@ define i64 @f8(i64 %a, i64 %b, double %f) {
define i64 @f9(i64 %a, i64 %b, double %f2, double *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cdb %f0, 0(%r4)
-; CHECK-SCALAR-NEXT: jl
+; CHECK-SCALAR-NEXT: blr %r14
; CHECK-SCALAR: lgr %r2, %r3
; CHECK-VECTOR-NEXT: locgrnl %r2, %r3
; CHECK: br %r14
diff --git a/test/CodeGen/SystemZ/fp-cmp-03.ll b/test/CodeGen/SystemZ/fp-cmp-03.ll
index 862c5e9b65b8..545414005446 100644
--- a/test/CodeGen/SystemZ/fp-cmp-03.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-03.ll
@@ -10,7 +10,7 @@ define i64 @f1(i64 %a, i64 %b, fp128 *%ptr, float %f2) {
; CHECK: ld %f1, 0(%r4)
; CHECK: ld %f3, 8(%r4)
; CHECK: cxbr %f1, %f0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f2x = fpext float %f2 to fp128
@@ -26,7 +26,7 @@ define i64 @f2(i64 %a, i64 %b, fp128 *%ptr) {
; CHECK: ld %f0, 0(%r4)
; CHECK: ld %f2, 8(%r4)
; CHECK: ltxbr %f0, %f0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: lgr %r2, %r3
; CHECK: br %r14
%f = load fp128 , fp128 *%ptr
diff --git a/test/CodeGen/SystemZ/fp-cmp-04.ll b/test/CodeGen/SystemZ/fp-cmp-04.ll
index 05c6dfe7e8e4..17f10456ecb9 100644
--- a/test/CodeGen/SystemZ/fp-cmp-04.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-04.ll
@@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float %f)
define float @f1(float %a, float %b, float *%dest) {
; CHECK-LABEL: f1:
; CHECK: aebr %f0, %f2
-; CHECK-NEXT: je .L{{.*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%res = fadd float %a, %b
@@ -28,7 +28,7 @@ exit:
define float @f2(float %a, float %b, float *%dest) {
; CHECK-LABEL: f2:
; CHECK: aebr %f0, %f2
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = fadd float %a, %b
@@ -47,7 +47,7 @@ exit:
define float @f3(float %a, float %b, float *%dest) {
; CHECK-LABEL: f3:
; CHECK: aebr %f0, %f2
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%res = fadd float %a, %b
@@ -66,7 +66,7 @@ exit:
define float @f4(float %a, float %b, float *%dest) {
; CHECK-LABEL: f4:
; CHECK: aebr %f0, %f2
-; CHECK-NEXT: jnlh .L{{.*}}
+; CHECK-NEXT: bnlhr %r14
; CHECK: br %r14
entry:
%res = fadd float %a, %b
@@ -85,7 +85,7 @@ exit:
define float @f5(float %a, float %b, float *%dest) {
; CHECK-LABEL: f5:
; CHECK: seb %f0, 0(%r2)
-; CHECK-NEXT: jnhe .L{{.*}}
+; CHECK-NEXT: bnher %r14
; CHECK: br %r14
entry:
%cur = load float , float *%dest
@@ -105,7 +105,7 @@ exit:
define float @f6(float %dummy, float %a, float *%dest) {
; CHECK-LABEL: f6:
; CHECK: lpebr %f0, %f2
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%res = call float @llvm.fabs.f32(float %a)
@@ -124,7 +124,7 @@ exit:
define float @f7(float %dummy, float %a, float *%dest) {
; CHECK-LABEL: f7:
; CHECK: lnebr %f0, %f2
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%abs = call float @llvm.fabs.f32(float %a)
@@ -144,7 +144,7 @@ exit:
define float @f8(float %dummy, float %a, float *%dest) {
; CHECK-LABEL: f8:
; CHECK: lcebr %f0, %f2
-; CHECK-NEXT: jle .L{{.*}}
+; CHECK-NEXT: bler %r14
; CHECK: br %r14
entry:
%res = fsub float -0.0, %a
@@ -164,7 +164,7 @@ define float @f9(float %a, float %b, float *%dest) {
; CHECK-LABEL: f9:
; CHECK: meebr %f0, %f2
; CHECK-NEXT: ltebr %f0, %f0
-; CHECK-NEXT: jlh .L{{.*}}
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%res = fmul float %a, %b
@@ -186,7 +186,7 @@ define float @f10(float %a, float %b, float %c, float *%dest) {
; CHECK: aebr %f0, %f2
; CHECK-NEXT: debr %f0, %f4
; CHECK-NEXT: ltebr %f0, %f0
-; CHECK-NEXT: jne .L{{.*}}
+; CHECK-NEXT: bner %r14
; CHECK: br %r14
entry:
%add = fadd float %a, %b
@@ -210,7 +210,7 @@ define float @f11(float %a, float %b, float %c, float *%dest1, float *%dest2) {
; CHECK-NEXT: sebr %f4, %f0
; CHECK-NEXT: ste %f4, 0(%r2)
; CHECK-NEXT: ltebr %f0, %f0
-; CHECK-NEXT: je .L{{.*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%add = fadd float %a, %b
@@ -234,7 +234,7 @@ define float @f12(float %dummy, float %val, float *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %f0
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
call void asm sideeffect "blah $0", "{f0}"(float %val)
@@ -256,7 +256,7 @@ define double @f13(double %dummy, double %val, double *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %f0
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
call void asm sideeffect "blah $0", "{f0}"(double %val)
@@ -281,7 +281,7 @@ define void @f14(fp128 *%ptr1, fp128 *%ptr2) {
; CHECK-NEXT: mxbr
; CHECK-NEXT: std
; CHECK-NEXT: std
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val1 = load fp128 , fp128 *%ptr1
@@ -309,7 +309,7 @@ define float @f15(float %val, float %dummy, float *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %f2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
call void asm sideeffect "blah $0", "{f2}"(float %val)
@@ -332,7 +332,7 @@ define double @f16(double %val, double %dummy, double *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %f2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
call void asm sideeffect "blah $0", "{f2}"(double %val)
@@ -351,7 +351,7 @@ exit:
define float @f17(float %a, float %b, float *%dest) {
; CHECK-LABEL: f17:
; CHECK: aebr %f0, %f2
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = fadd float %a, %b
@@ -371,7 +371,7 @@ exit:
define float @f18(float %dummy, float %a, float *%dest) {
; CHECK-LABEL: f18:
; CHECK: lnebr %f0, %f2
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%abs = call float @llvm.fabs.f32(float %a)
@@ -391,7 +391,7 @@ exit:
define float @f19(float %dummy, float %a, float *%dest) {
; CHECK-LABEL: f19:
; CHECK: lcebr %f0, %f2
-; CHECK-NEXT: jle .L{{.*}}
+; CHECK-NEXT: bler %r14
; CHECK: br %r14
entry:
%res = fsub float -0.0, %a
diff --git a/test/CodeGen/SystemZ/fp-cmp-05.ll b/test/CodeGen/SystemZ/fp-cmp-05.ll
index c8eb18c6e6ba..92b5056cfbbe 100644
--- a/test/CodeGen/SystemZ/fp-cmp-05.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-05.ll
@@ -10,7 +10,7 @@
define float @f1(float %a, float %b, float %f) {
; CHECK-LABEL: f1:
; CHECK: lcebr
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
%neg = fsub float -0.0, %f
%cond = fcmp oeq float %neg, 0.0
%res = select i1 %cond, float %a, float %b
@@ -21,7 +21,7 @@ define float @f1(float %a, float %b, float %f) {
define double @f2(double %a, double %b, double %f) {
; CHECK-LABEL: f2:
; CHECK: lcdbr
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
%neg = fsub double -0.0, %f
%cond = fcmp oeq double %neg, 0.0
%res = select i1 %cond, double %a, double %b
@@ -34,7 +34,7 @@ declare float @llvm.fabs.f32(float %f)
define float @f3(float %a, float %b, float %f) {
; CHECK-LABEL: f3:
; CHECK: lnebr
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
%abs = call float @llvm.fabs.f32(float %f)
%neg = fsub float -0.0, %abs
%cond = fcmp oeq float %neg, 0.0
@@ -47,7 +47,7 @@ declare double @llvm.fabs.f64(double %f)
define double @f4(double %a, double %b, double %f) {
; CHECK-LABEL: f4:
; CHECK: lndbr
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
%abs = call double @llvm.fabs.f64(double %f)
%neg = fsub double -0.0, %abs
%cond = fcmp oeq double %neg, 0.0
@@ -60,7 +60,7 @@ define double @f4(double %a, double %b, double %f) {
define float @f5(float %a, float %b, float %f) {
; CHECK-LABEL: f5:
; CHECK: lpebr
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
%abs = call float @llvm.fabs.f32(float %f)
%cond = fcmp oeq float %abs, 0.0
%res = select i1 %cond, float %a, float %b
@@ -71,7 +71,7 @@ define float @f5(float %a, float %b, float %f) {
define double @f6(double %a, double %b, double %f) {
; CHECK-LABEL: f6:
; CHECK: lpdbr
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
%abs = call double @llvm.fabs.f64(double %f)
%cond = fcmp oeq double %abs, 0.0
%res = select i1 %cond, double %a, double %b
diff --git a/test/CodeGen/SystemZ/fp-copysign-01.ll b/test/CodeGen/SystemZ/fp-copysign-01.ll
index 57ad76fcbb2a..321027911abb 100644
--- a/test/CodeGen/SystemZ/fp-copysign-01.ll
+++ b/test/CodeGen/SystemZ/fp-copysign-01.ll
@@ -11,7 +11,7 @@ declare fp128 @copysignl(fp128, fp128) readnone
define float @f1(float %a, float %b) {
; CHECK-LABEL: f1:
; CHECK-NOT: %f2
-; CHECK: cpsdr %f0, %f0, %f2
+; CHECK: cpsdr %f0, %f2, %f0
; CHECK: br %r14
%res = call float @copysignf(float %a, float %b) readnone
ret float %res
@@ -21,7 +21,7 @@ define float @f1(float %a, float %b) {
define float @f2(float %a, double %bd) {
; CHECK-LABEL: f2:
; CHECK-NOT: %f2
-; CHECK: cpsdr %f0, %f0, %f2
+; CHECK: cpsdr %f0, %f2, %f0
; CHECK: br %r14
%b = fptrunc double %bd to float
%res = call float @copysignf(float %a, float %b) readnone
@@ -33,7 +33,7 @@ define float @f3(float %a, fp128 *%bptr) {
; CHECK-LABEL: f3:
; CHECK: ld [[BHIGH:%f[0-7]]], 0(%r2)
; CHECK: ld [[BLOW:%f[0-7]]], 8(%r2)
-; CHECK: cpsdr %f0, %f0, [[BHIGH]]
+; CHECK: cpsdr %f0, [[BHIGH]], %f0
; CHECK: br %r14
%bl = load volatile fp128 , fp128 *%bptr
%b = fptrunc fp128 %bl to float
@@ -45,7 +45,7 @@ define float @f3(float %a, fp128 *%bptr) {
define double @f4(double %a, float %bf) {
; CHECK-LABEL: f4:
; CHECK-NOT: %f2
-; CHECK: cpsdr %f0, %f0, %f2
+; CHECK: cpsdr %f0, %f2, %f0
; CHECK: br %r14
%b = fpext float %bf to double
%res = call double @copysign(double %a, double %b) readnone
@@ -56,7 +56,7 @@ define double @f4(double %a, float %bf) {
define double @f5(double %a, double %b) {
; CHECK-LABEL: f5:
; CHECK-NOT: %f2
-; CHECK: cpsdr %f0, %f0, %f2
+; CHECK: cpsdr %f0, %f2, %f0
; CHECK: br %r14
%res = call double @copysign(double %a, double %b) readnone
ret double %res
@@ -67,7 +67,7 @@ define double @f6(double %a, fp128 *%bptr) {
; CHECK-LABEL: f6:
; CHECK: ld [[BHIGH:%f[0-7]]], 0(%r2)
; CHECK: ld [[BLOW:%f[0-7]]], 8(%r2)
-; CHECK: cpsdr %f0, %f0, [[BHIGH]]
+; CHECK: cpsdr %f0, [[BHIGH]], %f0
; CHECK: br %r14
%bl = load volatile fp128 , fp128 *%bptr
%b = fptrunc fp128 %bl to double
@@ -82,7 +82,7 @@ define void @f7(fp128 *%cptr, fp128 *%aptr, float %bf) {
; CHECK-LABEL: f7:
; CHECK: ld [[AHIGH:%f[0-7]]], 0(%r3)
; CHECK: ld [[ALOW:%f[0-7]]], 8(%r3)
-; CHECK: cpsdr [[AHIGH]], [[AHIGH]], %f0
+; CHECK: cpsdr [[AHIGH]], %f0, [[AHIGH]]
; CHECK: std [[AHIGH]], 0(%r2)
; CHECK: std [[ALOW]], 8(%r2)
; CHECK: br %r14
@@ -98,7 +98,7 @@ define void @f8(fp128 *%cptr, fp128 *%aptr, double %bd) {
; CHECK-LABEL: f8:
; CHECK: ld [[AHIGH:%f[0-7]]], 0(%r3)
; CHECK: ld [[ALOW:%f[0-7]]], 8(%r3)
-; CHECK: cpsdr [[AHIGH]], [[AHIGH]], %f0
+; CHECK: cpsdr [[AHIGH]], %f0, [[AHIGH]]
; CHECK: std [[AHIGH]], 0(%r2)
; CHECK: std [[ALOW]], 8(%r2)
; CHECK: br %r14
@@ -116,7 +116,7 @@ define void @f9(fp128 *%cptr, fp128 *%aptr, fp128 *%bptr) {
; CHECK: ld [[AHIGH:%f[0-7]]], 0(%r3)
; CHECK: ld [[ALOW:%f[0-7]]], 8(%r3)
; CHECK: ld [[BHIGH:%f[0-7]]], 0(%r4)
-; CHECK: cpsdr [[AHIGH]], [[AHIGH]], [[BHIGH]]
+; CHECK: cpsdr [[AHIGH]], [[BHIGH]], [[AHIGH]]
; CHECK: std [[AHIGH]], 0(%r2)
; CHECK: std [[ALOW]], 8(%r2)
; CHECK: br %r14
diff --git a/test/CodeGen/SystemZ/fp-move-01.ll b/test/CodeGen/SystemZ/fp-move-01.ll
index 843b1b6a6e64..55c09e5d7796 100644
--- a/test/CodeGen/SystemZ/fp-move-01.ll
+++ b/test/CodeGen/SystemZ/fp-move-01.ll
@@ -1,7 +1,6 @@
; Test moves between FPRs.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s

; Test f32 moves.
define float @f1(float %a, float %b) {
diff --git a/test/CodeGen/SystemZ/fp-move-09.ll b/test/CodeGen/SystemZ/fp-move-09.ll
index 5e8dce272c23..6cc92eebd2a3 100644
--- a/test/CodeGen/SystemZ/fp-move-09.ll
+++ b/test/CodeGen/SystemZ/fp-move-09.ll
@@ -32,7 +32,8 @@ define void @f2(float %val, i8 *%ptr) {
; Like f2, but with a conditional store.
define void @f3(float %val, i8 *%ptr, i32 %which) {
; CHECK-LABEL: f3:
-; CHECK: cijlh %r3, 0,
+; CHECK: ciblh %r3, 0, 0(%r14)
+
; CHECK: lgdr [[REG:%r[0-5]]], %f0
; CHECK: stch [[REG]], 0(%r2)
; CHECK: br %r14
@@ -48,7 +49,7 @@ define void @f3(float %val, i8 *%ptr, i32 %which) {
; ...and again with 16-bit memory.
define void @f4(float %val, i16 *%ptr, i32 %which) {
; CHECK-LABEL: f4:
-; CHECK: cijlh %r3, 0,
+; CHECK: ciblh %r3, 0, 0(%r14)
; CHECK: lgdr [[REG:%r[0-5]]], %f0
; CHECK: sthh [[REG]], 0(%r2)
; CHECK: br %r14
diff --git a/test/CodeGen/SystemZ/fp-move-10.ll b/test/CodeGen/SystemZ/fp-move-10.ll
index 602397d58a8d..b7e64042d10a 100644
--- a/test/CodeGen/SystemZ/fp-move-10.ll
+++ b/test/CodeGen/SystemZ/fp-move-10.ll
@@ -31,7 +31,7 @@ define void @f2(float %val, i8 *%ptr) {
; Like f2, but with a conditional store.
define void @f3(float %val, i8 *%ptr, i32 %which) {
; CHECK-LABEL: f3:
-; CHECK-DAG: cijlh %r3, 0,
+; CHECK-DAG: ciblh %r3, 0, 0(%r14)
; CHECK-DAG: vlgvf [[REG:%r[0-5]]], %v0, 0
; CHECK: stc [[REG]], 0(%r2)
; CHECK: br %r14
@@ -47,7 +47,7 @@ define void @f3(float %val, i8 *%ptr, i32 %which) {
; ...and again with 16-bit memory.
define void @f4(float %val, i16 *%ptr, i32 %which) {
; CHECK-LABEL: f4:
-; CHECK-DAG: cijlh %r3, 0,
+; CHECK-DAG: ciblh %r3, 0, 0(%r14)
; CHECK-DAG: vlgvf [[REG:%r[0-5]]], %v0, 0
; CHECK: sth [[REG]], 0(%r2)
; CHECK: br %r14
diff --git a/test/CodeGen/SystemZ/fp-move-12.ll b/test/CodeGen/SystemZ/fp-move-12.ll
new file mode 100644
index 000000000000..131f7c374ca2
--- /dev/null
+++ b/test/CodeGen/SystemZ/fp-move-12.ll
@@ -0,0 +1,33 @@
+; Test moves between FPRs on z13.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test that we use LDR instead of LER.
+define float @f1(float %a, float %b) {
+; CHECK-LABEL: f1:
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ ret float %b
+}
+
+; Test f64 moves.
+define double @f2(double %a, double %b) {
+; CHECK-LABEL: f2:
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ ret double %b
+}
+
+; Test f128 moves. Since f128s are passed by reference, we need to force
+; a copy by other means.
+define void @f3(fp128 *%x) {
+; CHECK-LABEL: f3:
+; CHECK: lxr
+; CHECK: axbr
+; CHECK: br %r14
+ %val = load volatile fp128 , fp128 *%x
+ %sum = fadd fp128 %val, %val
+ store volatile fp128 %sum, fp128 *%x
+ store volatile fp128 %val, fp128 *%x
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/fp-sqrt-01.ll b/test/CodeGen/SystemZ/fp-sqrt-01.ll
index e8bf65bdc981..3680207e7f20 100644
--- a/test/CodeGen/SystemZ/fp-sqrt-01.ll
+++ b/test/CodeGen/SystemZ/fp-sqrt-01.ll
@@ -159,9 +159,7 @@ define float @f8(float %dummy, float %val) {
; CHECK-LABEL: f8:
; CHECK: sqebr %f0, %f2
; CHECK: cebr %f0, %f0
-; CHECK: jo [[LABEL:\.L.*]]
-; CHECK: br %r14
-; CHECK: [[LABEL]]:
+; CHECK: bnor %r14
; CHECK: ler %f0, %f2
; CHECK: jg sqrtf@PLT
%res = tail call float @sqrtf(float %val)
diff --git a/test/CodeGen/SystemZ/fp-sqrt-02.ll b/test/CodeGen/SystemZ/fp-sqrt-02.ll
index a162466064e8..a72629443f6d 100644
--- a/test/CodeGen/SystemZ/fp-sqrt-02.ll
+++ b/test/CodeGen/SystemZ/fp-sqrt-02.ll
@@ -161,9 +161,7 @@ define double @f8(double %dummy, double %val) {
; CHECK-LABEL: f8:
; CHECK: sqdbr %f0, %f2
; CHECK: cdbr %f0, %f0
-; CHECK: jo [[LABEL:\.L.*]]
-; CHECK: br %r14
-; CHECK: [[LABEL]]:
+; CHECK: bnor %r14
; CHECK: ldr %f0, %f2
; CHECK: jg sqrt@PLT
%res = tail call double @sqrt(double %val)
diff --git a/test/CodeGen/SystemZ/frameaddr-01.ll b/test/CodeGen/SystemZ/frameaddr-01.ll
new file mode 100644
index 000000000000..4dfdf308e8a6
--- /dev/null
+++ b/test/CodeGen/SystemZ/frameaddr-01.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; The current function's frame address is the address of
+; the optional back chain slot.
+define i8* @fp0() nounwind {
+entry:
+; CHECK-LABEL: fp0:
+; CHECK: la %r2, 0(%r15)
+; CHECK: br %r14
+ %0 = tail call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
+}
+
+; Check that the frame address is correct in the presence
+; of a stack frame.
+define i8* @fp0f() nounwind {
+entry:
+; CHECK-LABEL: fp0f:
+; CHECK: aghi %r15, -168
+; CHECK: la %r2, 168(%r15)
+; CHECK: aghi %r15, 168
+; CHECK: br %r14
+ %0 = alloca i64, align 8
+ %1 = tail call i8* @llvm.frameaddress(i32 0)
+ ret i8* %1
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/SystemZ/htm-intrinsics.ll b/test/CodeGen/SystemZ/htm-intrinsics.ll
index 6441ef94b406..107059f5cd83 100644
--- a/test/CodeGen/SystemZ/htm-intrinsics.ll
+++ b/test/CodeGen/SystemZ/htm-intrinsics.ll
@@ -67,7 +67,7 @@ define void @test_tbegin_nofloat3(i32 *%ptr) {
; CHECK-NOT: stmg
; CHECK-NOT: std
; CHECK: tbegin 0, 65292
-; CHECK: jnh {{\.L*}}
+; CHECK: bnhr %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%res = call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292)
@@ -90,7 +90,7 @@ define i32 @test_tbegin_nofloat4(i32 %pad, i32 *%ptr) {
; CHECK: tbegin 0, 65292
; CHECK: ipm %r2
; CHECK: srl %r2, 28
-; CHECK: cijlh %r2, 2, {{\.L*}}
+; CHECK: ciblh %r2, 2, 0(%r14)
; CHECK: mvhi 0(%r3), 0
; CHECK: br %r14
%res = call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292)
@@ -198,7 +198,7 @@ define i32 @test_tend1() {
define void @test_tend3(i32 *%ptr) {
; CHECK-LABEL: test_tend3:
; CHECK: tend
-; CHECK: je {{\.L*}}
+; CHECK: ber %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%res = call i32 @llvm.s390.tend()
@@ -219,7 +219,7 @@ define i32 @test_tend2(i32 %pad, i32 *%ptr) {
; CHECK: tend
; CHECK: ipm %r2
; CHECK: srl %r2, 28
-; CHECK: cijlh %r2, 2, {{\.L*}}
+; CHECK: ciblh %r2, 2, 0(%r14)
; CHECK: mvhi 0(%r3), 0
; CHECK: br %r14
%res = call i32 @llvm.s390.tend()
diff --git a/test/CodeGen/SystemZ/int-cmp-01.ll b/test/CodeGen/SystemZ/int-cmp-01.ll
index 97b697db3bdb..12060b157fa3 100644
--- a/test/CodeGen/SystemZ/int-cmp-01.ll
+++ b/test/CodeGen/SystemZ/int-cmp-01.ll
@@ -154,7 +154,7 @@ define void @f10(i32 %lhs, i64 %base, i64 %index, i32 *%dst) {
define double @f11(double %a, double %b, i32 %rhs, i16 *%src) {
; CHECK-LABEL: f11:
; CHECK: ch %r2, 0(%r3)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%half = load i16 , i16 *%src
diff --git a/test/CodeGen/SystemZ/int-cmp-02.ll b/test/CodeGen/SystemZ/int-cmp-02.ll
index d5aef0f0f977..d3cd7275ec58 100644
--- a/test/CodeGen/SystemZ/int-cmp-02.ll
+++ b/test/CodeGen/SystemZ/int-cmp-02.ll
@@ -7,7 +7,7 @@ declare i32 @foo()
; Check register comparison.
define double @f1(double %a, double %b, i32 %i1, i32 %i2) {
; CHECK-LABEL: f1:
-; CHECK: crjl %r2, %r3
+; CHECK: crbl %r2, %r3, 0(%r14)
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, %i2
@@ -19,7 +19,7 @@ define double @f1(double %a, double %b, i32 %i1, i32 %i2) {
define double @f2(double %a, double %b, i32 %i1, i32 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: c %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = load i32 , i32 *%ptr
@@ -32,7 +32,7 @@ define double @f2(double %a, double %b, i32 %i1, i32 *%ptr) {
define double @f3(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f3:
; CHECK: c %r2, 4092(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1023
@@ -46,7 +46,7 @@ define double @f3(double %a, double %b, i32 %i1, i32 *%base) {
define double @f4(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f4:
; CHECK: cy %r2, 4096(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1024
@@ -60,7 +60,7 @@ define double @f4(double %a, double %b, i32 %i1, i32 *%base) {
define double @f5(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f5:
; CHECK: cy %r2, 524284(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131071
@@ -76,7 +76,7 @@ define double @f6(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f6:
; CHECK: agfi %r3, 524288
; CHECK: c %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131072
@@ -90,7 +90,7 @@ define double @f6(double %a, double %b, i32 %i1, i32 *%base) {
define double @f7(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f7:
; CHECK: cy %r2, -4(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -1
@@ -104,7 +104,7 @@ define double @f7(double %a, double %b, i32 %i1, i32 *%base) {
define double @f8(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f8:
; CHECK: cy %r2, -524288(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131072
@@ -120,7 +120,7 @@ define double @f9(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f9:
; CHECK: agfi %r3, -524292
; CHECK: c %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131073
@@ -134,7 +134,7 @@ define double @f9(double %a, double %b, i32 %i1, i32 *%base) {
define double @f10(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
; CHECK-LABEL: f10:
; CHECK: c %r2, 4092({{%r4,%r3|%r3,%r4}})
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add1 = add i64 %base, %index
@@ -150,7 +150,7 @@ define double @f10(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
define double @f11(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
; CHECK-LABEL: f11:
; CHECK: cy %r2, 4096({{%r4,%r3|%r3,%r4}})
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add1 = add i64 %base, %index
@@ -186,7 +186,7 @@ while.end:
define double @f13(double %a, double %b, i32 %i2, i32 *%ptr) {
; CHECK-LABEL: f13:
; CHECK: c %r2, 0(%r3)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i1 = load i32 , i32 *%ptr
diff --git a/test/CodeGen/SystemZ/int-cmp-03.ll b/test/CodeGen/SystemZ/int-cmp-03.ll
index 0246666f06fe..7a0007e67248 100644
--- a/test/CodeGen/SystemZ/int-cmp-03.ll
+++ b/test/CodeGen/SystemZ/int-cmp-03.ll
@@ -5,7 +5,7 @@
; Check register comparison.
define double @f1(double %a, double %b, i32 %i1, i32 %i2) {
; CHECK-LABEL: f1:
-; CHECK: clrjl %r2, %r3
+; CHECK: clrbl %r2, %r3, 0(%r14)
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i32 %i1, %i2
@@ -17,7 +17,7 @@ define double @f1(double %a, double %b, i32 %i1, i32 %i2) {
define double @f2(double %a, double %b, i32 %i1, i32 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cl %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = load i32 , i32 *%ptr
@@ -30,7 +30,7 @@ define double @f2(double %a, double %b, i32 %i1, i32 *%ptr) {
define double @f3(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f3:
; CHECK: cl %r2, 4092(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1023
@@ -44,7 +44,7 @@ define double @f3(double %a, double %b, i32 %i1, i32 *%base) {
define double @f4(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f4:
; CHECK: cly %r2, 4096(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1024
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i32 %i1, i32 *%base) {
define double @f5(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f5:
; CHECK: cly %r2, 524284(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131071
@@ -74,7 +74,7 @@ define double @f6(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f6:
; CHECK: agfi %r3, 524288
; CHECK: cl %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131072
@@ -88,7 +88,7 @@ define double @f6(double %a, double %b, i32 %i1, i32 *%base) {
define double @f7(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f7:
; CHECK: cly %r2, -4(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -1
@@ -102,7 +102,7 @@ define double @f7(double %a, double %b, i32 %i1, i32 *%base) {
define double @f8(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f8:
; CHECK: cly %r2, -524288(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131072
@@ -118,7 +118,7 @@ define double @f9(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f9:
; CHECK: agfi %r3, -524292
; CHECK: cl %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131073
@@ -132,7 +132,7 @@ define double @f9(double %a, double %b, i32 %i1, i32 *%base) {
define double @f10(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
; CHECK-LABEL: f10:
; CHECK: cl %r2, 4092({{%r4,%r3|%r3,%r4}})
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add1 = add i64 %base, %index
@@ -148,7 +148,7 @@ define double @f10(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
define double @f11(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
; CHECK-LABEL: f11:
; CHECK: cly %r2, 4096({{%r4,%r3|%r3,%r4}})
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add1 = add i64 %base, %index
@@ -164,7 +164,7 @@ define double @f11(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
define double @f12(double %a, double %b, i32 %i2, i32 *%ptr) {
; CHECK-LABEL: f12:
; CHECK: cl %r2, 0(%r3)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i1 = load i32 , i32 *%ptr
diff --git a/test/CodeGen/SystemZ/int-cmp-04.ll b/test/CodeGen/SystemZ/int-cmp-04.ll
index 90f05ea38680..8f3c8031c085 100644
--- a/test/CodeGen/SystemZ/int-cmp-04.ll
+++ b/test/CodeGen/SystemZ/int-cmp-04.ll
@@ -110,7 +110,7 @@ define void @f7(i64 %lhs, i64 %base, i64 %index, i64 *%dst) {
define double @f8(double %a, double %b, i64 %rhs, i16 *%src) {
; CHECK-LABEL: f8:
; CHECK: cgh %r2, 0(%r3)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%half = load i16 , i16 *%src
diff --git a/test/CodeGen/SystemZ/int-cmp-05.ll b/test/CodeGen/SystemZ/int-cmp-05.ll
index 70640b607bcd..679dcc8985a3 100644
--- a/test/CodeGen/SystemZ/int-cmp-05.ll
+++ b/test/CodeGen/SystemZ/int-cmp-05.ll
@@ -8,7 +8,7 @@ declare i64 @foo()
define double @f1(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f1:
; CHECK: cgfr %r2, %r3
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = sext i32 %unext to i64
@@ -32,7 +32,7 @@ define double @f2(double %a, double %b, i64 %i1, i32 %unext) {
define double @f3(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f3:
; CHECK: cgfr %r2, %r3
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = sext i32 %unext to i64
@@ -45,7 +45,7 @@ define double @f3(double %a, double %b, i64 %i1, i32 %unext) {
define double @f4(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f4:
; CHECK: cgfr %r2, %r3
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = sext i32 %unext to i64
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i64 %i1, i32 %unext) {
define double @f5(double %a, double %b, i64 %i1, i32 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: cgf %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%unext = load i32 , i32 *%ptr
@@ -84,7 +84,7 @@ define double @f6(double %a, double %b, i64 %i1, i32 *%ptr) {
define double @f7(double %a, double %b, i64 %i1, i32 *%ptr) {
; CHECK-LABEL: f7:
; CHECK: cgf %r2, 0(%r3)
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%unext = load i32 , i32 *%ptr
@@ -98,7 +98,7 @@ define double @f7(double %a, double %b, i64 %i1, i32 *%ptr) {
define double @f8(double %a, double %b, i64 %i1, i32 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cgf %r2, 0(%r3)
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%unext = load i32 , i32 *%ptr
@@ -112,7 +112,7 @@ define double @f8(double %a, double %b, i64 %i1, i32 *%ptr) {
define double @f9(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f9:
; CHECK: cgf %r2, 524284(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131071
@@ -129,7 +129,7 @@ define double @f10(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f10:
; CHECK: agfi %r3, 524288
; CHECK: cgf %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131072
@@ -144,7 +144,7 @@ define double @f10(double %a, double %b, i64 %i1, i32 *%base) {
define double @f11(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f11:
; CHECK: cgf %r2, -4(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -1
@@ -159,7 +159,7 @@ define double @f11(double %a, double %b, i64 %i1, i32 *%base) {
define double @f12(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f12:
; CHECK: cgf %r2, -524288(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131072
@@ -176,7 +176,7 @@ define double @f13(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f13:
; CHECK: agfi %r3, -524292
; CHECK: cgf %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131073
@@ -191,7 +191,7 @@ define double @f13(double %a, double %b, i64 %i1, i32 *%base) {
define double @f14(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
; CHECK-LABEL: f14:
; CHECK: cgf %r2, 524284({{%r4,%r3|%r3,%r4}})
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add1 = add i64 %base, %index
@@ -295,7 +295,7 @@ define i64 @f15(i32 *%ptr0) {
define double @f16(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f16:
; CHECK: cgfr %r2, %r3
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = sext i32 %unext to i64
@@ -308,7 +308,7 @@ define double @f16(double %a, double %b, i64 %i1, i32 %unext) {
define double @f17(double %a, double %b, i64 %i2, i32 *%ptr) {
; CHECK-LABEL: f17:
; CHECK: cgf %r2, 0(%r3)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%unext = load i32 , i32 *%ptr
diff --git a/test/CodeGen/SystemZ/int-cmp-06.ll b/test/CodeGen/SystemZ/int-cmp-06.ll
index 16c2ade83553..7b6a9aec6287 100644
--- a/test/CodeGen/SystemZ/int-cmp-06.ll
+++ b/test/CodeGen/SystemZ/int-cmp-06.ll
@@ -8,7 +8,7 @@ declare i64 @foo()
define double @f1(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f1:
; CHECK: clgfr %r2, %r3
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = zext i32 %unext to i64
@@ -21,7 +21,7 @@ define double @f1(double %a, double %b, i64 %i1, i32 %unext) {
define double @f2(double %a, double %b, i64 %i1, i64 %unext) {
; CHECK-LABEL: f2:
; CHECK: clgfr %r2, %r3
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = and i64 %unext, 4294967295
@@ -56,7 +56,7 @@ define double @f4(double %a, double %b, i64 %i1, i64 %unext) {
define double @f5(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f5:
; CHECK: clgfr %r2, %r3
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = zext i32 %unext to i64
@@ -69,7 +69,7 @@ define double @f5(double %a, double %b, i64 %i1, i32 %unext) {
define double @f6(double %a, double %b, i64 %i1, i64 %unext) {
; CHECK-LABEL: f6:
; CHECK: clgfr %r2, %r3
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = and i64 %unext, 4294967295
@@ -82,7 +82,7 @@ define double @f6(double %a, double %b, i64 %i1, i64 %unext) {
define double @f7(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f7:
; CHECK: clgfr %r2, %r3
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = zext i32 %unext to i64
@@ -95,7 +95,7 @@ define double @f7(double %a, double %b, i64 %i1, i32 %unext) {
define double @f8(double %a, double %b, i64 %i1, i64 %unext) {
; CHECK-LABEL: f8:
; CHECK: clgfr %r2, %r3
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = and i64 %unext, 4294967295
@@ -108,7 +108,7 @@ define double @f8(double %a, double %b, i64 %i1, i64 %unext) {
define double @f9(double %a, double %b, i64 %i1, i32 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: clgf %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%unext = load i32 , i32 *%ptr
@@ -134,7 +134,7 @@ define double @f10(double %a, double %b, i64 %i1, i32 *%ptr) {
define double @f11(double %a, double %b, i64 %i1, i32 *%ptr) {
; CHECK-LABEL: f11:
; CHECK: clgf %r2, 0(%r3)
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%unext = load i32 , i32 *%ptr
@@ -148,7 +148,7 @@ define double @f11(double %a, double %b, i64 %i1, i32 *%ptr) {
define double @f12(double %a, double %b, i64 %i1, i32 *%ptr) {
; CHECK-LABEL: f12:
; CHECK: clgf %r2, 0(%r3)
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%unext = load i32 , i32 *%ptr
@@ -162,7 +162,7 @@ define double @f12(double %a, double %b, i64 %i1, i32 *%ptr) {
define double @f13(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f13:
; CHECK: clgf %r2, 524284(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131071
@@ -179,7 +179,7 @@ define double @f14(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f14:
; CHECK: agfi %r3, 524288
; CHECK: clgf %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 131072
@@ -194,7 +194,7 @@ define double @f14(double %a, double %b, i64 %i1, i32 *%base) {
define double @f15(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f15:
; CHECK: clgf %r2, -4(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -1
@@ -209,7 +209,7 @@ define double @f15(double %a, double %b, i64 %i1, i32 *%base) {
define double @f16(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f16:
; CHECK: clgf %r2, -524288(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131072
@@ -226,7 +226,7 @@ define double @f17(double %a, double %b, i64 %i1, i32 *%base) {
; CHECK-LABEL: f17:
; CHECK: agfi %r3, -524292
; CHECK: clgf %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -131073
@@ -241,7 +241,7 @@ define double @f17(double %a, double %b, i64 %i1, i32 *%base) {
define double @f18(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
; CHECK-LABEL: f18:
; CHECK: clgf %r2, 524284({{%r4,%r3|%r3,%r4}})
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add1 = add i64 %base, %index
@@ -345,7 +345,7 @@ define i64 @f19(i32 *%ptr0) {
define double @f20(double %a, double %b, i64 %i1, i32 %unext) {
; CHECK-LABEL: f20:
; CHECK: clgfr %r2, %r3
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = zext i32 %unext to i64
@@ -358,7 +358,7 @@ define double @f20(double %a, double %b, i64 %i1, i32 %unext) {
define double @f21(double %a, double %b, i64 %i1, i64 %unext) {
; CHECK-LABEL: f21:
; CHECK: clgfr %r2, %r3
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = and i64 %unext, 4294967295
@@ -371,7 +371,7 @@ define double @f21(double %a, double %b, i64 %i1, i64 %unext) {
define double @f22(double %a, double %b, i64 %i2, i32 *%ptr) {
; CHECK-LABEL: f22:
; CHECK: clgf %r2, 0(%r3)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%unext = load i32 , i32 *%ptr
diff --git a/test/CodeGen/SystemZ/int-cmp-07.ll b/test/CodeGen/SystemZ/int-cmp-07.ll
index 0a787c9ea01d..8611662190c5 100644
--- a/test/CodeGen/SystemZ/int-cmp-07.ll
+++ b/test/CodeGen/SystemZ/int-cmp-07.ll
@@ -5,7 +5,7 @@
; Check CGR.
define double @f1(double %a, double %b, i64 %i1, i64 %i2) {
; CHECK-LABEL: f1:
-; CHECK: cgrjl %r2, %r3
+; CHECK: cgrbl %r2, %r3, 0(%r14)
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, %i2
@@ -17,7 +17,7 @@ define double @f1(double %a, double %b, i64 %i1, i64 %i2) {
define double @f2(double %a, double %b, i64 %i1, i64 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cg %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = load i64 , i64 *%ptr
@@ -30,7 +30,7 @@ define double @f2(double %a, double %b, i64 %i1, i64 *%ptr) {
define double @f3(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f3:
; CHECK: cg %r2, 524280(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 65535
@@ -46,7 +46,7 @@ define double @f4(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f4:
; CHECK: agfi %r3, 524288
; CHECK: cg %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 65536
@@ -60,7 +60,7 @@ define double @f4(double %a, double %b, i64 %i1, i64 *%base) {
define double @f5(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f5:
; CHECK: cg %r2, -8(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -1
@@ -74,7 +74,7 @@ define double @f5(double %a, double %b, i64 %i1, i64 *%base) {
define double @f6(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f6:
; CHECK: cg %r2, -524288(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -65536
@@ -90,7 +90,7 @@ define double @f7(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f7:
; CHECK: agfi %r3, -524296
; CHECK: cg %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -65537
@@ -104,7 +104,7 @@ define double @f7(double %a, double %b, i64 %i1, i64 *%base) {
define double @f8(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
; CHECK-LABEL: f8:
; CHECK: cg %r2, 524280({{%r4,%r3|%r3,%r4}})
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add1 = add i64 %base, %index
@@ -120,7 +120,7 @@ define double @f8(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
define double @f9(double %a, double %b, i64 %i2, i64 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cg %r2, 0(%r3)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i1 = load i64 , i64 *%ptr
diff --git a/test/CodeGen/SystemZ/int-cmp-08.ll b/test/CodeGen/SystemZ/int-cmp-08.ll
index 384b41b549b9..fc60993df0e8 100644
--- a/test/CodeGen/SystemZ/int-cmp-08.ll
+++ b/test/CodeGen/SystemZ/int-cmp-08.ll
@@ -5,7 +5,7 @@
; Check CLGR.
define double @f1(double %a, double %b, i64 %i1, i64 %i2) {
; CHECK-LABEL: f1:
-; CHECK: clgrjl %r2, %r3
+; CHECK: clgrbl %r2, %r3, 0(%r14)
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i64 %i1, %i2
@@ -17,7 +17,7 @@ define double @f1(double %a, double %b, i64 %i1, i64 %i2) {
define double @f2(double %a, double %b, i64 %i1, i64 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clg %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i2 = load i64 , i64 *%ptr
@@ -30,7 +30,7 @@ define double @f2(double %a, double %b, i64 %i1, i64 *%ptr) {
define double @f3(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f3:
; CHECK: clg %r2, 524280(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 65535
@@ -46,7 +46,7 @@ define double @f4(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f4:
; CHECK: agfi %r3, 524288
; CHECK: clg %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 65536
@@ -60,7 +60,7 @@ define double @f4(double %a, double %b, i64 %i1, i64 *%base) {
define double @f5(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f5:
; CHECK: clg %r2, -8(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -1
@@ -74,7 +74,7 @@ define double @f5(double %a, double %b, i64 %i1, i64 *%base) {
define double @f6(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f6:
; CHECK: clg %r2, -524288(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -65536
@@ -90,7 +90,7 @@ define double @f7(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f7:
; CHECK: agfi %r3, -524296
; CHECK: clg %r2, 0(%r3)
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -65537
@@ -104,7 +104,7 @@ define double @f7(double %a, double %b, i64 %i1, i64 *%base) {
define double @f8(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
; CHECK-LABEL: f8:
; CHECK: clg %r2, 524280({{%r4,%r3|%r3,%r4}})
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add1 = add i64 %base, %index
@@ -120,7 +120,7 @@ define double @f8(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
define double @f9(double %a, double %b, i64 %i2, i64 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: clg %r2, 0(%r3)
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%i1 = load i64 , i64 *%ptr
diff --git a/test/CodeGen/SystemZ/int-cmp-09.ll b/test/CodeGen/SystemZ/int-cmp-09.ll
index 0eb8c6688c0c..cd0ace2a9a94 100644
--- a/test/CodeGen/SystemZ/int-cmp-09.ll
+++ b/test/CodeGen/SystemZ/int-cmp-09.ll
@@ -9,7 +9,8 @@ define double @f1(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, 0
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -20,7 +21,8 @@ define double @f2(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, 2
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -31,7 +33,8 @@ define double @f3(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, 127
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -43,7 +46,8 @@ define double @f4(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, 128
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -55,7 +59,8 @@ define double @f5(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, 32767
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -67,7 +72,8 @@ define double @f6(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, 32768
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -79,7 +85,8 @@ define double @f7(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i32 %i1, 2147483647
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -91,7 +98,8 @@ define double @f8(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i32 %i1, 2147483648
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -102,7 +110,8 @@ define double @f9(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, -1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -113,7 +122,8 @@ define double @f10(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, -128
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -125,7 +135,8 @@ define double @f11(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, -129
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -137,7 +148,8 @@ define double @f12(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, -32768
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -149,7 +161,8 @@ define double @f13(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, -32769
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -161,7 +174,8 @@ define double @f14(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i32 %i1, -2147483648
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -173,7 +187,8 @@ define double @f15(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i32 %i1, -2147483649
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -184,7 +199,8 @@ define double @f16(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i32 %i1, 1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -195,7 +211,8 @@ define double @f17(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp sge i32 %i1, 1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -206,7 +223,8 @@ define double @f18(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp sgt i32 %i1, -1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -217,6 +235,7 @@ define double @f19(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp sle i32 %i1, -1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
diff --git a/test/CodeGen/SystemZ/int-cmp-10.ll b/test/CodeGen/SystemZ/int-cmp-10.ll
index 4d4c4bbd20d1..e2a0c1aa6948 100644
--- a/test/CodeGen/SystemZ/int-cmp-10.ll
+++ b/test/CodeGen/SystemZ/int-cmp-10.ll
@@ -10,7 +10,8 @@ define double @f1(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ugt i32 %i1, 1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -21,7 +22,8 @@ define double @f2(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i32 %i1, 255
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -33,7 +35,8 @@ define double @f3(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i32 %i1, 256
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -45,6 +48,7 @@ define double @f4(double %a, double %b, i32 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i32 %i1, 4294967280
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
diff --git a/test/CodeGen/SystemZ/int-cmp-11.ll b/test/CodeGen/SystemZ/int-cmp-11.ll
index c74135a5d393..8fd9d8c3d479 100644
--- a/test/CodeGen/SystemZ/int-cmp-11.ll
+++ b/test/CodeGen/SystemZ/int-cmp-11.ll
@@ -9,7 +9,8 @@ define double @f1(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, 0
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -20,7 +21,8 @@ define double @f2(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, 1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -31,7 +33,8 @@ define double @f3(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, 127
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -43,7 +46,8 @@ define double @f4(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, 128
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -55,7 +59,8 @@ define double @f5(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, 32767
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -67,7 +72,8 @@ define double @f6(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, 32768
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -79,7 +85,8 @@ define double @f7(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, 2147483647
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -90,7 +97,8 @@ define double @f8(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, 2147483648
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -101,7 +109,8 @@ define double @f9(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, -1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -112,7 +121,8 @@ define double @f10(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, -128
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -124,7 +134,8 @@ define double @f11(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, -129
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -136,7 +147,8 @@ define double @f12(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, -32768
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -148,7 +160,8 @@ define double @f13(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, -32769
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -160,7 +173,8 @@ define double @f14(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, -2147483648
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -171,6 +185,7 @@ define double @f15(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp slt i64 %i1, -2147483649
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
diff --git a/test/CodeGen/SystemZ/int-cmp-12.ll b/test/CodeGen/SystemZ/int-cmp-12.ll
index d9c6a9fc4efc..3d5b5749aea8 100644
--- a/test/CodeGen/SystemZ/int-cmp-12.ll
+++ b/test/CodeGen/SystemZ/int-cmp-12.ll
@@ -10,7 +10,8 @@ define double @f1(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ugt i64 %i1, 1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -21,7 +22,8 @@ define double @f2(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i64 %i1, 255
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -33,7 +35,8 @@ define double @f3(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i64 %i1, 256
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -45,7 +48,8 @@ define double @f4(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i64 %i1, 4294967295
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -57,7 +61,8 @@ define double @f5(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i64 %i1, 4294967296
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
; Check the next value up, which must use a register comparison.
@@ -67,6 +72,7 @@ define double @f6(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i64 %i1, 4294967297
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
diff --git a/test/CodeGen/SystemZ/int-cmp-13.ll b/test/CodeGen/SystemZ/int-cmp-13.ll
index 53af0c868a25..fda4496a961a 100644
--- a/test/CodeGen/SystemZ/int-cmp-13.ll
+++ b/test/CodeGen/SystemZ/int-cmp-13.ll
@@ -9,7 +9,8 @@ define double @f1(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 0
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -20,7 +21,8 @@ define double @f2(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 127
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -32,7 +34,8 @@ define double @f3(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 128
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -44,7 +47,8 @@ define double @f4(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 32767
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -56,7 +60,8 @@ define double @f5(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 32768
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -68,7 +73,8 @@ define double @f6(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 2147483647
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -80,7 +86,8 @@ define double @f7(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 2147483648
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -92,7 +99,8 @@ define double @f8(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 4294967295
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -103,7 +111,8 @@ define double @f9(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, 4294967296
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -114,7 +123,8 @@ define double @f10(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, -1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -125,7 +135,8 @@ define double @f11(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, -128
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -137,7 +148,8 @@ define double @f12(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, -129
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -149,7 +161,8 @@ define double @f13(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, -32768
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -161,7 +174,8 @@ define double @f14(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, -32769
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -173,7 +187,8 @@ define double @f15(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, -2147483648
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -184,6 +199,7 @@ define double @f16(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp eq i64 %i1, -2147483649
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
diff --git a/test/CodeGen/SystemZ/int-cmp-14.ll b/test/CodeGen/SystemZ/int-cmp-14.ll
index 4dbd0ece3af6..d63aaa333889 100644
--- a/test/CodeGen/SystemZ/int-cmp-14.ll
+++ b/test/CodeGen/SystemZ/int-cmp-14.ll
@@ -9,7 +9,8 @@ define double @f1(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 0
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -20,7 +21,8 @@ define double @f2(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 127
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -32,7 +34,8 @@ define double @f3(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 128
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -44,7 +47,8 @@ define double @f4(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 32767
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -56,7 +60,8 @@ define double @f5(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 32768
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -68,7 +73,8 @@ define double @f6(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 2147483647
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -80,7 +86,8 @@ define double @f7(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 2147483648
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -92,7 +99,8 @@ define double @f8(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 4294967295
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -103,7 +111,8 @@ define double @f9(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, 4294967296
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -114,7 +123,8 @@ define double @f10(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, -1
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -125,7 +135,8 @@ define double @f11(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, -128
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -137,7 +148,8 @@ define double @f12(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, -129
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -149,7 +161,8 @@ define double @f13(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, -32768
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -161,7 +174,8 @@ define double @f14(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, -32769
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -173,7 +187,8 @@ define double @f15(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, -2147483648
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
@@ -184,6 +199,7 @@ define double @f16(double %a, double %b, i64 %i1) {
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ne i64 %i1, -2147483649
- %res = select i1 %cond, double %a, double %b
+ %tmp = select i1 %cond, double %a, double %b
+ %res = fadd double %tmp, 1.0
ret double %res
}
diff --git a/test/CodeGen/SystemZ/int-cmp-15.ll b/test/CodeGen/SystemZ/int-cmp-15.ll
index 3c1e052bc35f..a8a391b62cfc 100644
--- a/test/CodeGen/SystemZ/int-cmp-15.ll
+++ b/test/CodeGen/SystemZ/int-cmp-15.ll
@@ -6,7 +6,7 @@
define double @f1(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: cli 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp ugt i8 %val, 1
@@ -18,7 +18,7 @@ define double @f1(double %a, double %b, i8 *%ptr) {
define double @f2(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cli 0(%r2), 254
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp ult i8 %val, 254
@@ -30,7 +30,7 @@ define double @f2(double %a, double %b, i8 *%ptr) {
define double @f3(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp slt i8 %val, 0
@@ -42,7 +42,7 @@ define double @f3(double %a, double %b, i8 *%ptr) {
define double @f4(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp sle i8 %val, -1
@@ -54,7 +54,7 @@ define double @f4(double %a, double %b, i8 *%ptr) {
define double @f5(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp sge i8 %val, 0
@@ -66,7 +66,7 @@ define double @f5(double %a, double %b, i8 *%ptr) {
define double @f6(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp sgt i8 %val, -1
@@ -78,7 +78,7 @@ define double @f6(double %a, double %b, i8 *%ptr) {
define double @f7(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f7:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp eq i8 %val, -128
@@ -90,7 +90,7 @@ define double @f7(double %a, double %b, i8 *%ptr) {
define double @f8(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp eq i8 %val, 0
@@ -102,7 +102,7 @@ define double @f8(double %a, double %b, i8 *%ptr) {
define double @f9(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp eq i8 %val, 127
@@ -114,7 +114,7 @@ define double @f9(double %a, double %b, i8 *%ptr) {
define double @f10(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f10:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%cond = icmp eq i8 %val, 255
diff --git a/test/CodeGen/SystemZ/int-cmp-16.ll b/test/CodeGen/SystemZ/int-cmp-16.ll
index 37508b5e740f..78ac8ca4e710 100644
--- a/test/CodeGen/SystemZ/int-cmp-16.ll
+++ b/test/CodeGen/SystemZ/int-cmp-16.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i32
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i8 *%ptr) {
define double @f2(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i32
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i8 *%ptr) {
define double @f5(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i8 *%ptr) {
define double @f6(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -97,7 +97,7 @@ define double @f7(double %a, double %b, i8 *%ptr) {
define double @f8(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -110,7 +110,7 @@ define double @f8(double %a, double %b, i8 *%ptr) {
define double @f9(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
diff --git a/test/CodeGen/SystemZ/int-cmp-17.ll b/test/CodeGen/SystemZ/int-cmp-17.ll
index a22fb604d453..c58af56ce8da 100644
--- a/test/CodeGen/SystemZ/int-cmp-17.ll
+++ b/test/CodeGen/SystemZ/int-cmp-17.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i32
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i8 *%ptr) {
define double @f2(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i32
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i8 *%ptr) {
define double @f5(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i8 *%ptr) {
define double @f6(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -97,7 +97,7 @@ define double @f7(double %a, double %b, i8 *%ptr) {
define double @f8(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -110,7 +110,7 @@ define double @f8(double %a, double %b, i8 *%ptr) {
define double @f9(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
diff --git a/test/CodeGen/SystemZ/int-cmp-18.ll b/test/CodeGen/SystemZ/int-cmp-18.ll
index f4bc5c0e5ce9..547645c1aa6b 100644
--- a/test/CodeGen/SystemZ/int-cmp-18.ll
+++ b/test/CodeGen/SystemZ/int-cmp-18.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i64
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i8 *%ptr) {
define double @f2(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i64
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i8 *%ptr) {
define double @f5(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i8 *%ptr) {
define double @f6(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -97,7 +97,7 @@ define double @f7(double %a, double %b, i8 *%ptr) {
define double @f8(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -110,7 +110,7 @@ define double @f8(double %a, double %b, i8 *%ptr) {
define double @f9(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
diff --git a/test/CodeGen/SystemZ/int-cmp-19.ll b/test/CodeGen/SystemZ/int-cmp-19.ll
index 0a23f06a0581..2a6a97919940 100644
--- a/test/CodeGen/SystemZ/int-cmp-19.ll
+++ b/test/CodeGen/SystemZ/int-cmp-19.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i64
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i8 *%ptr) {
define double @f2(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i64
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i8 *%ptr) {
define double @f5(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: cli 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i8 *%ptr) {
define double @f6(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -97,7 +97,7 @@ define double @f7(double %a, double %b, i8 *%ptr) {
define double @f8(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cli 0(%r2), 255
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -110,7 +110,7 @@ define double @f8(double %a, double %b, i8 *%ptr) {
define double @f9(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
diff --git a/test/CodeGen/SystemZ/int-cmp-20.ll b/test/CodeGen/SystemZ/int-cmp-20.ll
index 2acff55af59c..55f7efc08a19 100644
--- a/test/CodeGen/SystemZ/int-cmp-20.ll
+++ b/test/CodeGen/SystemZ/int-cmp-20.ll
@@ -8,7 +8,7 @@
define double @f1(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: cli 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i32
@@ -22,7 +22,7 @@ define double @f1(double %a, double %b, i8 *%ptr) {
define double @f2(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cli 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -36,7 +36,7 @@ define double @f2(double %a, double %b, i8 *%ptr) {
define double @f3(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: cli 0(%r2), 254
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i32
@@ -50,7 +50,7 @@ define double @f3(double %a, double %b, i8 *%ptr) {
define double @f4(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: cli 0(%r2), 254
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -105,7 +105,7 @@ define double @f7(double %a, double %b, i8 *%ptr) {
define double @f8(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cli 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i32
@@ -132,7 +132,7 @@ define double @f9(double %a, double %b, i8 *%ptr) {
define double @f10(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f10:
; CHECK: cli 0(%r2), 254
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i32
@@ -171,7 +171,7 @@ define double @f12(double %a, double %b, i8 *%ptr) {
define double @f13(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f13:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -184,7 +184,7 @@ define double @f13(double %a, double %b, i8 *%ptr) {
define double @f14(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f14:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -197,7 +197,7 @@ define double @f14(double %a, double %b, i8 *%ptr) {
define double @f15(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f15:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
@@ -210,7 +210,7 @@ define double @f15(double %a, double %b, i8 *%ptr) {
define double @f16(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f16:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i32
diff --git a/test/CodeGen/SystemZ/int-cmp-21.ll b/test/CodeGen/SystemZ/int-cmp-21.ll
index 5be97324f643..4ba63a02d594 100644
--- a/test/CodeGen/SystemZ/int-cmp-21.ll
+++ b/test/CodeGen/SystemZ/int-cmp-21.ll
@@ -8,7 +8,7 @@
define double @f1(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: cli 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i64
@@ -22,7 +22,7 @@ define double @f1(double %a, double %b, i8 *%ptr) {
define double @f2(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cli 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -36,7 +36,7 @@ define double @f2(double %a, double %b, i8 *%ptr) {
define double @f3(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: cli 0(%r2), 254
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i64
@@ -50,7 +50,7 @@ define double @f3(double %a, double %b, i8 *%ptr) {
define double @f4(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: cli 0(%r2), 254
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -105,7 +105,7 @@ define double @f7(double %a, double %b, i8 *%ptr) {
define double @f8(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cli 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i64
@@ -132,7 +132,7 @@ define double @f9(double %a, double %b, i8 *%ptr) {
define double @f10(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f10:
; CHECK: cli 0(%r2), 254
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = zext i8 %val to i64
@@ -171,7 +171,7 @@ define double @f12(double %a, double %b, i8 *%ptr) {
define double @f13(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f13:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -184,7 +184,7 @@ define double @f13(double %a, double %b, i8 *%ptr) {
define double @f14(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f14:
; CHECK: cli 0(%r2), 128
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -197,7 +197,7 @@ define double @f14(double %a, double %b, i8 *%ptr) {
define double @f15(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f15:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
@@ -210,7 +210,7 @@ define double @f15(double %a, double %b, i8 *%ptr) {
define double @f16(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f16:
; CHECK: cli 0(%r2), 127
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i8 , i8 *%ptr
%ext = sext i8 %val to i64
diff --git a/test/CodeGen/SystemZ/int-cmp-22.ll b/test/CodeGen/SystemZ/int-cmp-22.ll
index f29023cf02ae..47372658165e 100644
--- a/test/CodeGen/SystemZ/int-cmp-22.ll
+++ b/test/CodeGen/SystemZ/int-cmp-22.ll
@@ -6,7 +6,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: chhsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -19,7 +19,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: chhsi 0(%r2), 0
-; CHECK-NEXT: jle
+; CHECK-NEXT: bler %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -32,7 +32,7 @@ define double @f2(double %a, double %b, i16 *%ptr) {
define double @f3(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: chhsi 0(%r2), 32766
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -45,7 +45,7 @@ define double @f3(double %a, double %b, i16 *%ptr) {
define double @f4(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: chhsi 0(%r2), -1
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i16 *%ptr) {
define double @f5(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: chhsi 0(%r2), -32766
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i16 *%ptr) {
define double @f6(double %a, double %b, i16 %i1, i16 *%base) {
; CHECK-LABEL: f6:
; CHECK: chhsi 4094(%r3), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 2047
@@ -86,7 +86,7 @@ define double @f7(double %a, double %b, i16 *%base) {
; CHECK-LABEL: f7:
; CHECK: aghi %r2, 4096
; CHECK: chhsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 2048
@@ -101,7 +101,7 @@ define double @f8(double %a, double %b, i16 *%base) {
; CHECK-LABEL: f8:
; CHECK: aghi %r2, -2
; CHECK: chhsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 -1
@@ -116,7 +116,7 @@ define double @f9(double %a, double %b, i64 %base, i64 %index) {
; CHECK-LABEL: f9:
; CHECK: agr {{%r2, %r3|%r3, %r2}}
; CHECK: chhsi 0({{%r[23]}}), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add = add i64 %base, %index
diff --git a/test/CodeGen/SystemZ/int-cmp-23.ll b/test/CodeGen/SystemZ/int-cmp-23.ll
index df6b62616a79..a1126e13ca98 100644
--- a/test/CodeGen/SystemZ/int-cmp-23.ll
+++ b/test/CodeGen/SystemZ/int-cmp-23.ll
@@ -6,7 +6,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -19,7 +19,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 65534
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -32,7 +32,7 @@ define double @f2(double %a, double %b, i16 *%ptr) {
define double @f3(double %a, double %b, i16 %i1, i16 *%base) {
; CHECK-LABEL: f3:
; CHECK: clhhsi 4094(%r3), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 2047
@@ -47,7 +47,7 @@ define double @f4(double %a, double %b, i16 *%base) {
; CHECK-LABEL: f4:
; CHECK: aghi %r2, 4096
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 2048
@@ -62,7 +62,7 @@ define double @f5(double %a, double %b, i16 *%base) {
; CHECK-LABEL: f5:
; CHECK: aghi %r2, -2
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i16, i16 *%base, i64 -1
@@ -77,7 +77,7 @@ define double @f6(double %a, double %b, i64 %base, i64 %index) {
; CHECK-LABEL: f6:
; CHECK: agr {{%r2, %r3|%r3, %r2}}
; CHECK: clhhsi 0({{%r[23]}}), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add = add i64 %base, %index
diff --git a/test/CodeGen/SystemZ/int-cmp-24.ll b/test/CodeGen/SystemZ/int-cmp-24.ll
index e1141a78ddda..d7bfeb270f37 100644
--- a/test/CodeGen/SystemZ/int-cmp-24.ll
+++ b/test/CodeGen/SystemZ/int-cmp-24.ll
@@ -6,7 +6,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -19,7 +19,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -32,7 +32,7 @@ define double @f2(double %a, double %b, i16 *%ptr) {
define double @f3(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: clhhsi 0(%r2), 32768
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -45,7 +45,7 @@ define double @f3(double %a, double %b, i16 *%ptr) {
define double @f4(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: clhhsi 0(%r2), 32767
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
diff --git a/test/CodeGen/SystemZ/int-cmp-25.ll b/test/CodeGen/SystemZ/int-cmp-25.ll
index 268530316506..4da5fd8e0381 100644
--- a/test/CodeGen/SystemZ/int-cmp-25.ll
+++ b/test/CodeGen/SystemZ/int-cmp-25.ll
@@ -6,7 +6,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -19,7 +19,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -32,7 +32,7 @@ define double @f2(double %a, double %b, i16 *%ptr) {
define double @f3(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: clhhsi 0(%r2), 32768
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
@@ -45,7 +45,7 @@ define double @f3(double %a, double %b, i16 *%ptr) {
define double @f4(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: clhhsi 0(%r2), 32767
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i16 , i16 *%ptr
diff --git a/test/CodeGen/SystemZ/int-cmp-26.ll b/test/CodeGen/SystemZ/int-cmp-26.ll
index ba93f081e9b9..e280c7f6c03a 100644
--- a/test/CodeGen/SystemZ/int-cmp-26.ll
+++ b/test/CodeGen/SystemZ/int-cmp-26.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i32
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i32
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i16 *%ptr) {
define double @f5(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i16 *%ptr) {
define double @f6(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: clhhsi 0(%r2), 32767
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -97,7 +97,7 @@ define double @f7(double %a, double %b, i16 *%ptr) {
define double @f8(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -110,7 +110,7 @@ define double @f8(double %a, double %b, i16 *%ptr) {
define double @f9(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: clhhsi 0(%r2), 32768
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
diff --git a/test/CodeGen/SystemZ/int-cmp-27.ll b/test/CodeGen/SystemZ/int-cmp-27.ll
index 9a503c9254a2..afbdbaf45f24 100644
--- a/test/CodeGen/SystemZ/int-cmp-27.ll
+++ b/test/CodeGen/SystemZ/int-cmp-27.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i32
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i32
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i16 *%ptr) {
define double @f5(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i16 *%ptr) {
define double @f6(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: clhhsi 0(%r2), 32767
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -97,7 +97,7 @@ define double @f7(double %a, double %b, i16 *%ptr) {
define double @f8(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -110,7 +110,7 @@ define double @f8(double %a, double %b, i16 *%ptr) {
define double @f9(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: clhhsi 0(%r2), 32768
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
diff --git a/test/CodeGen/SystemZ/int-cmp-28.ll b/test/CodeGen/SystemZ/int-cmp-28.ll
index 68f1cd28c62d..3fbfb1f679ed 100644
--- a/test/CodeGen/SystemZ/int-cmp-28.ll
+++ b/test/CodeGen/SystemZ/int-cmp-28.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i64
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i64
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i16 *%ptr) {
define double @f5(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i16 *%ptr) {
define double @f6(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: clhhsi 0(%r2), 32767
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -97,7 +97,7 @@ define double @f7(double %a, double %b, i16 *%ptr) {
define double @f8(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -110,7 +110,7 @@ define double @f8(double %a, double %b, i16 *%ptr) {
define double @f9(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: clhhsi 0(%r2), 32768
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
diff --git a/test/CodeGen/SystemZ/int-cmp-29.ll b/test/CodeGen/SystemZ/int-cmp-29.ll
index 4fb2e8577699..e90f434ec744 100644
--- a/test/CodeGen/SystemZ/int-cmp-29.ll
+++ b/test/CodeGen/SystemZ/int-cmp-29.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i64
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i64
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i16 *%ptr) {
define double @f5(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: clhhsi 0(%r2), 0
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -71,7 +71,7 @@ define double @f5(double %a, double %b, i16 *%ptr) {
define double @f6(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: clhhsi 0(%r2), 32767
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -97,7 +97,7 @@ define double @f7(double %a, double %b, i16 *%ptr) {
define double @f8(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: clhhsi 0(%r2), 65535
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -110,7 +110,7 @@ define double @f8(double %a, double %b, i16 *%ptr) {
define double @f9(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: clhhsi 0(%r2), 32768
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
diff --git a/test/CodeGen/SystemZ/int-cmp-30.ll b/test/CodeGen/SystemZ/int-cmp-30.ll
index 043ff484c145..bac0fe516959 100644
--- a/test/CodeGen/SystemZ/int-cmp-30.ll
+++ b/test/CodeGen/SystemZ/int-cmp-30.ll
@@ -8,7 +8,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i32
@@ -22,7 +22,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -36,7 +36,7 @@ define double @f2(double %a, double %b, i16 *%ptr) {
define double @f3(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: clhhsi 0(%r2), 65534
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i32
@@ -50,7 +50,7 @@ define double @f3(double %a, double %b, i16 *%ptr) {
define double @f4(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: clhhsi 0(%r2), 65534
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -106,7 +106,7 @@ define double @f7(double %a, double %b, i16 *%ptr) {
define double @f8(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i32
@@ -120,7 +120,7 @@ define double @f8(double %a, double %b, i16 *%ptr) {
define double @f9(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: chhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -134,7 +134,7 @@ define double @f9(double %a, double %b, i16 *%ptr) {
define double @f10(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f10:
; CHECK: clhhsi 0(%r2), 65534
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i32
@@ -148,7 +148,7 @@ define double @f10(double %a, double %b, i16 *%ptr) {
define double @f11(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f11:
; CHECK: chhsi 0(%r2), -2
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -175,7 +175,7 @@ define double @f12(double %a, double %b, i16 *%ptr) {
define double @f13(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f13:
; CHECK: chhsi 0(%r2), 32766
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
@@ -202,7 +202,7 @@ define double @f14(double %a, double %b, i16 *%ptr) {
define double @f15(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f15:
; CHECK: chhsi 0(%r2), -32767
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i32
diff --git a/test/CodeGen/SystemZ/int-cmp-31.ll b/test/CodeGen/SystemZ/int-cmp-31.ll
index 298b446e7f1d..45c5c789dd50 100644
--- a/test/CodeGen/SystemZ/int-cmp-31.ll
+++ b/test/CodeGen/SystemZ/int-cmp-31.ll
@@ -8,7 +8,7 @@
define double @f1(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i64
@@ -22,7 +22,7 @@ define double @f1(double %a, double %b, i16 *%ptr) {
define double @f2(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -36,7 +36,7 @@ define double @f2(double %a, double %b, i16 *%ptr) {
define double @f3(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: clhhsi 0(%r2), 65534
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i64
@@ -50,7 +50,7 @@ define double @f3(double %a, double %b, i16 *%ptr) {
define double @f4(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: clhhsi 0(%r2), 65534
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -106,7 +106,7 @@ define double @f7(double %a, double %b, i16 *%ptr) {
define double @f8(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: clhhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i64
@@ -120,7 +120,7 @@ define double @f8(double %a, double %b, i16 *%ptr) {
define double @f9(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: chhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -134,7 +134,7 @@ define double @f9(double %a, double %b, i16 *%ptr) {
define double @f10(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f10:
; CHECK: clhhsi 0(%r2), 65534
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = zext i16 %val to i64
@@ -148,7 +148,7 @@ define double @f10(double %a, double %b, i16 *%ptr) {
define double @f11(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f11:
; CHECK: chhsi 0(%r2), -2
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -175,7 +175,7 @@ define double @f12(double %a, double %b, i16 *%ptr) {
define double @f13(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f13:
; CHECK: chhsi 0(%r2), 32766
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
@@ -202,7 +202,7 @@ define double @f14(double %a, double %b, i16 *%ptr) {
define double @f15(double %a, double %b, i16 *%ptr) {
; CHECK-LABEL: f15:
; CHECK: chhsi 0(%r2), -32767
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
%val = load i16 , i16 *%ptr
%ext = sext i16 %val to i64
diff --git a/test/CodeGen/SystemZ/int-cmp-32.ll b/test/CodeGen/SystemZ/int-cmp-32.ll
index da0e2d7562dd..dae09b446651 100644
--- a/test/CodeGen/SystemZ/int-cmp-32.ll
+++ b/test/CodeGen/SystemZ/int-cmp-32.ll
@@ -6,7 +6,7 @@
define double @f1(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: chsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -19,7 +19,7 @@ define double @f1(double %a, double %b, i32 *%ptr) {
define double @f2(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: chsi 0(%r2), 0
-; CHECK-NEXT: jle
+; CHECK-NEXT: bler %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -32,7 +32,7 @@ define double @f2(double %a, double %b, i32 *%ptr) {
define double @f3(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: chsi 0(%r2), 32767
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -56,7 +56,7 @@ define double @f4(double %a, double %b, i32 *%ptr) {
define double @f5(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: chsi 0(%r2), -1
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -69,7 +69,7 @@ define double @f5(double %a, double %b, i32 *%ptr) {
define double @f6(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: chsi 0(%r2), -32768
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -93,7 +93,7 @@ define double @f7(double %a, double %b, i32 *%ptr) {
define double @f8(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: chsi 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -106,7 +106,7 @@ define double @f8(double %a, double %b, i32 *%ptr) {
define double @f9(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: chsi 0(%r2), 1
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -119,7 +119,7 @@ define double @f9(double %a, double %b, i32 *%ptr) {
define double @f10(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f10:
; CHECK: chsi 0(%r2), 32767
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -143,7 +143,7 @@ define double @f11(double %a, double %b, i32 *%ptr) {
define double @f12(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f12:
; CHECK: chsi 0(%r2), -1
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -156,7 +156,7 @@ define double @f12(double %a, double %b, i32 *%ptr) {
define double @f13(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f13:
; CHECK: chsi 0(%r2), -32768
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -180,7 +180,7 @@ define double @f14(double %a, double %b, i32 *%ptr) {
define double @f15(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f15:
; CHECK: chsi 4092(%r3), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1023
@@ -195,7 +195,7 @@ define double @f16(double %a, double %b, i32 *%base) {
; CHECK-LABEL: f16:
; CHECK: aghi %r2, 4096
; CHECK: chsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1024
@@ -210,7 +210,7 @@ define double @f17(double %a, double %b, i32 *%base) {
; CHECK-LABEL: f17:
; CHECK: aghi %r2, -4
; CHECK: chsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -1
@@ -225,7 +225,7 @@ define double @f18(double %a, double %b, i64 %base, i64 %index) {
; CHECK-LABEL: f18:
; CHECK: agr {{%r2, %r3|%r3, %r2}}
; CHECK: chsi 0({{%r[23]}}), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add = add i64 %base, %index
diff --git a/test/CodeGen/SystemZ/int-cmp-33.ll b/test/CodeGen/SystemZ/int-cmp-33.ll
index 94f3e705391e..ec02147ee525 100644
--- a/test/CodeGen/SystemZ/int-cmp-33.ll
+++ b/test/CodeGen/SystemZ/int-cmp-33.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clfhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i32 *%ptr) {
define double @f2(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clfhsi 0(%r2), 65535
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -45,7 +45,7 @@ define double @f3(double %a, double %b, i32 *%ptr) {
define double @f4(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: clfhsi 0(%r2), 32768
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i32 *%ptr) {
define double @f5(double %a, double %b, i32 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: clfhsi 0(%r2), 65535
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i32 , i32 *%ptr
@@ -82,7 +82,7 @@ define double @f6(double %a, double %b, i32 *%ptr) {
define double @f7(double %a, double %b, i32 %i1, i32 *%base) {
; CHECK-LABEL: f7:
; CHECK: clfhsi 4092(%r3), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1023
@@ -97,7 +97,7 @@ define double @f8(double %a, double %b, i32 *%base) {
; CHECK-LABEL: f8:
; CHECK: aghi %r2, 4096
; CHECK: clfhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 1024
@@ -112,7 +112,7 @@ define double @f9(double %a, double %b, i32 *%base) {
; CHECK-LABEL: f9:
; CHECK: aghi %r2, -4
; CHECK: clfhsi 0(%r2), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i32, i32 *%base, i64 -1
@@ -127,7 +127,7 @@ define double @f10(double %a, double %b, i64 %base, i64 %index) {
; CHECK-LABEL: f10:
; CHECK: agr {{%r2, %r3|%r3, %r2}}
; CHECK: clfhsi 0({{%r[23]}}), 1
-; CHECK-NEXT: jh
+; CHECK-NEXT: bhr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add = add i64 %base, %index
diff --git a/test/CodeGen/SystemZ/int-cmp-34.ll b/test/CodeGen/SystemZ/int-cmp-34.ll
index 114b694a3b09..2dbc04e4ec0c 100644
--- a/test/CodeGen/SystemZ/int-cmp-34.ll
+++ b/test/CodeGen/SystemZ/int-cmp-34.ll
@@ -6,7 +6,7 @@
define double @f1(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: cghsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -19,7 +19,7 @@ define double @f1(double %a, double %b, i64 *%ptr) {
define double @f2(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: cghsi 0(%r2), 0
-; CHECK-NEXT: jle
+; CHECK-NEXT: bler %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -32,7 +32,7 @@ define double @f2(double %a, double %b, i64 *%ptr) {
define double @f3(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f3:
; CHECK: cghsi 0(%r2), 32767
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -56,7 +56,7 @@ define double @f4(double %a, double %b, i64 *%ptr) {
define double @f5(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: cghsi 0(%r2), -1
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -69,7 +69,7 @@ define double @f5(double %a, double %b, i64 *%ptr) {
define double @f6(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f6:
; CHECK: cghsi 0(%r2), -32768
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -93,7 +93,7 @@ define double @f7(double %a, double %b, i64 *%ptr) {
define double @f8(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f8:
; CHECK: cghsi 0(%r2), 0
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -106,7 +106,7 @@ define double @f8(double %a, double %b, i64 *%ptr) {
define double @f9(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: cghsi 0(%r2), 1
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -119,7 +119,7 @@ define double @f9(double %a, double %b, i64 *%ptr) {
define double @f10(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f10:
; CHECK: cghsi 0(%r2), 32767
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -143,7 +143,7 @@ define double @f11(double %a, double %b, i64 *%ptr) {
define double @f12(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f12:
; CHECK: cghsi 0(%r2), -1
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -156,7 +156,7 @@ define double @f12(double %a, double %b, i64 *%ptr) {
define double @f13(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f13:
; CHECK: cghsi 0(%r2), -32768
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -180,7 +180,7 @@ define double @f14(double %a, double %b, i64 *%ptr) {
define double @f15(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f15:
; CHECK: cghsi 4088(%r3), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 511
@@ -195,7 +195,7 @@ define double @f16(double %a, double %b, i64 *%base) {
; CHECK-LABEL: f16:
; CHECK: aghi %r2, 4096
; CHECK: cghsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 512
@@ -210,7 +210,7 @@ define double @f17(double %a, double %b, i64 *%base) {
; CHECK-LABEL: f17:
; CHECK: aghi %r2, -8
; CHECK: cghsi 0(%r2), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -1
@@ -225,7 +225,7 @@ define double @f18(double %a, double %b, i64 %base, i64 %index) {
; CHECK-LABEL: f18:
; CHECK: agr {{%r2, %r3|%r3, %r2}}
; CHECK: cghsi 0({{%r[23]}}), 0
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add = add i64 %base, %index
diff --git a/test/CodeGen/SystemZ/int-cmp-35.ll b/test/CodeGen/SystemZ/int-cmp-35.ll
index 0eaf4fa0a075..de362af2bea2 100644
--- a/test/CodeGen/SystemZ/int-cmp-35.ll
+++ b/test/CodeGen/SystemZ/int-cmp-35.ll
@@ -7,7 +7,7 @@
define double @f1(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f1:
; CHECK: clghsi 0(%r2), 2
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -20,7 +20,7 @@ define double @f1(double %a, double %b, i64 *%ptr) {
define double @f2(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f2:
; CHECK: clghsi 0(%r2), 65535
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -45,7 +45,7 @@ define double @f3(double %a, double %b, i64 *%ptr) {
define double @f4(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f4:
; CHECK: clghsi 0(%r2), 32768
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -58,7 +58,7 @@ define double @f4(double %a, double %b, i64 *%ptr) {
define double @f5(double %a, double %b, i64 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: clghsi 0(%r2), 65535
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%val = load i64 , i64 *%ptr
@@ -82,7 +82,7 @@ define double @f6(double %a, double %b, i64 *%ptr) {
define double @f7(double %a, double %b, i64 %i1, i64 *%base) {
; CHECK-LABEL: f7:
; CHECK: clghsi 4088(%r3), 2
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 511
@@ -97,7 +97,7 @@ define double @f8(double %a, double %b, i64 *%base) {
; CHECK-LABEL: f8:
; CHECK: aghi %r2, 4096
; CHECK: clghsi 0(%r2), 2
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 512
@@ -112,7 +112,7 @@ define double @f9(double %a, double %b, i64 *%base) {
; CHECK-LABEL: f9:
; CHECK: aghi %r2, -8
; CHECK: clghsi 0(%r2), 2
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%ptr = getelementptr i64, i64 *%base, i64 -1
@@ -127,7 +127,7 @@ define double @f10(double %a, double %b, i64 %base, i64 %index) {
; CHECK-LABEL: f10:
; CHECK: agr {{%r2, %r3|%r3, %r2}}
; CHECK: clghsi 0({{%r[23]}}), 2
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%add = add i64 %base, %index
diff --git a/test/CodeGen/SystemZ/int-cmp-36.ll b/test/CodeGen/SystemZ/int-cmp-36.ll
index 113d2c1587e0..b17fedd03db2 100644
--- a/test/CodeGen/SystemZ/int-cmp-36.ll
+++ b/test/CodeGen/SystemZ/int-cmp-36.ll
@@ -10,7 +10,7 @@
define i32 @f1(i32 %src1) {
; CHECK-LABEL: f1:
; CHECK: chrl %r2, g
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@g
@@ -47,7 +47,7 @@ exit:
define i32 @f3(i32 %src1) {
; CHECK-LABEL: f3:
; CHECK: chrl %r2, g
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@g
@@ -66,7 +66,7 @@ exit:
define i32 @f4(i32 %src1) {
; CHECK-LABEL: f4:
; CHECK: chrl %r2, g
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@g
@@ -86,7 +86,7 @@ define i32 @f5(i32 %src1) {
; CHECK-LABEL: f5:
; CHECK: lgrl [[REG:%r[0-5]]], h@GOT
; CHECK: ch %r2, 0([[REG]])
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@h, align 1
@@ -105,7 +105,7 @@ exit:
define i32 @f6(i32 %src2) {
; CHECK-LABEL: f6:
; CHECK: chrl %r2, g
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@g
diff --git a/test/CodeGen/SystemZ/int-cmp-37.ll b/test/CodeGen/SystemZ/int-cmp-37.ll
index ac5d39f96511..aabb8a2fd3e3 100644
--- a/test/CodeGen/SystemZ/int-cmp-37.ll
+++ b/test/CodeGen/SystemZ/int-cmp-37.ll
@@ -21,7 +21,8 @@ mulb:
%mul = mul i32 %src1, %src1
br label %exit
exit:
- %res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i32 %tmp, 1
ret i32 %res
}
@@ -39,7 +40,8 @@ mulb:
%mul = mul i32 %src1, %src1
br label %exit
exit:
- %res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i32 %tmp, 1
ret i32 %res
}
@@ -58,7 +60,8 @@ mulb:
%mul = mul i32 %src1, %src1
br label %exit
exit:
- %res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i32 %tmp, 1
ret i32 %res
}
@@ -77,7 +80,8 @@ mulb:
%mul = mul i32 %src1, %src1
br label %exit
exit:
- %res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i32 %tmp, 1
ret i32 %res
}
@@ -97,7 +101,8 @@ mulb:
%mul = mul i32 %src1, %src1
br label %exit
exit:
- %res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i32 %tmp, 1
ret i32 %res
}
@@ -116,6 +121,7 @@ mulb:
%mul = mul i32 %src2, %src2
br label %exit
exit:
- %res = phi i32 [ %src2, %entry ], [ %mul, %mulb ]
+ %tmp = phi i32 [ %src2, %entry ], [ %mul, %mulb ]
+ %res = add i32 %tmp, 1
ret i32 %res
}
diff --git a/test/CodeGen/SystemZ/int-cmp-38.ll b/test/CodeGen/SystemZ/int-cmp-38.ll
index 0d8913b02861..f8b754bf5ea2 100644
--- a/test/CodeGen/SystemZ/int-cmp-38.ll
+++ b/test/CodeGen/SystemZ/int-cmp-38.ll
@@ -10,7 +10,7 @@
define i32 @f1(i32 %src1) {
; CHECK-LABEL: f1:
; CHECK: crl %r2, g
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%src2 = load i32 , i32 *@g
@@ -28,7 +28,7 @@ exit:
define i32 @f2(i32 %src1) {
; CHECK-LABEL: f2:
; CHECK: clrl %r2, g
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%src2 = load i32 , i32 *@g
@@ -46,7 +46,7 @@ exit:
define i32 @f3(i32 %src1) {
; CHECK-LABEL: f3:
; CHECK: c{{l?}}rl %r2, g
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%src2 = load i32 , i32 *@g
@@ -64,7 +64,7 @@ exit:
define i32 @f4(i32 %src1) {
; CHECK-LABEL: f4:
; CHECK: c{{l?}}rl %r2, g
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%src2 = load i32 , i32 *@g
@@ -83,7 +83,7 @@ define i32 @f5(i32 %src1) {
; CHECK-LABEL: f5:
; CHECK: larl [[REG:%r[0-5]]], h
; CHECK: c %r2, 0([[REG]])
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%src2 = load i32 , i32 *@h, align 2
@@ -102,7 +102,7 @@ define i32 @f6(i32 %src1) {
; CHECK-LABEL: f6:
; CHECK: larl [[REG:%r[0-5]]], h
; CHECK: cl %r2, 0([[REG]])
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%src2 = load i32 , i32 *@h, align 2
@@ -120,7 +120,7 @@ exit:
define i32 @f7(i32 %src2) {
; CHECK-LABEL: f7:
; CHECK: crl %r2, g
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%src1 = load i32 , i32 *@g
diff --git a/test/CodeGen/SystemZ/int-cmp-39.ll b/test/CodeGen/SystemZ/int-cmp-39.ll
index 5e3abceeca45..2e38e4bb5955 100644
--- a/test/CodeGen/SystemZ/int-cmp-39.ll
+++ b/test/CodeGen/SystemZ/int-cmp-39.ll
@@ -10,7 +10,7 @@
define i64 @f1(i64 %src1) {
; CHECK-LABEL: f1:
; CHECK: cghrl %r2, g
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@g
@@ -47,7 +47,7 @@ exit:
define i64 @f3(i64 %src1) {
; CHECK-LABEL: f3:
; CHECK: cghrl %r2, g
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@g
@@ -66,7 +66,7 @@ exit:
define i64 @f4(i64 %src1) {
; CHECK-LABEL: f4:
; CHECK: cghrl %r2, g
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@g
@@ -86,7 +86,7 @@ define i64 @f5(i64 %src1) {
; CHECK-LABEL: f5:
; CHECK: lgrl [[REG:%r[0-5]]], h@GOT
; CHECK: cgh %r2, 0([[REG]])
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@h, align 1
@@ -105,7 +105,7 @@ exit:
define i64 @f6(i64 %src2) {
; CHECK-LABEL: f6:
; CHECK: cghrl %r2, g
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%val = load i16 , i16 *@g
diff --git a/test/CodeGen/SystemZ/int-cmp-40.ll b/test/CodeGen/SystemZ/int-cmp-40.ll
index 92696d71fc48..fc38940ce397 100644
--- a/test/CodeGen/SystemZ/int-cmp-40.ll
+++ b/test/CodeGen/SystemZ/int-cmp-40.ll
@@ -21,7 +21,8 @@ mulb:
%mul = mul i64 %src1, %src1
br label %exit
exit:
- %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i64 %tmp, 1
ret i64 %res
}
@@ -39,7 +40,8 @@ mulb:
%mul = mul i64 %src1, %src1
br label %exit
exit:
- %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i64 %tmp, 1
ret i64 %res
}
@@ -58,7 +60,8 @@ mulb:
%mul = mul i64 %src1, %src1
br label %exit
exit:
- %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i64 %tmp, 1
ret i64 %res
}
@@ -77,7 +80,8 @@ mulb:
%mul = mul i64 %src1, %src1
br label %exit
exit:
- %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i64 %tmp, 1
ret i64 %res
}
@@ -97,7 +101,8 @@ mulb:
%mul = mul i64 %src1, %src1
br label %exit
exit:
- %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %tmp = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
+ %res = add i64 %tmp, 1
ret i64 %res
}
@@ -116,6 +121,7 @@ mulb:
%mul = mul i64 %src2, %src2
br label %exit
exit:
- %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+ %tmp = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+ %res = add i64 %tmp, 1
ret i64 %res
}
diff --git a/test/CodeGen/SystemZ/int-cmp-41.ll b/test/CodeGen/SystemZ/int-cmp-41.ll
index f4f5b4a0cf16..035de5733e94 100644
--- a/test/CodeGen/SystemZ/int-cmp-41.ll
+++ b/test/CodeGen/SystemZ/int-cmp-41.ll
@@ -10,7 +10,7 @@
define i64 @f1(i64 %src1) {
; CHECK-LABEL: f1:
; CHECK: cgfrl %r2, g
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@g
@@ -47,7 +47,7 @@ exit:
define i64 @f3(i64 %src1) {
; CHECK-LABEL: f3:
; CHECK: cgfrl %r2, g
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@g
@@ -66,7 +66,7 @@ exit:
define i64 @f4(i64 %src1) {
; CHECK-LABEL: f4:
; CHECK: cgfrl %r2, g
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@g
@@ -86,7 +86,7 @@ define i64 @f5(i64 %src1) {
; CHECK-LABEL: f5:
; CHECK: larl [[REG:%r[0-5]]], h
; CHECK: cgf %r2, 0([[REG]])
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@h, align 2
@@ -105,7 +105,7 @@ exit:
define i64 @f6(i64 %src2) {
; CHECK-LABEL: f6:
; CHECK: cgfrl %r2, g
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@g
diff --git a/test/CodeGen/SystemZ/int-cmp-42.ll b/test/CodeGen/SystemZ/int-cmp-42.ll
index ca87b865ad14..7fa5d720fc51 100644
--- a/test/CodeGen/SystemZ/int-cmp-42.ll
+++ b/test/CodeGen/SystemZ/int-cmp-42.ll
@@ -10,7 +10,7 @@
define i64 @f1(i64 %src1) {
; CHECK-LABEL: f1:
; CHECK: clgfrl %r2, g
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@g
@@ -47,7 +47,7 @@ exit:
define i64 @f3(i64 %src1) {
; CHECK-LABEL: f3:
; CHECK: clgfrl %r2, g
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@g
@@ -66,7 +66,7 @@ exit:
define i64 @f4(i64 %src1) {
; CHECK-LABEL: f4:
; CHECK: clgfrl %r2, g
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@g
@@ -86,7 +86,7 @@ define i64 @f5(i64 %src1) {
; CHECK-LABEL: f5:
; CHECK: larl [[REG:%r[0-5]]], h
; CHECK: clgf %r2, 0([[REG]])
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@h, align 2
@@ -105,7 +105,7 @@ exit:
define i64 @f6(i64 %src2) {
; CHECK-LABEL: f6:
; CHECK: clgfrl %r2, g
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%val = load i32 , i32 *@g
diff --git a/test/CodeGen/SystemZ/int-cmp-43.ll b/test/CodeGen/SystemZ/int-cmp-43.ll
index 108b041fa377..700db89435b6 100644
--- a/test/CodeGen/SystemZ/int-cmp-43.ll
+++ b/test/CodeGen/SystemZ/int-cmp-43.ll
@@ -10,7 +10,7 @@
define i64 @f1(i64 %src1) {
; CHECK-LABEL: f1:
; CHECK: cgrl %r2, g
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%src2 = load i64 , i64 *@g
@@ -28,7 +28,7 @@ exit:
define i64 @f2(i64 %src1) {
; CHECK-LABEL: f2:
; CHECK: clgrl %r2, g
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%src2 = load i64 , i64 *@g
@@ -46,7 +46,7 @@ exit:
define i64 @f3(i64 %src1) {
; CHECK-LABEL: f3:
; CHECK: c{{l?}}grl %r2, g
-; CHECK-NEXT: je
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%src2 = load i64 , i64 *@g
@@ -64,7 +64,7 @@ exit:
define i64 @f4(i64 %src1) {
; CHECK-LABEL: f4:
; CHECK: c{{l?}}grl %r2, g
-; CHECK-NEXT: jlh
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%src2 = load i64 , i64 *@g
@@ -83,7 +83,7 @@ define i64 @f5(i64 %src1) {
; CHECK-LABEL: f5:
; CHECK: larl [[REG:%r[0-5]]], h
; CHECK: cg %r2, 0([[REG]])
-; CHECK-NEXT: jl
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%src2 = load i64 , i64 *@h, align 4
@@ -101,7 +101,7 @@ exit:
define i64 @f6(i64 %src2) {
; CHECK-LABEL: f6:
; CHECK: cgrl %r2, g
-; CHECK-NEXT: jh {{\.L.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%src1 = load i64 , i64 *@g
diff --git a/test/CodeGen/SystemZ/int-cmp-44.ll b/test/CodeGen/SystemZ/int-cmp-44.ll
index a87dccd4ac2a..1b9a4ae353fe 100644
--- a/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -11,7 +11,7 @@ declare void @foo()
define i32 @f1(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f1:
; CHECK: afi %r2, 1000000
-; CHECK-NEXT: je .L{{.*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%res = add i32 %a, 1000000
@@ -30,7 +30,7 @@ exit:
define i32 @f2(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f2:
; CHECK: afi %r2, 1000000
-; CHECK-NEXT: jne .L{{.*}}
+; CHECK-NEXT: bner %r14
; CHECK: br %r14
entry:
%res = add i32 %a, 1000000
@@ -49,7 +49,7 @@ exit:
define i32 @f3(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f3:
; CHECK: afi %r2, 1000000
-; CHECK-NEXT: cijl %r2, 0, .L{{.*}}
+; CHECK-NEXT: cibl %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%res = add i32 %a, 1000000
@@ -68,7 +68,7 @@ exit:
define i32 @f4(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f4:
; CHECK: afi %r2, 1000000
-; CHECK-NEXT: cijle %r2, 0, .L{{.*}}
+; CHECK-NEXT: cible %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%res = add i32 %a, 1000000
@@ -87,7 +87,7 @@ exit:
define i32 @f5(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f5:
; CHECK: afi %r2, 1000000
-; CHECK-NEXT: cijh %r2, 0, .L{{.*}}
+; CHECK-NEXT: cibh %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%res = add i32 %a, 1000000
@@ -106,7 +106,7 @@ exit:
define i32 @f6(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f6:
; CHECK: afi %r2, 1000000
-; CHECK-NEXT: cijhe %r2, 0, .L{{.*}}
+; CHECK-NEXT: cibhe %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%res = add i32 %a, 1000000
@@ -125,7 +125,7 @@ exit:
define i32 @f7(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f7:
; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: jne .L{{.*}}
+; CHECK-NEXT: bner %r14
; CHECK: br %r14
entry:
%cur = load i32 , i32 *%dest
@@ -145,7 +145,7 @@ exit:
define i32 @f8(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f8:
; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: cijl %r2, 0, .L{{.*}}
+; CHECK-NEXT: cibl %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%cur = load i32 , i32 *%dest
@@ -166,7 +166,7 @@ exit:
define i32 @f9(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f9:
; CHECK: nr %r2, %r3
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = and i32 %a, %b
@@ -185,7 +185,7 @@ exit:
define i32 @f10(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f10:
; CHECK: nr %r2, %r3
-; CHECK-NEXT: cijl %r2, 0, .L{{.*}}
+; CHECK-NEXT: cibl %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%res = and i32 %a, %b
@@ -205,7 +205,7 @@ exit:
define i32 @f11(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f11:
; CHECK: nilf %r2, 100000001
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = and i32 %a, 100000001
@@ -225,7 +225,7 @@ exit:
define i32 @f12(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f12:
; CHECK: nill %r2, 65436
-; CHECK-NEXT: cijlh %r2, 0, .L{{.*}}
+; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%res = and i32 %a, -100
@@ -244,7 +244,7 @@ exit:
define i32 @f13(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f13:
; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: je .L{{.*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%res = ashr i32 %a, %b
@@ -263,7 +263,7 @@ exit:
define i32 @f14(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f14:
; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: jlh .L{{.*}}
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%res = ashr i32 %a, %b
@@ -282,7 +282,7 @@ exit:
define i32 @f15(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f15:
; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = ashr i32 %a, %b
@@ -301,7 +301,7 @@ exit:
define i32 @f16(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f16:
; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: jle .L{{.*}}
+; CHECK-NEXT: bler %r14
; CHECK: br %r14
entry:
%res = ashr i32 %a, %b
@@ -320,7 +320,7 @@ exit:
define i32 @f17(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f17:
; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%res = ashr i32 %a, %b
@@ -339,7 +339,7 @@ exit:
define i32 @f18(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f18:
; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: jhe .L{{.*}}
+; CHECK-NEXT: bher %r14
; CHECK: br %r14
entry:
%res = ashr i32 %a, %b
@@ -359,7 +359,7 @@ exit:
define i64 @f19(i64 %a, i64 %b, i64 *%dest) {
; CHECK-LABEL: f19:
; CHECK: risbg %r2, %r3, 0, 190, 0
-; CHECK-NEXT: je .L{{.*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
entry:
%res = and i64 %b, -2
@@ -378,7 +378,7 @@ exit:
define i64 @f20(i64 %a, i64 %b, i64 *%dest) {
; CHECK-LABEL: f20:
; CHECK: risbg %r2, %r3, 0, 190, 0
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = and i64 %b, -2
@@ -401,7 +401,7 @@ define i32 @f21(i32 %a, i32 %b, i32 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: cije %r2, 0, .L{{.*}}
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%add = add i32 %a, 1000000
@@ -424,7 +424,7 @@ define i32 @f22(i32 %a, i32 %b, i32 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: cije %r2, 0, .L{{.*}}
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%add = add i32 %a, 1000000
@@ -445,7 +445,7 @@ define i32 @f23(i32 %a, i32 %b, i32 *%dest1, i32 *%dest2) {
; CHECK-LABEL: f23:
; CHECK: afi %r2, 1000000
; CHECK-NEXT: st %r2, 0(%r4)
-; CHECK-NEXT: jne .L{{.*}}
+; CHECK-NEXT: bner %r14
; CHECK: br %r14
entry:
%res = add i32 %a, 1000000
@@ -491,7 +491,7 @@ define void @f25(i32 %a, i32 *%ptr) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jne .L{{.*}}
+; CHECK-NEXT: bner %r14
; CHECK: br %r14
entry:
%add = add i32 %a, 1000000
@@ -514,7 +514,7 @@ define void @f26(i32 %a, i32 *%ptr) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: cijlh %r2, 0, .L{{.*}}
+; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%add = add i32 %a, 1000000
@@ -537,7 +537,7 @@ define i32 @f27(i32 %a, i32 %b, i32 *%dest1, i32 *%dest2) {
; CHECK: afi %r2, 1000000
; CHECK-NEXT: sr %r3, %r2
; CHECK-NEXT: st %r3, 0(%r4)
-; CHECK-NEXT: cije %r2, 0, .L{{.*}}
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%add = add i32 %a, 1000000
@@ -558,7 +558,7 @@ exit:
define void @f28(i64 %a, i64 *%dest) {
; CHECK-LABEL: f28:
; CHECK: xi 0(%r2), 15
-; CHECK: cgije %r2, 0, .L{{.*}}
+; CHECK: cgibe %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%ptr = inttoptr i64 %a to i8 *
@@ -580,7 +580,7 @@ exit:
define i32 @f29(i64 %base, i64 %index, i32 *%dest) {
; CHECK-LABEL: f29:
; CHECK: lt %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: jle .L{{.*}}
+; CHECK-NEXT: bler %r14
; CHECK: br %r14
entry:
%add = add i64 %base, %index
@@ -601,7 +601,7 @@ exit:
define i32 @f30(i64 %base, i64 %index, i32 *%dest) {
; CHECK-LABEL: f30:
; CHECK: lt %r2, 100000({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: jle .L{{.*}}
+; CHECK-NEXT: bler %r14
; CHECK: br %r14
entry:
%add1 = add i64 %base, %index
@@ -623,7 +623,7 @@ exit:
define i64 @f31(i64 %base, i64 %index, i64 *%dest) {
; CHECK-LABEL: f31:
; CHECK: ltg %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: jhe .L{{.*}}
+; CHECK-NEXT: bher %r14
; CHECK: br %r14
entry:
%add = add i64 %base, %index
@@ -644,7 +644,7 @@ exit:
define i64 @f32(i64 %base, i64 %index, i64 *%dest) {
; CHECK-LABEL: f32:
; CHECK: ltgf %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%add = add i64 %base, %index
@@ -669,7 +669,7 @@ define i32 @f33(i32 %dummy, i32 %val, i32 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
call void asm sideeffect "blah $0", "{r2}"(i32 %val)
@@ -691,7 +691,7 @@ define i64 @f34(i64 %dummy, i64 %val, i64 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
call void asm sideeffect "blah $0", "{r2}"(i64 %val)
@@ -713,7 +713,7 @@ define i64 @f35(i64 %dummy, i32 %val, i64 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%ext = sext i32 %val to i64
@@ -737,7 +737,7 @@ define i32 @f36(i32 %val, i32 %dummy, i32 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r3
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
call void asm sideeffect "blah $0", "{r3}"(i32 %val)
@@ -760,7 +760,7 @@ define i64 @f37(i64 %val, i64 %dummy, i64 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r3
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
call void asm sideeffect "blah $0", "{r3}"(i64 %val)
@@ -783,7 +783,7 @@ define i32 @f38(i32 %val, i64 %dummy, i32 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r3
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jl .L{{.*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%ext = sext i32 %val to i64
@@ -806,7 +806,7 @@ define i64 @f39(i64 %dummy, i64 %a, i64 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%val = trunc i64 %a to i32
@@ -830,7 +830,7 @@ define i64 @f40(i64 %dummy, i64 %a, i64 *%dest) {
; CHECK-NEXT: #APP
; CHECK-NEXT: blah %r2
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%shl = shl i64 %a, 32
@@ -851,7 +851,7 @@ exit:
define i32 @f41(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f41:
; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: jne .L{{.*}}
+; CHECK-NEXT: bner %r14
; CHECK: br %r14
entry:
%cur = load i32 , i32 *%dest
@@ -871,7 +871,7 @@ exit:
define i64 @f42(i64 %base, i64 %index, i64 *%dest) {
; CHECK-LABEL: f42:
; CHECK: ltgf %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: jh .L{{.*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%add = add i64 %base, %index
diff --git a/test/CodeGen/SystemZ/int-cmp-46.ll b/test/CodeGen/SystemZ/int-cmp-46.ll
index f311942b9f86..8374cd3bec50 100644
--- a/test/CodeGen/SystemZ/int-cmp-46.ll
+++ b/test/CodeGen/SystemZ/int-cmp-46.ll
@@ -8,7 +8,7 @@
define void @f1(i32 %a) {
; CHECK-LABEL: f1:
; CHECK: tmll %r2, 1
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 1
@@ -27,7 +27,7 @@ exit:
define void @f2(i32 %a) {
; CHECK-LABEL: f2:
; CHECK: tmll %r2, 65535
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 65535
@@ -46,7 +46,7 @@ exit:
define void @f3(i32 %a) {
; CHECK-LABEL: f3:
; CHECK: tmlh %r2, 1
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 65536
@@ -83,7 +83,7 @@ exit:
define void @f5(i32 %a) {
; CHECK-LABEL: f5:
; CHECK: tmlh %r2, 65535
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 4294901760
@@ -103,7 +103,7 @@ exit:
define void @f6(i32 %a) {
; CHECK-LABEL: f6:
; CHECK: tmll %r2, 240
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 240
@@ -122,7 +122,7 @@ exit:
define void @f7(i32 %a) {
; CHECK-LABEL: f7:
; CHECK: tmll %r2, 240
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 240
@@ -142,7 +142,7 @@ exit:
define void @f8(i32 %a) {
; CHECK-LABEL: f8:
; CHECK: tmll %r2, 240
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 240
@@ -161,7 +161,7 @@ exit:
define void @f9(i32 %a) {
; CHECK-LABEL: f9:
; CHECK: tmll %r2, 240
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 240
@@ -181,7 +181,7 @@ exit:
define void @f10(i32 %a) {
; CHECK-LABEL: f10:
; CHECK: tmll %r2, 35
-; CHECK: jle {{\.L.*}}
+; CHECK: bler %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 35
@@ -200,7 +200,7 @@ exit:
define void @f11(i32 %a) {
; CHECK-LABEL: f11:
; CHECK: tmll %r2, 35
-; CHECK: jle {{\.L.*}}
+; CHECK: bler %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 35
@@ -220,7 +220,7 @@ exit:
define void @f12(i32 %a) {
; CHECK-LABEL: f12:
; CHECK: tmll %r2, 140
-; CHECK: jnle {{\.L.*}}
+; CHECK: bnler %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 140
@@ -239,7 +239,7 @@ exit:
define void @f13(i32 %a) {
; CHECK-LABEL: f13:
; CHECK: tmll %r2, 140
-; CHECK: jnle {{\.L.*}}
+; CHECK: bnler %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 140
@@ -258,7 +258,7 @@ exit:
define void @f14(i32 %a) {
; CHECK-LABEL: f14:
; CHECK: tmll %r2, 101
-; CHECK: jo {{\.L.*}}
+; CHECK: bor %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 101
@@ -277,7 +277,7 @@ exit:
define void @f15(i32 %a) {
; CHECK-LABEL: f15:
; CHECK: tmll %r2, 65519
-; CHECK: jno {{\.L.*}}
+; CHECK: bnor %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 65519
@@ -297,7 +297,7 @@ exit:
define void @f16(i32 %a) {
; CHECK-LABEL: f16:
; CHECK: tmll %r2, 130
-; CHECK: jno {{\.L.*}}
+; CHECK: bnor %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 130
@@ -316,7 +316,7 @@ exit:
define void @f17(i32 %a) {
; CHECK-LABEL: f17:
; CHECK: tmll %r2, 130
-; CHECK: jno {{\.L.*}}
+; CHECK: bnor %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 130
@@ -336,7 +336,7 @@ exit:
define void @f18(i32 %a) {
; CHECK-LABEL: f18:
; CHECK: tmll %r2, 194
-; CHECK: jo {{\.L.*}}
+; CHECK: bor %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 194
@@ -355,7 +355,7 @@ exit:
define void @f19(i32 %a) {
; CHECK-LABEL: f19:
; CHECK: tmll %r2, 194
-; CHECK: jo {{\.L.*}}
+; CHECK: bor %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 194
@@ -375,7 +375,7 @@ exit:
define void @f20(i32 %a) {
; CHECK-LABEL: f20:
; CHECK: tmll %r2, 20
-; CHECK: jl {{\.L.*}}
+; CHECK: blr %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 20
@@ -395,7 +395,7 @@ exit:
define void @f21(i32 %a) {
; CHECK-LABEL: f21:
; CHECK: tmll %r2, 20
-; CHECK: jnl {{\.L.*}}
+; CHECK: bnlr %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 20
@@ -415,7 +415,7 @@ exit:
define void @f22(i32 %a) {
; CHECK-LABEL: f22:
; CHECK: tmll %r2, 20
-; CHECK: jh {{\.L.*}}
+; CHECK: bhr %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 20
@@ -435,7 +435,7 @@ exit:
define void @f23(i32 %a) {
; CHECK-LABEL: f23:
; CHECK: tmll %r2, 20
-; CHECK: jnh {{\.L.*}}
+; CHECK: bnhr %r14
; CHECK: br %r14
entry:
%and = and i32 %a, 20
@@ -454,7 +454,7 @@ exit:
define void @f24(i32 %a) {
; CHECK-LABEL: f24:
; CHECK: tmll %r2, 255
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%shl = shl i32 %a, 12
@@ -474,7 +474,7 @@ exit:
define void @f25(i32 %a) {
; CHECK-LABEL: f25:
; CHECK: tmlh %r2, 512
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%shr = lshr i32 %a, 25
diff --git a/test/CodeGen/SystemZ/int-cmp-47.ll b/test/CodeGen/SystemZ/int-cmp-47.ll
index 274350d24de1..dc87284ff5f5 100644
--- a/test/CodeGen/SystemZ/int-cmp-47.ll
+++ b/test/CodeGen/SystemZ/int-cmp-47.ll
@@ -9,7 +9,7 @@
define void @f1(i64 %a) {
; CHECK-LABEL: f1:
; CHECK: tmll %r2, 1
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%and = and i64 %a, 1
@@ -28,7 +28,7 @@ exit:
define void @f2(i64 %a) {
; CHECK-LABEL: f2:
; CHECK: tmll %r2, 65535
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%and = and i64 %a, 65535
@@ -47,7 +47,7 @@ exit:
define void @f3(i64 %a) {
; CHECK-LABEL: f3:
; CHECK: tmlh %r2, 1
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%and = and i64 %a, 65536
@@ -84,7 +84,7 @@ exit:
define void @f5(i64 %a) {
; CHECK-LABEL: f5:
; CHECK: tmlh %r2, 65535
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%and = and i64 %a, 4294901760
@@ -103,7 +103,7 @@ exit:
define void @f6(i64 %a) {
; CHECK-LABEL: f6:
; CHECK: tmhl %r2, 1
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%and = and i64 %a, 4294967296
@@ -140,7 +140,7 @@ exit:
define void @f8(i64 %a) {
; CHECK-LABEL: f8:
; CHECK: tmhl %r2, 65535
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%and = and i64 %a, 281470681743360
@@ -159,7 +159,7 @@ exit:
define void @f9(i64 %a) {
; CHECK-LABEL: f9:
; CHECK: tmhh %r2, 1
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%and = and i64 %a, 281474976710656
@@ -178,7 +178,7 @@ exit:
define void @f10(i64 %a) {
; CHECK-LABEL: f10:
; CHECK: tmhh %r2, 65535
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%and = and i64 %a, 18446462598732840960
@@ -197,7 +197,7 @@ exit:
define void @f11(i64 %a) {
; CHECK-LABEL: f11:
; CHECK: tmhl %r2, 32768
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%shl = shl i64 %a, 1
@@ -217,7 +217,7 @@ exit:
define void @f12(i64 %a) {
; CHECK-LABEL: f12:
; CHECK: tmhh %r2, 256
-; CHECK: jne {{\.L.*}}
+; CHECK: bner %r14
; CHECK: br %r14
entry:
%shr = lshr i64 %a, 56
@@ -237,7 +237,7 @@ exit:
define void @f13(i64 %a) {
; CHECK-LABEL: f13:
; CHECK: tmhh %r2, 49152
-; CHECK: jno {{\.L.*}}
+; CHECK: bnor %r14
; CHECK: br %r14
entry:
%cmp = icmp ult i64 %a, 13835058055282163712
@@ -255,7 +255,7 @@ exit:
define void @f14(i64 %a) {
; CHECK-LABEL: f14:
; CHECK: tmhh %r2, 49152
-; CHECK: jno {{\.L.*}}
+; CHECK: bnor %r14
; CHECK: br %r14
entry:
%cmp = icmp ule i64 %a, 13835058055282163711
@@ -273,7 +273,7 @@ exit:
define void @f15(i64 %a) {
; CHECK-LABEL: f15:
; CHECK: tmhh %r2, 49152
-; CHECK: jo {{\.L.*}}
+; CHECK: bor %r14
; CHECK: br %r14
entry:
%cmp = icmp ugt i64 %a, 13835058055282163711
@@ -291,7 +291,7 @@ exit:
define void @f16(i64 %a) {
; CHECK-LABEL: f16:
; CHECK: tmhh %r2, 49152
-; CHECK: jo {{\.L.*}}
+; CHECK: bor %r14
; CHECK: br %r14
entry:
%cmp = icmp uge i64 %a, 13835058055282163712
@@ -329,7 +329,7 @@ exit:
define void @f18(i64 %a) {
; CHECK-LABEL: f18:
; CHECK-NOT: tmhh
-; CHECK: cgijhe %r2, 0,
+; CHECK: cgibhe %r2, 0, 0(%r14)
; CHECK: br %r14
entry:
%cmp = icmp ult i64 %a, 9223372036854775808
diff --git a/test/CodeGen/SystemZ/int-cmp-48.ll b/test/CodeGen/SystemZ/int-cmp-48.ll
index e26694753e7c..277423b8cc0c 100644
--- a/test/CodeGen/SystemZ/int-cmp-48.ll
+++ b/test/CodeGen/SystemZ/int-cmp-48.ll
@@ -8,7 +8,7 @@
define void @f1(i8 *%src) {
; CHECK-LABEL: f1:
; CHECK: tm 0(%r2), 1
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%byte = load i8 , i8 *%src
@@ -31,7 +31,7 @@ define void @f2(i8 *%src) {
; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
; CHECK: mvi 0(%r2), 0
; CHECK: tmll [[REG]], 1
-; CHECK: je {{\.L.*}}
+; CHECK: ber %r14
; CHECK: br %r14
entry:
%byte = load i8 , i8 *%src
diff --git a/test/CodeGen/SystemZ/memchr-01.ll b/test/CodeGen/SystemZ/memchr-01.ll
index f7509c4f256b..f4d381b37f26 100644
--- a/test/CodeGen/SystemZ/memchr-01.ll
+++ b/test/CodeGen/SystemZ/memchr-01.ll
@@ -13,7 +13,7 @@ define i8 *@f1(i8 *%src, i16 %char, i32 %len) {
; CHECK: [[LABEL:\.[^:]*]]:
; CHECK: srst %r2, [[REG]]
; CHECK-NEXT: jo [[LABEL]]
-; CHECK: jl {{\.L.*}}
+; CHECK: blr %r14
; CHECK: lghi %r2, 0
; CHECK: br %r14
%res = call i8 *@memchr(i8 *%src, i16 %char, i32 %len)
diff --git a/test/CodeGen/SystemZ/memchr-02.ll b/test/CodeGen/SystemZ/memchr-02.ll
index 71b2cf02b352..0cfca2af1e98 100644
--- a/test/CodeGen/SystemZ/memchr-02.ll
+++ b/test/CodeGen/SystemZ/memchr-02.ll
@@ -12,7 +12,7 @@ define i8 *@f1(i64 %len, i8 *%src, i32 %char) {
; CHECK: [[LABEL:\.[^:]*]]:
; CHECK: srst %r2, %r3
; CHECK-NEXT: jo [[LABEL]]
-; CHECK: jl {{\.L.*}}
+; CHECK: blr %r14
; CHECK: lghi %r2, 0
; CHECK: br %r14
%res = call i8 *@memchr(i8 *%src, i32 %char, i64 %len)
diff --git a/test/CodeGen/SystemZ/memchr-nobuiltin.ll b/test/CodeGen/SystemZ/memchr-nobuiltin.ll
new file mode 100644
index 000000000000..f94e1162ae4e
--- /dev/null
+++ b/test/CodeGen/SystemZ/memchr-nobuiltin.ll
@@ -0,0 +1,16 @@
+; Test that memchr won't be converted to SRST if calls are
+; marked with nobuiltin, e.g. for sanitizers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@memchr(i8 *%src, i16 %char, i32 %len)
+
+; Test a simple forwarded call.
+define i8 *@f1(i8 *%src, i16 %char, i32 %len) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: srst
+; CHECK: brasl %r14, memchr
+; CHECK: br %r14
+ %res = call i8 *@memchr(i8 *%src, i16 %char, i32 %len) nobuiltin
+ ret i8 *%res
+}
diff --git a/test/CodeGen/SystemZ/memcmp-01.ll b/test/CodeGen/SystemZ/memcmp-01.ll
index a01441946937..ac980e49d60b 100644
--- a/test/CodeGen/SystemZ/memcmp-01.ll
+++ b/test/CodeGen/SystemZ/memcmp-01.ll
@@ -29,7 +29,7 @@ define i32 @f2(i8 *%src1, i8 *%src2) {
define void @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f3:
; CHECK: clc 0(3,%r2), 0(%r3)
-; CHECK-NEXT: je {{\..*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 3)
%cmp = icmp eq i32 %res, 0
@@ -47,7 +47,7 @@ exit:
define void @f4(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f4:
; CHECK: clc 0(4,%r2), 0(%r3)
-; CHECK-NEXT: jlh {{\..*}}
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 4)
@@ -66,7 +66,7 @@ exit:
define void @f5(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f5:
; CHECK: clc 0(5,%r2), 0(%r3)
-; CHECK-NEXT: jl {{\..*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 5)
@@ -85,7 +85,7 @@ exit:
define void @f6(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-LABEL: f6:
; CHECK: clc 0(6,%r2), 0(%r3)
-; CHECK-NEXT: jh {{\..*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 6)
@@ -108,7 +108,7 @@ define i32 @f7(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK: ipm [[REG:%r[0-5]]]
; CHECK: srl [[REG]], 28
; CHECK: rll %r2, [[REG]], 31
-; CHECK: jl {{.L*}}
+; CHECK: blr %r14
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 256)
@@ -143,7 +143,7 @@ define void @f9(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK: jlh [[LABEL:\..*]]
; CHECK: clc 256(1,%r2), 256(%r3)
; CHECK: [[LABEL]]:
-; CHECK-NEXT: jl .L
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
diff --git a/test/CodeGen/SystemZ/memcmp-02.ll b/test/CodeGen/SystemZ/memcmp-02.ll
index 74b090dcdd8e..da11170def79 100644
--- a/test/CodeGen/SystemZ/memcmp-02.ll
+++ b/test/CodeGen/SystemZ/memcmp-02.ll
@@ -30,7 +30,7 @@ define i64 @f2(i8 *%src1, i8 *%src2) {
define void @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
; CHECK-LABEL: f3:
; CHECK: clc 0(3,%r2), 0(%r3)
-; CHECK-NEXT: je {{\..*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 3)
%cmp = icmp eq i64 %res, 0
@@ -48,7 +48,7 @@ exit:
define void @f4(i8 *%src1, i8 *%src2, i64 *%dest) {
; CHECK-LABEL: f4:
; CHECK: clc 0(4,%r2), 0(%r3)
-; CHECK-NEXT: jlh {{\..*}}
+; CHECK-NEXT: blhr %r14
; CHECK: br %r14
entry:
%res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 4)
@@ -67,7 +67,7 @@ exit:
define void @f5(i8 *%src1, i8 *%src2, i64 *%dest) {
; CHECK-LABEL: f5:
; CHECK: clc 0(5,%r2), 0(%r3)
-; CHECK-NEXT: jl {{\..*}}
+; CHECK-NEXT: blr %r14
; CHECK: br %r14
entry:
%res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 5)
@@ -86,7 +86,7 @@ exit:
define void @f6(i8 *%src1, i8 *%src2, i64 *%dest) {
; CHECK-LABEL: f6:
; CHECK: clc 0(6,%r2), 0(%r3)
-; CHECK-NEXT: jh {{\..*}}
+; CHECK-NEXT: bhr %r14
; CHECK: br %r14
entry:
%res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 6)
@@ -110,7 +110,7 @@ define i64 @f7(i8 *%src1, i8 *%src2, i64 *%dest) {
; CHECK: srl [[REG]], 28
; CHECK: rll [[REG]], [[REG]], 31
; CHECK: lgfr %r2, [[REG]]
-; CHECK: jl {{.L*}}
+; CHECK: blr %r14
; CHECK: br %r14
entry:
%res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 256)
diff --git a/test/CodeGen/SystemZ/memcmp-nobuiltin.ll b/test/CodeGen/SystemZ/memcmp-nobuiltin.ll
new file mode 100644
index 000000000000..5703552289f3
--- /dev/null
+++ b/test/CodeGen/SystemZ/memcmp-nobuiltin.ll
@@ -0,0 +1,191 @@
+; Test that memcmp won't be converted to CLC if calls are
+; marked with nobuiltin, e.g. for sanitizers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
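+; For reference, a single CLC instruction can compare at most 256 bytes, so
+; the sizes used below (257, 512, 513, 768, 769) straddle the one-, two- and
+; three-CLC boundaries that the builtin lowering would otherwise exercise.
+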
+declare signext i32 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
+
+; Even a zero-length comparison must stay as a call when nobuiltin is used.
+define i32 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 0) nobuiltin
+ ret i32 %res
+}
+
+; Check a case where the result is used as an integer.
+define i32 @f2(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 2) nobuiltin
+ ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 3) nobuiltin
+ %cmp = icmp eq i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f4:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 4) nobuiltin
+ %cmp = icmp ne i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f5:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 5) nobuiltin
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f6:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 6) nobuiltin
+ %cmp = icmp sgt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the upper end of the CLC range. Here the result is used both as
+; an integer and for branching.
+define i32 @f7(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f7:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 256) nobuiltin
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret i32 %res
+}
+
+; 257 bytes needs two CLCs.
+define i32 @f8(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f8:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257) nobuiltin
+ ret i32 %res
+}
+
+; Test a comparison of 257 bytes in which the CC result can be used directly.
+define void @f9(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f9:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257) nobuiltin
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Test the largest size that can use two CLCs.
+define i32 @f10(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f10:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 512) nobuiltin
+ ret i32 %res
+}
+
+; Test the smallest size that needs 3 CLCs.
+define i32 @f11(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f11:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 513) nobuiltin
+ ret i32 %res
+}
+
+; Test the largest size that can use 3 CLCs.
+define i32 @f12(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f12:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 768) nobuiltin
+ ret i32 %res
+}
+
+; The next size up uses a loop instead. We leave the more complicated
+; loop tests to memcpy-01.ll, which shares the same form.
+define i32 @f13(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f13:
+; CHECK-NOT: clc
+; CHECK: brasl %r14, memcmp
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 769) nobuiltin
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/pie.ll b/test/CodeGen/SystemZ/pie.ll
new file mode 100644
index 000000000000..8fc261454464
--- /dev/null
+++ b/test/CodeGen/SystemZ/pie.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=s390x-linux-gnu -relocation-model=pic < %s | FileCheck %s
+
+@foo = global i32 42
+
+define i32* @get_foo() {
+ ret i32* @foo
+}
+
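+; Because this is a position-independent executable, @foo is known to be
+; defined locally and can be addressed directly with LARL; the {{$}} below
+; guards against a @GOT-style suffix.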
+; CHECK: larl %r2, foo{{$}}
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"PIE Level", i32 2}
diff --git a/test/CodeGen/SystemZ/ret-addr-01.ll b/test/CodeGen/SystemZ/ret-addr-01.ll
new file mode 100644
index 000000000000..9c3b246af578
--- /dev/null
+++ b/test/CodeGen/SystemZ/ret-addr-01.ll
@@ -0,0 +1,15 @@
+; Test support for the llvm.returnaddress intrinsic.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; The current function's return address is in the link register.
+define i8* @rt0() norecurse nounwind readnone {
+entry:
+; CHECK-LABEL: rt0:
+; CHECK: lgr %r2, %r14
+; CHECK: br %r14
+ %0 = tail call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
diff --git a/test/CodeGen/SystemZ/risbg-01.ll b/test/CodeGen/SystemZ/risbg-01.ll
index d75e8e4b11a6..1c4315343de0 100644
--- a/test/CodeGen/SystemZ/risbg-01.ll
+++ b/test/CodeGen/SystemZ/risbg-01.ll
@@ -480,3 +480,24 @@ define i64 @f42(i1 %x) {
%ext2 = zext i8 %ext to i64
ret i64 %ext2
}
+
+; Check that we get the case where a 64-bit shift is used by a 32-bit and.
+define signext i32 @f43(i64 %x) {
+; CHECK-LABEL: f43:
+; CHECK: risbg [[REG:%r[0-5]]], %r2, 32, 189, 52
+; CHECK: lgfr %r2, [[REG]]
+ %shr3 = lshr i64 %x, 12
+ %shr3.tr = trunc i64 %shr3 to i32
+ %conv = and i32 %shr3.tr, -4
+ ret i32 %conv
+}
+
+; Check that we don't get the case where the 32-bit and mask is not contiguous.
+define signext i32 @f44(i64 %x) {
+; CHECK-LABEL: f44:
+; CHECK: srlg [[REG:%r[0-5]]], %r2, 12
+ %shr4 = lshr i64 %x, 12
+ %conv = trunc i64 %shr4 to i32
+ %and = and i32 %conv, 10
+ ret i32 %and
+}
diff --git a/test/CodeGen/SystemZ/risbg-02.ll b/test/CodeGen/SystemZ/risbg-02.ll
index 5ccfab028b02..094005acae4b 100644
--- a/test/CodeGen/SystemZ/risbg-02.ll
+++ b/test/CodeGen/SystemZ/risbg-02.ll
@@ -91,3 +91,28 @@ define i64 @f8(i64 %a, i64 %b) {
%or = or i64 %anda, %shrb
ret i64 %or
}
+
+; Check that we can get the case where a 64-bit shift feeds a 32-bit or of
+; ands with complement masks.
+define signext i32 @f9(i64 %x, i32 signext %y) {
+; CHECK-LABEL: f9:
+; CHECK: risbg [[REG:%r[0-5]]], %r2, 48, 63, 16
+; CHECK: lgfr %r2, [[REG]]
+ %shr6 = lshr i64 %x, 48
+ %conv = trunc i64 %shr6 to i32
+ %and1 = and i32 %y, -65536
+ %or = or i32 %conv, %and1
+ ret i32 %or
+}
+
+; Check that we don't get the case where a 64-bit shift feeds a 32-bit or of
+; ands with incompatible masks.
+define signext i32 @f10(i64 %x, i32 signext %y) {
+; CHECK-LABEL: f10:
+; CHECK: nilf %r3, 4278190080
+ %shr6 = lshr i64 %x, 48
+ %conv = trunc i64 %shr6 to i32
+ %and1 = and i32 %y, -16777216
+ %or = or i32 %conv, %and1
+ ret i32 %or
+}
diff --git a/test/CodeGen/SystemZ/rot-01.ll b/test/CodeGen/SystemZ/rot-01.ll
new file mode 100644
index 000000000000..ea275e68df54
--- /dev/null
+++ b/test/CodeGen/SystemZ/rot-01.ll
@@ -0,0 +1,35 @@
+; Test shortening of NILF to NILL when the result is used as a rotate amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
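+; NILL only operates on the low 16 bits of the register, but since the rotate
+; consumes just the low 6 bits of the amount, the shorter instruction is
+; sufficient here.
+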
+; Test 32-bit rotate.
+define i32 @f1(i32 %val, i32 %amt) {
+; CHECK-LABEL: f1:
+; CHECK: nill %r3, 31
+; CHECK: rll %r2, %r2, 0(%r3)
+ %mod = urem i32 %amt, 32
+
+ %inv = sub i32 32, %mod
+ %parta = shl i32 %val, %mod
+ %partb = lshr i32 %val, %inv
+
+ %rotl = or i32 %parta, %partb
+
+ ret i32 %rotl
+}
+
+; Test 64-bit rotate.
+define i64 @f2(i64 %val, i64 %amt) {
+; CHECK-LABEL: f2:
+; CHECK: nill %r3, 31
+; CHECK: rllg %r2, %r2, 0(%r3)
+ %mod = urem i64 %amt, 32
+
+ %inv = sub i64 64, %mod
+ %parta = shl i64 %val, %mod
+ %partb = lshr i64 %val, %inv
+
+ %rotl = or i64 %parta, %partb
+
+ ret i64 %rotl
+}
diff --git a/test/CodeGen/SystemZ/rot-02.ll b/test/CodeGen/SystemZ/rot-02.ll
new file mode 100644
index 000000000000..12b09f131850
--- /dev/null
+++ b/test/CodeGen/SystemZ/rot-02.ll
@@ -0,0 +1,86 @@
+; Test removal of AND operations that don't affect the last 6 bits of the
+; rotate amount operand.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
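+; SystemZ rotates take their amount from the low 6 bits of the second-operand
+; address, so an AND whose mask keeps those bits intact does not change the
+; result and can be dropped.
+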
+; Test that AND is not removed when some of the lower 6 bits are not set.
+define i32 @f1(i32 %val, i32 %amt) {
+; CHECK-LABEL: f1:
+; CHECK: nil{{[lf]}} %r3, 31
+; CHECK: rll %r2, %r2, 0(%r3)
+ %and = and i32 %amt, 31
+
+ %inv = sub i32 32, %and
+ %parta = shl i32 %val, %and
+ %partb = lshr i32 %val, %inv
+
+ %rotl = or i32 %parta, %partb
+
+ ret i32 %rotl
+}
+
+; Test removal of AND mask with only bottom 6 bits set.
+define i32 @f2(i32 %val, i32 %amt) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: nil{{[lf]}} %r3, 63
+; CHECK: rll %r2, %r2, 0(%r3)
+ %and = and i32 %amt, 63
+
+ %inv = sub i32 32, %and
+ %parta = shl i32 %val, %and
+ %partb = lshr i32 %val, %inv
+
+ %rotl = or i32 %parta, %partb
+
+ ret i32 %rotl
+}
+
+; Test removal of AND mask including but not limited to bottom 6 bits.
+define i32 @f3(i32 %val, i32 %amt) {
+; CHECK-LABEL: f3:
+; CHECK-NOT: nil{{[lf]}} %r3, 255
+; CHECK: rll %r2, %r2, 0(%r3)
+ %and = and i32 %amt, 255
+
+ %inv = sub i32 32, %and
+ %parta = shl i32 %val, %and
+ %partb = lshr i32 %val, %inv
+
+ %rotl = or i32 %parta, %partb
+
+ ret i32 %rotl
+}
+
+; Test removal of AND mask from RLLG.
+define i64 @f4(i64 %val, i64 %amt) {
+; CHECK-LABEL: f4:
+; CHECK-NOT: nil{{[lf]}} %r3, 63
+; CHECK: rllg %r2, %r2, 0(%r3)
+ %and = and i64 %amt, 63
+
+ %inv = sub i64 64, %and
+ %parta = shl i64 %val, %and
+ %partb = lshr i64 %val, %inv
+
+ %rotl = or i64 %parta, %partb
+
+ ret i64 %rotl
+}
+
+; Test that AND is not entirely removed if the result is reused.
+define i32 @f5(i32 %val, i32 %amt) {
+; CHECK-LABEL: f5:
+; CHECK: rll %r2, %r2, 0(%r3)
+; CHECK: nil{{[lf]}} %r3, 63
+; CHECK: ar %r2, %r3
+ %and = and i32 %amt, 63
+
+ %inv = sub i32 32, %and
+ %parta = shl i32 %val, %and
+ %partb = lshr i32 %val, %inv
+
+ %rotl = or i32 %parta, %partb
+
+ %reuse = add i32 %and, %rotl
+ ret i32 %reuse
+}
diff --git a/test/CodeGen/SystemZ/shift-11.ll b/test/CodeGen/SystemZ/shift-11.ll
new file mode 100644
index 000000000000..9741fa5a0b55
--- /dev/null
+++ b/test/CodeGen/SystemZ/shift-11.ll
@@ -0,0 +1,63 @@
+; Test shortening of NILF to NILL when the result is used as a shift amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test logical shift right.
+define i32 @f1(i32 %a, i32 %sh) {
+; CHECK-LABEL: f1:
+; CHECK: nill %r3, 31
+; CHECK: srl %r2, 0(%r3)
+ %and = and i32 %sh, 31
+ %shift = lshr i32 %a, %and
+ ret i32 %shift
+}
+
+; Test arithmetic shift right.
+define i32 @f2(i32 %a, i32 %sh) {
+; CHECK-LABEL: f2:
+; CHECK: nill %r3, 31
+; CHECK: sra %r2, 0(%r3)
+ %and = and i32 %sh, 31
+ %shift = ashr i32 %a, %and
+ ret i32 %shift
+}
+
+; Test shift left.
+define i32 @f3(i32 %a, i32 %sh) {
+; CHECK-LABEL: f3:
+; CHECK: nill %r3, 31
+; CHECK: sll %r2, 0(%r3)
+ %and = and i32 %sh, 31
+ %shift = shl i32 %a, %and
+ ret i32 %shift
+}
+
+; Test 64-bit logical shift right.
+define i64 @f4(i64 %a, i64 %sh) {
+; CHECK-LABEL: f4:
+; CHECK: nill %r3, 31
+; CHECK: srlg %r2, %r2, 0(%r3)
+ %and = and i64 %sh, 31
+ %shift = lshr i64 %a, %and
+ ret i64 %shift
+}
+
+; Test 64-bit arithmetic shift right.
+define i64 @f5(i64 %a, i64 %sh) {
+; CHECK-LABEL: f5:
+; CHECK: nill %r3, 31
+; CHECK: srag %r2, %r2, 0(%r3)
+ %and = and i64 %sh, 31
+ %shift = ashr i64 %a, %and
+ ret i64 %shift
+}
+
+; Test 64-bit shift left.
+define i64 @f6(i64 %a, i64 %sh) {
+; CHECK-LABEL: f6:
+; CHECK: nill %r3, 31
+; CHECK: sllg %r2, %r2, 0(%r3)
+ %and = and i64 %sh, 31
+ %shift = shl i64 %a, %and
+ ret i64 %shift
+}
diff --git a/test/CodeGen/SystemZ/shift-12.ll b/test/CodeGen/SystemZ/shift-12.ll
new file mode 100644
index 000000000000..4ebc42b44a47
--- /dev/null
+++ b/test/CodeGen/SystemZ/shift-12.ll
@@ -0,0 +1,106 @@
+; Test removal of AND operations that don't affect the last 6 bits of the
+; shift amount operand.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test that AND is not removed when some of the lower 6 bits are not set.
+define i32 @f1(i32 %a, i32 %sh) {
+; CHECK-LABEL: f1:
+; CHECK: nil{{[lf]}} %r3, 31
+; CHECK: sll %r2, 0(%r3)
+ %and = and i32 %sh, 31
+ %shift = shl i32 %a, %and
+ ret i32 %shift
+}
+
+; Test removal of AND mask with only bottom 6 bits set.
+define i32 @f2(i32 %a, i32 %sh) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: nil{{[lf]}} %r3, 63
+; CHECK: sll %r2, 0(%r3)
+ %and = and i32 %sh, 63
+ %shift = shl i32 %a, %and
+ ret i32 %shift
+}
+
+; Test removal of AND mask including but not limited to bottom 6 bits.
+define i32 @f3(i32 %a, i32 %sh) {
+; CHECK-LABEL: f3:
+; CHECK-NOT: nil{{[lf]}} %r3, 255
+; CHECK: sll %r2, 0(%r3)
+ %and = and i32 %sh, 255
+ %shift = shl i32 %a, %and
+ ret i32 %shift
+}
+
+; Test removal of AND mask from SRA.
+define i32 @f4(i32 %a, i32 %sh) {
+; CHECK-LABEL: f4:
+; CHECK-NOT: nil{{[lf]}} %r3, 63
+; CHECK: sra %r2, 0(%r3)
+ %and = and i32 %sh, 63
+ %shift = ashr i32 %a, %and
+ ret i32 %shift
+}
+
+; Test removal of AND mask from SRL.
+define i32 @f5(i32 %a, i32 %sh) {
+; CHECK-LABEL: f5:
+; CHECK-NOT: nil{{[lf]}} %r3, 63
+; CHECK: srl %r2, 0(%r3)
+ %and = and i32 %sh, 63
+ %shift = lshr i32 %a, %and
+ ret i32 %shift
+}
+
+; Test removal of AND mask from SLLG.
+define i64 @f6(i64 %a, i64 %sh) {
+; CHECK-LABEL: f6:
+; CHECK-NOT: nil{{[lf]}} %r3, 63
+; CHECK: sllg %r2, %r2, 0(%r3)
+ %and = and i64 %sh, 63
+ %shift = shl i64 %a, %and
+ ret i64 %shift
+}
+
+; Test removal of AND mask from SRAG.
+define i64 @f7(i64 %a, i64 %sh) {
+; CHECK-LABEL: f7:
+; CHECK-NOT: nil{{[lf]}} %r3, 63
+; CHECK: srag %r2, %r2, 0(%r3)
+ %and = and i64 %sh, 63
+ %shift = ashr i64 %a, %and
+ ret i64 %shift
+}
+
+; Test removal of AND mask from SRLG.
+define i64 @f8(i64 %a, i64 %sh) {
+; CHECK-LABEL: f8:
+; CHECK-NOT: nil{{[lf]}} %r3, 63
+; CHECK: srlg %r2, %r2, 0(%r3)
+ %and = and i64 %sh, 63
+ %shift = lshr i64 %a, %and
+ ret i64 %shift
+}
+
+; Test that AND with two register operands is not affected.
+define i32 @f9(i32 %a, i32 %b, i32 %sh) {
+; CHECK-LABEL: f9:
+; CHECK: nr %r3, %r4
+; CHECK: sll %r2, 0(%r3)
+ %and = and i32 %sh, %b
+ %shift = shl i32 %a, %and
+ ret i32 %shift
+}
+
+; Test that AND is not entirely removed if the result is reused.
+define i32 @f10(i32 %a, i32 %sh) {
+; CHECK-LABEL: f10:
+; CHECK: sll %r2, 0(%r3)
+; CHECK: nil{{[lf]}} %r3, 63
+; CHECK: ar %r2, %r3
+ %and = and i32 %sh, 63
+ %shift = shl i32 %a, %and
+ %reuse = add i32 %and, %shift
+ ret i32 %reuse
+}
diff --git a/test/CodeGen/SystemZ/stack-guard.ll b/test/CodeGen/SystemZ/stack-guard.ll
new file mode 100644
index 000000000000..0889e7ba941e
--- /dev/null
+++ b/test/CodeGen/SystemZ/stack-guard.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
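+; On s390x-linux the stack guard is read from offset 40 off the thread
+; pointer, which the EAR/SLLG/EAR sequence below reassembles from access
+; registers %a0 and %a1.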
+; CHECK-LABEL: @test_stack_guard
+; CHECK: ear [[REG1:%r[1-9][0-9]?]], %a0
+; CHECK: sllg [[REG1]], [[REG1]], 32
+; CHECK: ear [[REG1]], %a1
+; CHECK: lg [[REG1]], 40([[REG1]])
+; CHECK: stg [[REG1]], {{[0-9]*}}(%r15)
+; CHECK: brasl %r14, foo3@PLT
+; CHECK: ear [[REG2:%r[1-9][0-9]?]], %a0
+; CHECK: sllg [[REG2]], [[REG2]], 32
+; CHECK: ear [[REG2]], %a1
+; CHECK: lg [[REG2]], 40([[REG2]])
+; CHECK: sg [[REG2]], {{[0-9]*}}(%r15)
+
+define i32 @test_stack_guard() #0 {
+entry:
+ %a1 = alloca [256 x i32], align 4
+ %0 = bitcast [256 x i32]* %a1 to i8*
+ call void @llvm.lifetime.start(i64 1024, i8* %0)
+ %arraydecay = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i64 0, i64 0
+ call void @foo3(i32* %arraydecay)
+ call void @llvm.lifetime.end(i64 1024, i8* %0)
+ ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { sspstrong }
diff --git a/test/CodeGen/SystemZ/strcmp-01.ll b/test/CodeGen/SystemZ/strcmp-01.ll
index 122c160babaf..a30663a13f1f 100644
--- a/test/CodeGen/SystemZ/strcmp-01.ll
+++ b/test/CodeGen/SystemZ/strcmp-01.ll
@@ -28,7 +28,7 @@ define void @f2(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK: clst %r2, %r3
; CHECK-NEXT: jo [[LABEL]]
; CHECK-NEXT: BB#{{[0-9]+}}
-; CHECK-NEXT: je {{\.L.*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%res = call i32 @strcmp(i8 *%src1, i8 *%src2)
%cmp = icmp eq i32 %res, 0
@@ -54,7 +54,7 @@ define i32 @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
; CHECK-NEXT: ipm [[REG:%r[0-5]]]
; CHECK: srl [[REG]], 28
; CHECK: rll %r2, [[REG]], 31
-; CHECK: jl {{\.L*}}
+; CHECK: blr %r14
; CHECK: br %r14
entry:
%res = call i32 @strcmp(i8 *%src1, i8 *%src2)
diff --git a/test/CodeGen/SystemZ/strcmp-02.ll b/test/CodeGen/SystemZ/strcmp-02.ll
index 27bd00b47fd3..99d7d9cfa692 100644
--- a/test/CodeGen/SystemZ/strcmp-02.ll
+++ b/test/CodeGen/SystemZ/strcmp-02.ll
@@ -29,7 +29,7 @@ define void @f2(i8 *%src1, i8 *%src2, i64 *%dest) {
; CHECK: clst %r2, %r3
; CHECK-NEXT: jo [[LABEL]]
; CHECK-NEXT: BB#{{[0-9]+}}
-; CHECK-NEXT: je {{\.L.*}}
+; CHECK-NEXT: ber %r14
; CHECK: br %r14
%res = call i64 @strcmp(i8 *%src1, i8 *%src2)
%cmp = icmp eq i64 %res, 0
@@ -56,7 +56,7 @@ define i64 @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
; CHECK: srl [[REG]], 28
; CHECK: rll [[REG]], [[REG]], 31
; CHECK: lgfr %r2, [[REG]]
-; CHECK: jl {{\.L*}}
+; CHECK: blr %r14
; CHECK: br %r14
entry:
%res = call i64 @strcmp(i8 *%src1, i8 *%src2)
diff --git a/test/CodeGen/SystemZ/strcmp-nobuiltin.ll b/test/CodeGen/SystemZ/strcmp-nobuiltin.ll
new file mode 100644
index 000000000000..187348881a6d
--- /dev/null
+++ b/test/CodeGen/SystemZ/strcmp-nobuiltin.ll
@@ -0,0 +1,54 @@
+; Test that strcmp won't be converted to CLST if calls are
+; marked with nobuiltin, e.g. for sanitizers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare signext i32 @strcmp(i8 *%src1, i8 *%src2)
+
+; Check a case where the result is used as an integer.
+define i32 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: clst
+; CHECK: brasl %r14, strcmp
+; CHECK: br %r14
+ %res = call i32 @strcmp(i8 *%src1, i8 *%src2) nobuiltin
+ ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f2(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: clst
+; CHECK: brasl %r14, strcmp
+; CHECK: br %r14
+ %res = call i32 @strcmp(i8 *%src1, i8 *%src2) nobuiltin
+ %cmp = icmp eq i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Test a case where the result is used both as an integer and for
+; branching.
+define i32 @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK-NOT: clst
+; CHECK: brasl %r14, strcmp
+; CHECK: br %r14
+entry:
+ %res = call i32 @strcmp(i8 *%src1, i8 *%src2) nobuiltin
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/strcpy-nobuiltin.ll b/test/CodeGen/SystemZ/strcpy-nobuiltin.ll
new file mode 100644
index 000000000000..746fd67e0840
--- /dev/null
+++ b/test/CodeGen/SystemZ/strcpy-nobuiltin.ll
@@ -0,0 +1,42 @@
+; Test that strcpy/stpcpy won't be converted to MVST if calls are
+; marked with nobuiltin, e.g. for sanitizers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@strcpy(i8 *%dest, i8 *%src)
+declare i8 *@stpcpy(i8 *%dest, i8 *%src)
+
+; Check strcpy.
+define i8 *@f1(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: mvst
+; CHECK: brasl %r14, strcpy
+; CHECK: br %r14
+ %res = call i8 *@strcpy(i8 *%dest, i8 *%src) nobuiltin
+ ret i8 *%res
+}
+
+; Check stpcpy.
+define i8 *@f2(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: mvst
+; CHECK: brasl %r14, stpcpy
+; CHECK: br %r14
+ %res = call i8 *@stpcpy(i8 *%dest, i8 *%src) nobuiltin
+ ret i8 *%res
+}
+
+; Check correct operation with other loads and stores. The load must
+; come before the call and the store afterwards.
+define i32 @f3(i32 %dummy, i8 *%dest, i8 *%src, i32 *%resptr, i32 *%storeptr) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: l [[REG1:%r[0-9]+]], 0(%r5)
+; CHECK-NOT: mvst
+; CHECK: brasl %r14, strcpy
+; CHECK: mvhi 0(%r6), 0
+; CHECK: br %r14
+ %res = load i32 , i32 *%resptr
+ %unused = call i8 *@strcpy(i8 *%dest, i8 *%src) nobuiltin
+ store i32 0, i32 *%storeptr
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/strlen-nobuiltin.ll b/test/CodeGen/SystemZ/strlen-nobuiltin.ll
new file mode 100644
index 000000000000..c16e601def35
--- /dev/null
+++ b/test/CodeGen/SystemZ/strlen-nobuiltin.ll
@@ -0,0 +1,25 @@
+; Test that strlen/strnlen won't be converted to SRST if calls are
+; marked with nobuiltin, e.g. for sanitizers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @strlen(i8 *%src)
+declare i64 @strnlen(i8 *%src, i64 %len)
+
+define i64 @f1(i32 %dummy, i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK: brasl %r14, strlen
+; CHECK: br %r14
+ %res = call i64 @strlen(i8 *%src) nobuiltin
+ ret i64 %res
+}
+
+; Likewise for strnlen.
+define i64 @f2(i64 %len, i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: srst
+; CHECK: brasl %r14, strnlen
+; CHECK: br %r14
+ %res = call i64 @strnlen(i8 *%src, i64 %len) nobuiltin
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/swift-return.ll b/test/CodeGen/SystemZ/swift-return.ll
new file mode 100644
index 000000000000..e72d6def84e8
--- /dev/null
+++ b/test/CodeGen/SystemZ/swift-return.ll
@@ -0,0 +1,203 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -O0 -verify-machineinstrs | FileCheck --check-prefix=CHECK-O0 %s
+
+@var = global i32 0
+
+; Test how LLVM handles a return type of {i16, i8}. The return value will be
+; passed in %r2 and %r3.
+; CHECK-LABEL: test:
+; CHECK: st %r2
+; CHECK: brasl %r14, gen
+; CHECK-DAG: lhr %r2, %r2
+; CHECK-DAG: lbr %[[REG1:r[0-9]+]], %r3
+; CHECK: ar %r2, %[[REG1]]
+; CHECK-O0-LABEL: test
+; CHECK-O0: st %r2
+; CHECK-O0: brasl %r14, gen
+; CHECK-O0-DAG: lhr %[[REG1:r[0-9]+]], %r2
+; CHECK-O0-DAG: lbr %[[REG2:r[0-9]+]], %r3
+; CHECK-O0: ar %[[REG1]], %[[REG2]]
+; CHECK-O0: lr %r2, %[[REG1]]
+define i16 @test(i32 %key) {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i16, i8 } @gen(i32 %0)
+ %v3 = extractvalue { i16, i8 } %call, 0
+ %v1 = sext i16 %v3 to i32
+ %v5 = extractvalue { i16, i8 } %call, 1
+ %v2 = sext i8 %v5 to i32
+ %add = add nsw i32 %v1, %v2
+ %conv = trunc i32 %add to i16
+ ret i16 %conv
+}
+
+declare swiftcc { i16, i8 } @gen(i32)
+
+; If we can't pass every return value in registers, we will pass everything
+; in memory. The caller provides space for the return value and passes
+; the address in %r2. The first input argument will be in %r3.
+; CHECK-LABEL: test2:
+; CHECK: lr %[[REG1:r[0-9]+]], %r2
+; CHECK-DAG: la %r2, 160(%r15)
+; CHECK-DAG: lr %r3, %[[REG1]]
+; CHECK: brasl %r14, gen2
+; CHECK: l %r2, 160(%r15)
+; CHECK: a %r2, 164(%r15)
+; CHECK: a %r2, 168(%r15)
+; CHECK: a %r2, 172(%r15)
+; CHECK: a %r2, 176(%r15)
+; CHECK-O0-LABEL: test2:
+; CHECK-O0: la %[[REG1:r[0-9]+]], 168(%r15)
+; CHECK-O0: st %r2, [[SPILL1:[0-9]+]](%r15)
+; CHECK-O0: lgr %r2, %[[REG1]]
+; CHECK-O0: l %r3, [[SPILL1]](%r15)
+; CHECK-O0: brasl %r14, gen2
+; CHECK-O0-DAG: l %r{{.*}}, 184(%r15)
+; CHECK-O0-DAG: l %r{{.*}}, 180(%r15)
+; CHECK-O0-DAG: l %r{{.*}}, 176(%r15)
+; CHECK-O0-DAG: l %r{{.*}}, 172(%r15)
+; CHECK-O0-DAG: l %r{{.*}}, 168(%r15)
+; CHECK-O0: ar
+; CHECK-O0: ar
+; CHECK-O0: ar
+; CHECK-O0: ar
+; CHECK-O0: lr %r2
+define i32 @test2(i32 %key) #0 {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i32, i32, i32, i32, i32 } @gen2(i32 %0)
+
+ %v3 = extractvalue { i32, i32, i32, i32, i32 } %call, 0
+ %v5 = extractvalue { i32, i32, i32, i32, i32 } %call, 1
+ %v6 = extractvalue { i32, i32, i32, i32, i32 } %call, 2
+ %v7 = extractvalue { i32, i32, i32, i32, i32 } %call, 3
+ %v8 = extractvalue { i32, i32, i32, i32, i32 } %call, 4
+
+ %add = add nsw i32 %v3, %v5
+ %add1 = add nsw i32 %add, %v6
+ %add2 = add nsw i32 %add1, %v7
+ %add3 = add nsw i32 %add2, %v8
+ ret i32 %add3
+}
+
+; The address of the return value is passed in %r2.
+; On return, %r2 will contain the address that was passed in by the caller.
+; CHECK-LABEL: gen2:
+; CHECK: st %r3, 16(%r2)
+; CHECK: st %r3, 12(%r2)
+; CHECK: st %r3, 8(%r2)
+; CHECK: st %r3, 4(%r2)
+; CHECK: st %r3, 0(%r2)
+; CHECK-O0-LABEL: gen2:
+; CHECK-O0-DAG: st %r3, 16(%r2)
+; CHECK-O0-DAG: st %r3, 12(%r2)
+; CHECK-O0-DAG: st %r3, 8(%r2)
+; CHECK-O0-DAG: st %r3, 4(%r2)
+; CHECK-O0-DAG: st %r3, 0(%r2)
+define swiftcc { i32, i32, i32, i32, i32 } @gen2(i32 %key) {
+ %Y = insertvalue { i32, i32, i32, i32, i32 } undef, i32 %key, 0
+ %Z = insertvalue { i32, i32, i32, i32, i32 } %Y, i32 %key, 1
+ %Z2 = insertvalue { i32, i32, i32, i32, i32 } %Z, i32 %key, 2
+ %Z3 = insertvalue { i32, i32, i32, i32, i32 } %Z2, i32 %key, 3
+ %Z4 = insertvalue { i32, i32, i32, i32, i32 } %Z3, i32 %key, 4
+ ret { i32, i32, i32, i32, i32 } %Z4
+}
+
+; The return value {i32, i32, i32, i32} will be returned via registers
+; %r2, %r3, %r4, %r5.
+; CHECK-LABEL: test3:
+; CHECK: brasl %r14, gen3
+; CHECK: ar %r2, %r3
+; CHECK: ar %r2, %r4
+; CHECK: ar %r2, %r5
+; CHECK-O0-LABEL: test3:
+; CHECK-O0: brasl %r14, gen3
+; CHECK-O0: ar %r2, %r3
+; CHECK-O0: ar %r2, %r4
+; CHECK-O0: ar %r2, %r5
+define i32 @test3(i32 %key) #0 {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i32, i32, i32, i32 } @gen3(i32 %0)
+
+ %v3 = extractvalue { i32, i32, i32, i32 } %call, 0
+ %v5 = extractvalue { i32, i32, i32, i32 } %call, 1
+ %v6 = extractvalue { i32, i32, i32, i32 } %call, 2
+ %v7 = extractvalue { i32, i32, i32, i32 } %call, 3
+
+ %add = add nsw i32 %v3, %v5
+ %add1 = add nsw i32 %add, %v6
+ %add2 = add nsw i32 %add1, %v7
+ ret i32 %add2
+}
+
+declare swiftcc { i32, i32, i32, i32 } @gen3(i32 %key)
+
+; The return value {float, float, float, float} will be returned via registers
+; %f0, %f2, %f4, %f6.
+; CHECK-LABEL: test4:
+; CHECK: brasl %r14, gen4
+; CHECK: aebr %f0, %f2
+; CHECK: aebr %f0, %f4
+; CHECK: aebr %f0, %f6
+; CHECK-O0-LABEL: test4:
+; CHECK-O0: brasl %r14, gen4
+; CHECK-O0: aebr %f0, %f2
+; CHECK-O0: aebr %f0, %f4
+; CHECK-O0: aebr %f0, %f6
+define float @test4(float %key) #0 {
+entry:
+ %key.addr = alloca float, align 4
+ store float %key, float* %key.addr, align 4
+ %0 = load float, float* %key.addr, align 4
+ %call = call swiftcc { float, float, float, float } @gen4(float %0)
+
+ %v3 = extractvalue { float, float, float, float } %call, 0
+ %v5 = extractvalue { float, float, float, float } %call, 1
+ %v6 = extractvalue { float, float, float, float } %call, 2
+ %v7 = extractvalue { float, float, float, float } %call, 3
+
+ %add = fadd float %v3, %v5
+ %add1 = fadd float %add, %v6
+ %add2 = fadd float %add1, %v7
+ ret float %add2
+}
+
+declare swiftcc { float, float, float, float } @gen4(float %key)
+
+; CHECK-LABEL: consume_i1_ret:
+; CHECK: brasl %r14, produce_i1_ret
+; CHECK: nilf %r2, 1
+; CHECK: nilf %r3, 1
+; CHECK: nilf %r4, 1
+; CHECK: nilf %r5, 1
+; CHECK-O0-LABEL: consume_i1_ret:
+; CHECK-O0: brasl %r14, produce_i1_ret
+; CHECK-O0: nilf %r2, 1
+; CHECK-O0: nilf %r3, 1
+; CHECK-O0: nilf %r4, 1
+; CHECK-O0: nilf %r5, 1
+define void @consume_i1_ret() {
+ %call = call swiftcc { i1, i1, i1, i1 } @produce_i1_ret()
+ %v3 = extractvalue { i1, i1, i1, i1 } %call, 0
+ %v5 = extractvalue { i1, i1, i1, i1 } %call, 1
+ %v6 = extractvalue { i1, i1, i1, i1 } %call, 2
+ %v7 = extractvalue { i1, i1, i1, i1 } %call, 3
+ %val = zext i1 %v3 to i32
+ store i32 %val, i32* @var
+ %val2 = zext i1 %v5 to i32
+ store i32 %val2, i32* @var
+ %val3 = zext i1 %v6 to i32
+ store i32 %val3, i32* @var
+ %val4 = zext i1 %v7 to i32
+ store i32 %val4, i32* @var
+ ret void
+}
+
+declare swiftcc { i1, i1, i1, i1 } @produce_i1_ret()
diff --git a/test/CodeGen/SystemZ/swifterror.ll b/test/CodeGen/SystemZ/swifterror.ll
new file mode 100644
index 000000000000..90d55eef4aef
--- /dev/null
+++ b/test/CodeGen/SystemZ/swifterror.ll
@@ -0,0 +1,358 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=s390x-linux-gnu | FileCheck --check-prefix=CHECK-O0 %s
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+%swift_error = type {i64, i8}
+
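+; On SystemZ the swifterror value is carried in %r9 across calls, which is
+; what the checks below pin down.
+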
+; This tests the basic usage of a swifterror parameter. "foo" is the function
+; that takes a swifterror parameter and "caller" is the caller of "foo".
+define float @foo(%swift_error** swifterror %error_ptr_ref) {
+; CHECK-LABEL: foo:
+; CHECK: lghi %r2, 16
+; CHECK: brasl %r14, malloc
+; CHECK: mvi 8(%r2), 1
+; CHECK: lgr %r9, %r2
+; CHECK-O0-LABEL: foo:
+; CHECK-O0: lghi %r2, 16
+; CHECK-O0: brasl %r14, malloc
+; CHECK-O0: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK-O0: mvi 8(%r2), 1
+; CHECK-O0: lgr %r9, %r[[REG1]]
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+}
+
+; "caller" calls "foo" that takes a swifterror parameter.
+define float @caller(i8* %error_ref) {
+; CHECK-LABEL: caller:
+; Make a copy of error_ref because r2 is getting clobbered
+; CHECK: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK: lghi %r9, 0
+; CHECK: brasl %r14, foo
+; CHECK: cgijlh %r9, 0,
+; Access part of the error object and save it to error_ref
+; CHECK: lb %r[[REG2:[0-9]+]], 8(%r9)
+; CHECK: stc %r[[REG2]], 0(%r[[REG1]])
+; CHECK: lgr %r2, %r9
+; CHECK: brasl %r14, free
+; CHECK-O0-LABEL: caller:
+; CHECK-O0: lghi %r9, 0
+; CHECK-O0: brasl %r14, foo
+; CHECK-O0: cghi %r9, 0
+; CHECK-O0: jlh
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "caller2" is the caller of "foo", it calls "foo" inside a loop.
+define float @caller2(i8* %error_ref) {
+; CHECK-LABEL: caller2:
+; Make a copy of error_ref because r2 is getting clobbered
+; CHECK: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK: lghi %r9, 0
+; CHECK: brasl %r14, foo
+; CHECK: cgijlh %r9, 0,
+; CHECK: ceb %f0,
+; CHECK: jnh
+; Access part of the error object and save it to error_ref
+; CHECK: lb %r[[REG2:[0-9]+]], 8(%r9)
+; CHECK: stc %r[[REG2]], 0(%r[[REG1]])
+; CHECK: lgr %r2, %r9
+; CHECK: brasl %r14, free
+; CHECK-O0-LABEL: caller2:
+; CHECK-O0: lghi %r9, 0
+; CHECK-O0: brasl %r14, foo
+; CHECK-O0: cghi %r9, 0
+; CHECK-O0: jlh
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ br label %bb_loop
+bb_loop:
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %cmp = fcmp ogt float %call, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "foo_if" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition.
+define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) {
+; CHECK-LABEL: foo_if:
+; CHECK: cije %r2, 0
+; CHECK: lghi %r2, 16
+; CHECK: brasl %r14, malloc
+; CHECK: mvi 8(%r2), 1
+; CHECK: lgr %r9, %r2
+; CHECK-NOT: %r9
+; CHECK: br %r14
+; CHECK-O0-LABEL: foo_if:
+; CHECK-O0: chi %r2, 0
+; spill to stack
+; CHECK-O0: stg %r9, [[OFFS:[0-9]+]](%r15)
+; CHECK-O0: je
+; CHECK-O0: lghi %r2, 16
+; CHECK-O0: brasl %r14, malloc
+; CHECK-O0: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK-O0: mvi 8(%r2), 1
+; CHECK-O0: lgr %r9, %r[[REG1]]
+; CHECK-O0: br %r14
+; reload from stack
+; CHECK-O0: lg %r9, [[OFFS]](%r15)
+; CHECK-O0: br %r14
+entry:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %normal
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+
+normal:
+ ret float 0.0
+}
+
+; "foo_loop" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition inside a loop.
+define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float %cc2) {
+; CHECK-LABEL: foo_loop:
+; CHECK: lr %r[[REG1:[0-9]+]], %r2
+; CHECK: cije %r[[REG1]], 0
+; CHECK: lghi %r2, 16
+; CHECK: brasl %r14, malloc
+; CHECK: mvi 8(%r2), 1
+; CHECK: ceb %f8,
+; CHECK: jnh
+; CHECK: lgr %r9, %r2
+; CHECK: br %r14
+; CHECK-O0-LABEL: foo_loop:
+; spill to stack
+; CHECK-O0: stg %r9, [[OFFS:[0-9]+]](%r15)
+; CHECK-O0: chi %r{{.*}}, 0
+; CHECK-O0: je
+; CHECK-O0: lghi %r2, 16
+; CHECK-O0: brasl %r14, malloc
+; CHECK-O0: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK-O0: mvi 8(%r2), 1
+; CHECK-O0: jnh
+; reload from stack
+; CHECK-O0: lg %r9, [[OFFS:[0-9]+]](%r15)
+; CHECK-O0: br %r14
+entry:
+ br label %bb_loop
+
+bb_loop:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %bb_cont
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ br label %bb_cont
+
+bb_cont:
+ %cmp = fcmp ogt float %cc2, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ ret float 0.0
+}
+
+%struct.S = type { i32, i32, i32, i32, i32, i32 }
+
+; "foo_sret" is a function that takes a swifterror parameter, it also has a sret
+; parameter.
+define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swifterror %error_ptr_ref) {
+; CHECK-LABEL: foo_sret:
+; CHECK-DAG: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK-DAG: lr %r[[REG2:[0-9]+]], %r3
+; CHECK: lghi %r2, 16
+; CHECK: brasl %r14, malloc
+; CHECK: mvi 8(%r2), 1
+; CHECK: st %r[[REG2]], 4(%r[[REG1]])
+; CHECK: lgr %r9, %r2
+; CHECK-NOT: %r9
+; CHECK: br %r14
+
+; CHECK-O0-LABEL: foo_sret:
+; CHECK-O0: lghi %r{{.*}}, 16
+; spill sret to stack
+; CHECK-O0: stg %r2, [[OFFS1:[0-9]+]](%r15)
+; CHECK-O0: lgr %r2, %r{{.*}}
+; CHECK-O0: st %r3, [[OFFS2:[0-9]+]](%r15)
+; CHECK-O0: brasl %r14, malloc
+; CHECK-O0: lgr {{.*}}, %r2
+; CHECK-O0: mvi 8(%r2), 1
+; CHECK-O0-DAG: lg %r[[REG1:[0-9]+]], [[OFFS1]](%r15)
+; CHECK-O0-DAG: l %r[[REG2:[0-9]+]], [[OFFS2]](%r15)
+; CHECK-O0: st %r[[REG2]], 4(%r[[REG1]])
+; CHECK-O0: lgr %r9, {{.*}}
+; CHECK-O0: br %r14
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ %v2 = getelementptr inbounds %struct.S, %struct.S* %agg.result, i32 0, i32 1
+ store i32 %val1, i32* %v2
+ ret void
+}
+
+; "caller3" calls "foo_sret" that takes a swifterror parameter.
+define float @caller3(i8* %error_ref) {
+; CHECK-LABEL: caller3:
+; Make a copy of error_ref because r2 is getting clobbered
+; CHECK: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK: lhi %r3, 1
+; CHECK: lghi %r9, 0
+; CHECK: brasl %r14, foo_sret
+; CHECK: cgijlh %r9, 0,
+; Access part of the error object and save it to error_ref
+; CHECK: lb %r0, 8(%r9)
+; CHECK: stc %r0, 0(%r[[REG1]])
+; CHECK: lgr %r2, %r9
+; CHECK: brasl %r14, free
+
+; CHECK-O0-LABEL: caller3:
+; CHECK-O0: lghi %r9, 0
+; CHECK-O0: lhi %r3, 1
+; CHECK-O0: stg %r2, {{.*}}(%r15)
+; CHECK-O0: lgr %r2, {{.*}}
+; CHECK-O0: brasl %r14, foo_sret
+; CHECK-O0: lgr {{.*}}, %r9
+; CHECK-O0: cghi %r9, 0
+; CHECK-O0: jlh
+; Access part of the error object and save it to error_ref
+; CHECK-O0: lb %r0, 8(%r{{.*}})
+; CHECK-O0: stc %r0, 0(%r{{.*}})
+; reload from stack
+; CHECK-O0: lg %r2, {{.*}}(%r15)
+; CHECK-O0: brasl %r14, free
+entry:
+ %s = alloca %struct.S, align 8
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ call void @foo_sret(%struct.S* sret %s, i32 1, %swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; This is a caller with multiple swifterror values; it calls "foo" twice, each
+; time with a different swifterror value, from "alloca swifterror".
+define float @caller_with_multiple_swifterror_values(i8* %error_ref, i8* %error_ref2) {
+; CHECK-LABEL: caller_with_multiple_swifterror_values:
+; CHECK-DAG: lgr %r[[REG1:[0-9]+]], %r2
+; CHECK-DAG: lgr %r[[REG2:[0-9]+]], %r3
+; The first swifterror value:
+; CHECK: lghi %r9, 0
+; CHECK: brasl %r14, foo
+; CHECK: cgijlh %r9, 0,
+; Access part of the error object and save it to error_ref
+; CHECK: lb %r0, 8(%r9)
+; CHECK: stc %r0, 0(%r[[REG1]])
+; CHECK: lgr %r2, %r9
+; CHECK: brasl %r14, free
+
+; The second swifterror value:
+; CHECK: lghi %r9, 0
+; CHECK: brasl %r14, foo
+; CHECK: cgijlh %r9, 0,
+; Access part of the error object and save it to error_ref
+; CHECK: lb %r0, 8(%r9)
+; CHECK: stc %r0, 0(%r[[REG2]])
+; CHECK: lgr %r2, %r9
+; CHECK: brasl %r14, free
+
+; CHECK-O0-LABEL: caller_with_multiple_swifterror_values:
+
+; The first swifterror value:
+; CHECK-O0: lghi %r9, 0
+; CHECK-O0: brasl %r14, foo
+; CHECK-O0: jlh
+
+; The second swifterror value:
+; CHECK-O0: lghi %r9, 0
+; CHECK-O0: brasl %r14, foo
+; CHECK-O0: jlh
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+
+ %error_ptr_ref2 = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref2
+ %call2 = call float @foo(%swift_error** swifterror %error_ptr_ref2)
+ %error_from_foo2 = load %swift_error*, %swift_error** %error_ptr_ref2
+ %had_error_from_foo2 = icmp ne %swift_error* %error_from_foo2, null
+ %bitcast2 = bitcast %swift_error* %error_from_foo2 to i8*
+ br i1 %had_error_from_foo2, label %handler2, label %cont2
+cont2:
+ %v2 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo2, i64 0, i32 1
+ %t2 = load i8, i8* %v2
+ store i8 %t2, i8* %error_ref2
+ br label %handler2
+handler2:
+ call void @free(i8* %bitcast2)
+
+ ret float 1.0
+}
diff --git a/test/CodeGen/SystemZ/swiftself.ll b/test/CodeGen/SystemZ/swiftself.ll
new file mode 100644
index 000000000000..ee6104ad2039
--- /dev/null
+++ b/test/CodeGen/SystemZ/swiftself.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Parameter with swiftself should be allocated to r10.
+; CHECK-LABEL: swiftself_param:
+; CHECK: lgr %r2, %r10
+define i8 *@swiftself_param(i8* swiftself %addr0) {
+ ret i8 *%addr0
+}
+
+; Check that r10 is used to pass a swiftself argument.
+; CHECK-LABEL: call_swiftself:
+; CHECK: lgr %r10, %r2
+; CHECK: brasl %r14, swiftself_param
+define i8 *@call_swiftself(i8* %arg) {
+ %res = call i8 *@swiftself_param(i8* swiftself %arg)
+ ret i8 *%res
+}
+
+; r10 should be saved by the callee even if used for swiftself.
+; CHECK-LABEL: swiftself_clobber:
+; CHECK: stmg %r10,
+; ...
+; CHECK: lmg %r10,
+; CHECK: br %r14
+define i8 *@swiftself_clobber(i8* swiftself %addr0) {
+ call void asm sideeffect "", "~{r10}"()
+ ret i8 *%addr0
+}
+
+; Demonstrate that we do not need any loads when calling multiple functions
+; with a swiftself argument.
+; CHECK-LABEL: swiftself_passthrough:
+; CHECK-NOT: lg{{.*}}r10,
+; CHECK: brasl %r14, swiftself_param
+; CHECK-NOT: lg{{.*}}r10,
+; CHECK-NEXT: brasl %r14, swiftself_param
+define void @swiftself_passthrough(i8* swiftself %addr0) {
+ call i8 *@swiftself_param(i8* swiftself %addr0)
+ call i8 *@swiftself_param(i8* swiftself %addr0)
+ ret void
+}
+
+; Normally, we can use a tail call if the callee swiftself is the same as the
+; caller one. Not yet supported on SystemZ.
+; CHECK-LABEL: swiftself_tail:
+; CHECK: lgr %r[[REG1:[0-9]+]], %r10
+; CHECK: lgr %r10, %r[[REG1]]
+; CHECK: brasl %r14, swiftself_param
+; CHECK: br %r14
+define i8* @swiftself_tail(i8* swiftself %addr0) {
+ call void asm sideeffect "", "~{r10}"()
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr0)
+ ret i8* %res
+}
+
+; We cannot use a tail call if the callee swiftself is not the same as the
+; caller one.
+; CHECK-LABEL: swiftself_notail:
+; CHECK: lgr %r10, %r2
+; CHECK: brasl %r14, swiftself_param
+; CHECK: lmg %r10,
+; CHECK: br %r14
+define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind {
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr1)
+ ret i8* %res
+}
diff --git a/test/CodeGen/SystemZ/tdc-01.ll b/test/CodeGen/SystemZ/tdc-01.ll
new file mode 100644
index 000000000000..052d895b798f
--- /dev/null
+++ b/test/CodeGen/SystemZ/tdc-01.ll
@@ -0,0 +1,95 @@
+; Test the Test Data Class instruction, selected manually via the intrinsic.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i32 @llvm.s390.tdc.f32(float, i64)
+declare i32 @llvm.s390.tdc.f64(double, i64)
+declare i32 @llvm.s390.tdc.f128(fp128, i64)
+
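+; The second intrinsic operand is the 12-bit class mask. Reading from the
+; most significant mask bit, the classes are: +0, -0, +normal, -normal,
+; +subnormal, -subnormal, +inf, -inf, +QNaN, -QNaN, +SNaN, -SNaN; TDC sets
+; CC 1 when the value falls in a selected class and CC 0 otherwise.
+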
+; Check using as i32 - f32
+define i32 @f1(float %x) {
+; CHECK-LABEL: f1
+; CHECK: tceb %f0, 123
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+ %res = call i32 @llvm.s390.tdc.f32(float %x, i64 123)
+ ret i32 %res
+}
+
+; Check using as i32 - f64
+define i32 @f2(double %x) {
+; CHECK-LABEL: f2
+; CHECK: tcdb %f0, 123
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+ %res = call i32 @llvm.s390.tdc.f64(double %x, i64 123)
+ ret i32 %res
+}
+
+; Check using as i32 - f128
+define i32 @f3(fp128 %x) {
+; CHECK-LABEL: f3
+; CHECK: ld %f0, 0(%r2)
+; CHECK: ld %f2, 8(%r2)
+; CHECK: tcxb %f0, 123
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+ %res = call i32 @llvm.s390.tdc.f128(fp128 %x, i64 123)
+ ret i32 %res
+}
+
+declare void @g()
+
+; Check branch
+define void @f4(float %x) {
+; CHECK-LABEL: f4
+; CHECK: tceb %f0, 123
+; CHECK: jgl g
+; CHECK: br %r14
+ %res = call i32 @llvm.s390.tdc.f32(float %x, i64 123)
+ %cond = icmp ne i32 %res, 0
+ br i1 %cond, label %call, label %exit
+
+call:
+ tail call void @g()
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check branch negated
+define void @f5(float %x) {
+; CHECK-LABEL: f5
+; CHECK: tceb %f0, 123
+; CHECK: jge g
+; CHECK: br %r14
+ %res = call i32 @llvm.s390.tdc.f32(float %x, i64 123)
+ %cond = icmp eq i32 %res, 0
+ br i1 %cond, label %call, label %exit
+
+call:
+ tail call void @g()
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check non-const mask
+define void @f6(float %x, i64 %y) {
+; CHECK-LABEL: f6
+; CHECK: tceb %f0, 0(%r2)
+; CHECK: jge g
+; CHECK: br %r14
+ %res = call i32 @llvm.s390.tdc.f32(float %x, i64 %y)
+ %cond = icmp eq i32 %res, 0
+ br i1 %cond, label %call, label %exit
+
+call:
+ tail call void @g()
+ br label %exit
+
+exit:
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/tdc-02.ll b/test/CodeGen/SystemZ/tdc-02.ll
new file mode 100644
index 000000000000..c0c4ac84349e
--- /dev/null
+++ b/test/CodeGen/SystemZ/tdc-02.ll
@@ -0,0 +1,96 @@
+; Test the Test Data Class instruction logic operation folding.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i32 @llvm.s390.tdc.f32(float, i64)
+declare i32 @llvm.s390.tdc.f64(double, i64)
+declare i32 @llvm.s390.tdc.f128(fp128, i64)
+
+; Check using or i1
+define i32 @f1(float %x) {
+; CHECK-LABEL: f1
+; CHECK: tceb %f0, 7
+; CHECK-NEXT: ipm [[REG1:%r[0-9]+]]
+; CHECK-NEXT: risbg %r2, [[REG1]], 63, 191, 36
+ %a = call i32 @llvm.s390.tdc.f32(float %x, i64 3)
+ %b = call i32 @llvm.s390.tdc.f32(float %x, i64 6)
+ %a1 = icmp ne i32 %a, 0
+ %b1 = icmp ne i32 %b, 0
+ %res = or i1 %a1, %b1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Check using and i1
+define i32 @f2(double %x) {
+; CHECK-LABEL: f2
+; CHECK: tcdb %f0, 2
+; CHECK-NEXT: ipm [[REG1:%r[0-9]+]]
+; CHECK-NEXT: risbg %r2, [[REG1]], 63, 191, 36
+ %a = call i32 @llvm.s390.tdc.f64(double %x, i64 3)
+ %b = call i32 @llvm.s390.tdc.f64(double %x, i64 6)
+ %a1 = icmp ne i32 %a, 0
+ %b1 = icmp ne i32 %b, 0
+ %res = and i1 %a1, %b1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Check using xor i1
+define i32 @f3(fp128 %x) {
+; CHECK-LABEL: f3
+; CHECK: tcxb %f0, 5
+; CHECK-NEXT: ipm [[REG1:%r[0-9]+]]
+; CHECK-NEXT: risbg %r2, [[REG1]], 63, 191, 36
+ %a = call i32 @llvm.s390.tdc.f128(fp128 %x, i64 3)
+ %b = call i32 @llvm.s390.tdc.f128(fp128 %x, i64 6)
+ %a1 = icmp ne i32 %a, 0
+ %b1 = icmp ne i32 %b, 0
+ %res = xor i1 %a1, %b1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Check using xor i1 - negated test
+define i32 @f4(fp128 %x) {
+; CHECK-LABEL: f4
+; CHECK: tcxb %f0, 4090
+; CHECK-NEXT: ipm [[REG1:%r[0-9]+]]
+; CHECK-NEXT: risbg %r2, [[REG1]], 63, 191, 36
+ %a = call i32 @llvm.s390.tdc.f128(fp128 %x, i64 3)
+ %b = call i32 @llvm.s390.tdc.f128(fp128 %x, i64 6)
+ %a1 = icmp ne i32 %a, 0
+ %b1 = icmp eq i32 %b, 0
+ %res = xor i1 %a1, %b1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Check different first args
+define i32 @f5(float %x, float %y) {
+; CHECK-LABEL: f5
+; CHECK-NOT: tceb {{%f[0-9]+}}, 5
+; CHECK-DAG: tceb %f0, 3
+; CHECK-DAG: tceb %f2, 6
+ %a = call i32 @llvm.s390.tdc.f32(float %x, i64 3)
+ %b = call i32 @llvm.s390.tdc.f32(float %y, i64 6)
+ %a1 = icmp ne i32 %a, 0
+ %b1 = icmp ne i32 %b, 0
+ %res = xor i1 %a1, %b1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Non-const mask (not supported)
+define i32 @f6(float %x, i64 %y) {
+; CHECK-LABEL: f6
+; CHECK-DAG: tceb %f0, 0(%r2)
+; CHECK-DAG: tceb %f0, 6
+ %a = call i32 @llvm.s390.tdc.f32(float %x, i64 %y)
+ %b = call i32 @llvm.s390.tdc.f32(float %x, i64 6)
+ %a1 = icmp ne i32 %a, 0
+ %b1 = icmp ne i32 %b, 0
+ %res = xor i1 %a1, %b1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
diff --git a/test/CodeGen/SystemZ/tdc-03.ll b/test/CodeGen/SystemZ/tdc-03.ll
new file mode 100644
index 000000000000..95708f1effc6
--- /dev/null
+++ b/test/CodeGen/SystemZ/tdc-03.ll
@@ -0,0 +1,139 @@
+; Test the Test Data Class instruction logic operation conversion from
+; compares.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare fp128 @llvm.fabs.f128(fp128)
+
+; Compare with 0 (unworthy)
+define i32 @f1(float %x) {
+; CHECK-LABEL: f1
+; CHECK-NOT: tceb
+; CHECK: ltebr {{%f[0-9]+}}, %f0
+; CHECK-NOT: tceb
+ %res = fcmp ugt float %x, 0.0
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs with 0 (unworthy)
+define i32 @f2(float %x) {
+; CHECK-LABEL: f2
+; CHECK-NOT: tceb
+; CHECK: lpebr {{%f[0-9]+}}, %f0
+; CHECK-NOT: tceb
+ %y = call float @llvm.fabs.f32(float %x)
+ %res = fcmp ugt float %y, 0.0
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare with inf (unworthy)
+define i32 @f3(float %x) {
+; CHECK-LABEL: f3
+; CHECK-NOT: tceb
+; CHECK: ceb %f0, 0(%r{{[0-9]+}})
+; CHECK-NOT: tceb
+ %res = fcmp ult float %x, 0x7ff0000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs with inf
+define i32 @f4(float %x) {
+; CHECK-LABEL: f4
+; CHECK: tceb %f0, 4047
+ %y = call float @llvm.fabs.f32(float %x)
+ %res = fcmp ult float %y, 0x7ff0000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare with minnorm (unworthy)
+define i32 @f5(float %x) {
+; CHECK-LABEL: f5
+; CHECK-NOT: tceb
+; CHECK: ceb %f0, 0(%r{{[0-9]+}})
+; CHECK-NOT: tceb
+ %res = fcmp ult float %x, 0x3810000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs with minnorm
+define i32 @f6(float %x) {
+; CHECK-LABEL: f6
+; CHECK: tceb %f0, 3279
+ %y = call float @llvm.fabs.f32(float %x)
+ %res = fcmp ult float %y, 0x3810000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs with minnorm, unsupported condition
+define i32 @f7(float %x) {
+; CHECK-LABEL: f7
+; CHECK-NOT: tceb
+; CHECK: lpdfr [[REG:%f[0-9]+]], %f0
+; CHECK: ceb [[REG]], 0(%r{{[0-9]+}})
+; CHECK-NOT: tceb
+ %y = call float @llvm.fabs.f32(float %x)
+ %res = fcmp ugt float %y, 0x3810000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs with unsupported constant
+define i32 @f8(float %x) {
+; CHECK-LABEL: f8
+; CHECK-NOT: tceb
+; CHECK: lpdfr [[REG:%f[0-9]+]], %f0
+; CHECK: ceb [[REG]], 0(%r{{[0-9]+}})
+; CHECK-NOT: tceb
+ %y = call float @llvm.fabs.f32(float %x)
+ %res = fcmp ult float %y, 0x3ff0000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs with minnorm - double
+define i32 @f9(double %x) {
+; CHECK-LABEL: f9
+; CHECK: tcdb %f0, 3279
+ %y = call double @llvm.fabs.f64(double %x)
+ %res = fcmp ult double %y, 0x0010000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs with minnorm - long double
+define i32 @f10(fp128 %x) {
+; CHECK-LABEL: f10
+; CHECK: tcxb %f0, 3279
+ %y = call fp128 @llvm.fabs.f128(fp128 %x)
+ %res = fcmp ult fp128 %y, 0xL00000000000000000001000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs for one with inf - clang's isfinite
+define i32 @f11(double %x) {
+; CHECK-LABEL: f11
+; CHECK: tcdb %f0, 4032
+ %y = call double @llvm.fabs.f64(double %x)
+ %res = fcmp one double %y, 0x7ff0000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare fabs for oeq with inf - clang's isinf
+define i32 @f12(double %x) {
+; CHECK-LABEL: f12
+; CHECK: tcdb %f0, 48
+ %y = call double @llvm.fabs.f64(double %x)
+ %res = fcmp oeq double %y, 0x7ff0000000000000
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
diff --git a/test/CodeGen/SystemZ/tdc-04.ll b/test/CodeGen/SystemZ/tdc-04.ll
new file mode 100644
index 000000000000..929285b0ba8f
--- /dev/null
+++ b/test/CodeGen/SystemZ/tdc-04.ll
@@ -0,0 +1,85 @@
+; Test the Test Data Class instruction logic operation conversion from
+; signbit extraction.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+;
+
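+; A value is negative exactly when it falls in one of the "minus" classes, so
+; extracting the sign bit becomes TDC with mask 1365 (all minus classes) and
+; the negated test uses 2730 (all plus classes).
+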
+; Extract sign bit.
+define i32 @f1(float %x) {
+; CHECK-LABEL: f1
+; CHECK: tceb %f0, 1365
+ %cast = bitcast float %x to i32
+ %res = icmp slt i32 %cast, 0
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Extract negated sign bit.
+define i32 @f2(float %x) {
+; CHECK-LABEL: f2
+; CHECK: tceb %f0, 2730
+ %cast = bitcast float %x to i32
+ %res = icmp sgt i32 %cast, -1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Extract sign bit.
+define i32 @f3(double %x) {
+; CHECK-LABEL: f3
+; CHECK: tcdb %f0, 1365
+ %cast = bitcast double %x to i64
+ %res = icmp slt i64 %cast, 0
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Extract negated sign bit.
+define i32 @f4(double %x) {
+; CHECK-LABEL: f4
+; CHECK: tcdb %f0, 2730
+ %cast = bitcast double %x to i64
+ %res = icmp sgt i64 %cast, -1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Extract sign bit.
+define i32 @f5(fp128 %x) {
+; CHECK-LABEL: f5
+; CHECK: tcxb %f0, 1365
+ %cast = bitcast fp128 %x to i128
+ %res = icmp slt i128 %cast, 0
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Extract negated sign bit.
+define i32 @f6(fp128 %x) {
+; CHECK-LABEL: f6
+; CHECK: tcxb %f0, 2730
+ %cast = bitcast fp128 %x to i128
+ %res = icmp sgt i128 %cast, -1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Wrong const.
+define i32 @f7(float %x) {
+; CHECK-LABEL: f7
+; CHECK-NOT: tceb
+ %cast = bitcast float %x to i32
+ %res = icmp slt i32 %cast, -1
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Wrong pred.
+define i32 @f8(float %x) {
+; CHECK-LABEL: f8
+; CHECK-NOT: tceb
+ %cast = bitcast float %x to i32
+ %res = icmp eq i32 %cast, 0
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
diff --git a/test/CodeGen/SystemZ/tdc-05.ll b/test/CodeGen/SystemZ/tdc-05.ll
new file mode 100644
index 000000000000..c639a9b7b475
--- /dev/null
+++ b/test/CodeGen/SystemZ/tdc-05.ll
@@ -0,0 +1,97 @@
+; Test the Test Data Class instruction logic operation conversion from
+; compares, combined with signbit or other compares to ensure worthiness.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+;
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare fp128 @llvm.fabs.f128(fp128)
+
+; Compare with 0, extract sign bit
+define i32 @f1(float %x) {
+; CHECK-LABEL: f1
+; CHECK: tceb %f0, 2047
+ %cast = bitcast float %x to i32
+ %sign = icmp slt i32 %cast, 0
+ %fcmp = fcmp ugt float %x, 0.0
+ %res = or i1 %sign, %fcmp
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare with inf, extract negated sign bit
+define i32 @f2(float %x) {
+; CHECK-LABEL: f2
+; CHECK: tceb %f0, 2698
+ %cast = bitcast float %x to i32
+ %sign = icmp sgt i32 %cast, -1
+ %fcmp = fcmp ult float %x, 0x7ff0000000000000
+ %res = and i1 %sign, %fcmp
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Compare with minnorm, extract negated sign bit
+define i32 @f3(float %x) {
+; CHECK-LABEL: f3
+; CHECK: tceb %f0, 2176
+ %cast = bitcast float %x to i32
+ %sign = icmp sgt i32 %cast, -1
+ %fcmp = fcmp olt float %x, 0x3810000000000000
+ %res = and i1 %sign, %fcmp
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Test float isnormal, from clang.
+define i32 @f4(float %x) {
+; CHECK-LABEL: f4
+; CHECK: tceb %f0, 768
+ %y = call float @llvm.fabs.f32(float %x)
+ %ord = fcmp ord float %x, 0.0
+ %a = fcmp ult float %y, 0x7ff0000000000000
+ %b = fcmp uge float %y, 0x3810000000000000
+ %c = and i1 %a, %b
+ %res = and i1 %ord, %c
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Check for negative 0.
+define i32 @f5(float %x) {
+; CHECK-LABEL: f5
+; CHECK: tceb %f0, 1024
+ %cast = bitcast float %x to i32
+ %sign = icmp slt i32 %cast, 0
+ %fcmp = fcmp oeq float %x, 0.0
+ %res = and i1 %sign, %fcmp
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Test isnormal, from clang.
+define i32 @f6(double %x) {
+; CHECK-LABEL: f6
+; CHECK: tcdb %f0, 768
+ %y = call double @llvm.fabs.f64(double %x)
+ %ord = fcmp ord double %x, 0.0
+ %a = fcmp ult double %y, 0x7ff0000000000000
+ %b = fcmp uge double %y, 0x0010000000000000
+ %c = and i1 %ord, %a
+ %res = and i1 %b, %c
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
+
+; Test isinf || isnan, from clang.
+define i32 @f7(double %x) {
+; CHECK-LABEL: f7
+; CHECK: tcdb %f0, 63
+ %y = call double @llvm.fabs.f64(double %x)
+ %a = fcmp oeq double %y, 0x7ff0000000000000
+ %b = fcmp uno double %x, 0.0
+ %res = or i1 %a, %b
+ %xres = zext i1 %res to i32
+ ret i32 %xres
+}
diff --git a/test/CodeGen/SystemZ/tdc-06.ll b/test/CodeGen/SystemZ/tdc-06.ll
new file mode 100644
index 000000000000..11fb1e2916e0
--- /dev/null
+++ b/test/CodeGen/SystemZ/tdc-06.ll
@@ -0,0 +1,48 @@
+; Test the Test Data Class instruction, as used by fpclassify.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+;
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare fp128 @llvm.fabs.f128(fp128)
+
+define i32 @fpc(double %x) {
+entry:
+; CHECK-LABEL: fpc
+; CHECK: lhi %r2, 5
+; CHECK: ltdbr %f0, %f0
+; CHECK: je [[RET:.L.*]]
+ %testeq = fcmp oeq double %x, 0.000000e+00
+ br i1 %testeq, label %ret, label %nonzero
+
+nonzero:
+; CHECK: lhi %r2, 1
+; CHECK: cdbr %f0, %f0
+; CHECK: jo [[RET]]
+ %testnan = fcmp uno double %x, 0.000000e+00
+ br i1 %testnan, label %ret, label %nonzeroord
+
+nonzeroord:
+; CHECK: lhi %r2, 2
+; CHECK: tcdb %f0, 48
+; CHECK: jl [[RET]]
+ %abs = tail call double @llvm.fabs.f64(double %x)
+ %testinf = fcmp oeq double %abs, 0x7FF0000000000000
+ br i1 %testinf, label %ret, label %finite
+
+finite:
+; CHECK: lhi %r2, 3
+; CHECK: tcdb %f0, 831
+; CHECK: blr %r14
+; CHECK: lhi %r2, 4
+ %testnormal = fcmp uge double %abs, 0x10000000000000
+ %finres = select i1 %testnormal, i32 3, i32 4
+ br label %ret
+
+ret:
+; CHECK: [[RET]]:
+; CHECK: br %r14
+ %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ]
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/trap-01.ll b/test/CodeGen/SystemZ/trap-01.ll
new file mode 100644
index 000000000000..3a766d9e8e3b
--- /dev/null
+++ b/test/CodeGen/SystemZ/trap-01.ll
@@ -0,0 +1,179 @@
+; Test traps and conditional traps
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @llvm.trap()
+
+; Check unconditional traps
+define i32 @f0() {
+; CHECK-LABEL: f0:
+; CHECK-LABEL: .Ltmp0
+; CHECK: j .Ltmp0+2
+entry:
+ tail call void @llvm.trap()
+ ret i32 0
+}
+
+; Check conditional compare immediate and trap
+define i32 @f1(i32 signext %a) {
+; CHECK-LABEL: f1:
+; CHECK: cithe %r2, 15
+; CHECK: lhi %r2, 0
+; CHECK: br %r14
+entry:
+ %cmp = icmp sgt i32 %a, 14
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret i32 0
+}
+
+; Check conditional compare grande immediate and trap
+define i64 @f2(i64 signext %a) {
+; CHECK-LABEL: f2:
+; CHECK: cgitle %r2, 14
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+entry:
+ %cmp = icmp slt i64 %a, 15
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret i64 0
+}
+
+; Check conditional compare logical immediate and trap
+define i32 @f3(i32 zeroext %a) {
+; CHECK-LABEL: f3:
+; CHECK: clfithe %r2, 15
+; CHECK: lhi %r2, 0
+; CHECK: br %r14
+entry:
+ %cmp = icmp ugt i32 %a, 14
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret i32 0
+}
+
+; Check conditional compare grande logical immediate and trap
+define i64 @f4(i64 zeroext %a) {
+; CHECK-LABEL: f4:
+; CHECK: clgitle %r2, 14
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+entry:
+ %cmp = icmp ult i64 %a, 15
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret i64 0
+}
+
+; Check conditional compare and trap
+define i32 @f5(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: f5:
+; CHECK: crte %r2, %r3
+; CHECK: lhi %r2, 0
+; CHECK: br %r14
+entry:
+ %cmp = icmp eq i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret i32 0
+}
+
+; Check conditional compare grande and trap
+define i64 @f6(i64 signext %a, i64 signext %b) {
+; CHECK-LABEL: f6:
+; CHECK: cgrtl %r2, %r3
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+entry:
+ %cmp = icmp slt i64 %a, %b
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret i64 0
+}
+
+; Check conditional compare logical and trap
+define i32 @f7(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: f7:
+; CHECK: clrth %r2, %r3
+; CHECK: lhi %r2, 0
+; CHECK: br %r14
+entry:
+ %cmp = icmp ugt i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret i32 0
+}
+
+; Check conditional compare logical grande and trap
+define i64 @f8(i64 zeroext %a, i64 zeroext %b) {
+; CHECK-LABEL: f8:
+; CHECK: clgrtl %r2, %r3
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+entry:
+ %cmp = icmp ult i64 %a, %b
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret i64 0
+}
+
+; Check conditional traps that don't have a valid Compare and Trap
+define double @f9(double %a, double %b) {
+; CHECK-LABEL: f9:
+; CHECK: cdbr %f0, %f2
+; CHECK-LABEL: .Ltmp1
+; CHECK: je .Ltmp1+2
+; CHECK: lzdr %f0
+; CHECK: br %r14
+entry:
+ %cmp = fcmp oeq double %a, %b
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @llvm.trap()
+ unreachable
+
+if.end: ; preds = %entry
+ ret double 0.000000e+00
+}
diff --git a/test/CodeGen/SystemZ/vec-extract-02.ll b/test/CodeGen/SystemZ/vec-extract-02.ll
index c91e852fcf45..a87b7d52771b 100644
--- a/test/CodeGen/SystemZ/vec-extract-02.ll
+++ b/test/CodeGen/SystemZ/vec-extract-02.ll
@@ -6,7 +6,7 @@
; The index must be extended from i32 to i64.
define i32 @f1(<4 x i32> *%ptr, i32 %index) {
; CHECK-LABEL: f1:
-; CHECK: risbg {{%r[0-5]}}, %r3, 30, 189, 2
+; CHECK: risbgn {{%r[0-5]}}, %r3, 30, 189, 2
; CHECK: l %r2,
; CHECK: br %r14
%vec = load <4 x i32>, <4 x i32> *%ptr
diff --git a/test/CodeGen/SystemZ/vec-intrinsics.ll b/test/CodeGen/SystemZ/vec-intrinsics.ll
index 55527787da4c..6f5eb0691aa8 100644
--- a/test/CodeGen/SystemZ/vec-intrinsics.ll
+++ b/test/CodeGen/SystemZ/vec-intrinsics.ll
@@ -396,7 +396,7 @@ define <16 x i8> @test_vpkshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) {
define <16 x i8> @test_vpkshs_all_store(<8 x i16> %a, <8 x i16> %b, i32 *%ptr) {
; CHECK-LABEL: test_vpkshs_all_store:
; CHECK: vpkshs %v24, %v24, %v26
-; CHECK-NEXT: {{jno|jle}} {{\.L*}}
+; CHECK-NEXT: {{bnor|bler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b)
@@ -432,7 +432,7 @@ define <8 x i16> @test_vpksfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) {
define <8 x i16> @test_vpksfs_any_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) {
; CHECK-LABEL: test_vpksfs_any_store:
; CHECK: vpksfs %v24, %v24, %v26
-; CHECK-NEXT: {{jhe|je}} {{\.L*}}
+; CHECK-NEXT: {{bher|ber}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b)
@@ -469,7 +469,7 @@ define <4 x i32> @test_vpksgs_none_store(<2 x i64> %a, <2 x i64> %b,
i32 *%ptr) {
; CHECK-LABEL: test_vpksgs_none_store:
; CHECK: vpksgs %v24, %v24, %v26
-; CHECK-NEXT: {{jnhe|jne}} {{\.L*}}
+; CHECK-NEXT: {{bnher|bner}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b)
@@ -533,7 +533,7 @@ define <16 x i8> @test_vpklshs_all_store(<8 x i16> %a, <8 x i16> %b,
i32 *%ptr) {
; CHECK-LABEL: test_vpklshs_all_store:
; CHECK: vpklshs %v24, %v24, %v26
-; CHECK-NEXT: {{jno|jle}} {{\.L*}}
+; CHECK-NEXT: {{bnor|bler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b)
@@ -570,7 +570,7 @@ define <8 x i16> @test_vpklsfs_any_store(<4 x i32> %a, <4 x i32> %b,
i32 *%ptr) {
; CHECK-LABEL: test_vpklsfs_any_store:
; CHECK: vpklsfs %v24, %v24, %v26
-; CHECK-NEXT: {{jhe|je}} {{\.L*}}
+; CHECK-NEXT: {{bher|ber}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b)
@@ -607,7 +607,7 @@ define <4 x i32> @test_vpklsgs_none_store(<2 x i64> %a, <2 x i64> %b,
i32 *%ptr) {
; CHECK-LABEL: test_vpklsgs_none_store:
; CHECK: vpklsgs %v24, %v24, %v26
-; CHECK-NEXT: {{jnhe|jne}} {{\.L*}}
+; CHECK-NEXT: {{bnher|bner}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b)
@@ -1705,7 +1705,7 @@ define void @test_vtm_all_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) {
; CHECK-LABEL: test_vtm_all_store:
; CHECK-NOT: %r
; CHECK: vtm %v24, %v26
-; CHECK-NEXT: {{jno|jle}} {{\.L*}}
+; CHECK-NEXT: {{bnor|bler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b)
@@ -1752,7 +1752,7 @@ define <16 x i8> @test_vceqbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) {
; CHECK-LABEL: test_vceqbs_any_store:
; CHECK-NOT: %r
; CHECK: vceqbs %v24, %v24, %v26
-; CHECK-NEXT: {{jo|jnle}} {{\.L*}}
+; CHECK-NEXT: {{bor|bnler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b)
@@ -1801,7 +1801,7 @@ define <8 x i16> @test_vceqhs_notall_store(<8 x i16> %a, <8 x i16> %b,
; CHECK-LABEL: test_vceqhs_notall_store:
; CHECK-NOT: %r
; CHECK: vceqhs %v24, %v24, %v26
-; CHECK-NEXT: {{jhe|je}} {{\.L*}}
+; CHECK-NEXT: {{bher|ber}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b)
@@ -1850,7 +1850,7 @@ define <4 x i32> @test_vceqfs_none_store(<4 x i32> %a, <4 x i32> %b,
; CHECK-LABEL: test_vceqfs_none_store:
; CHECK-NOT: %r
; CHECK: vceqfs %v24, %v24, %v26
-; CHECK-NEXT: {{jno|jle}} {{\.L*}}
+; CHECK-NEXT: {{bnor|bler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b)
@@ -1899,7 +1899,7 @@ define <2 x i64> @test_vceqgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) {
; CHECK-LABEL: test_vceqgs_all_store:
; CHECK-NOT: %r
; CHECK: vceqgs %v24, %v24, %v26
-; CHECK-NEXT: {{jnhe|jne}} {{\.L*}}
+; CHECK-NEXT: {{bnher|bner}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b)
@@ -1948,7 +1948,7 @@ define <16 x i8> @test_vchbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) {
; CHECK-LABEL: test_vchbs_any_store:
; CHECK-NOT: %r
; CHECK: vchbs %v24, %v24, %v26
-; CHECK-NEXT: {{jo|jnle}} {{\.L*}}
+; CHECK-NEXT: {{bor|bnler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b)
@@ -1997,7 +1997,7 @@ define <8 x i16> @test_vchhs_notall_store(<8 x i16> %a, <8 x i16> %b,
; CHECK-LABEL: test_vchhs_notall_store:
; CHECK-NOT: %r
; CHECK: vchhs %v24, %v24, %v26
-; CHECK-NEXT: {{jhe|je}} {{\.L*}}
+; CHECK-NEXT: {{bher|ber}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b)
@@ -2045,7 +2045,7 @@ define <4 x i32> @test_vchfs_none_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) {
; CHECK-LABEL: test_vchfs_none_store:
; CHECK-NOT: %r
; CHECK: vchfs %v24, %v24, %v26
-; CHECK-NEXT: {{jno|jle}} {{\.L*}}
+; CHECK-NEXT: {{bnor|bler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b)
@@ -2094,7 +2094,7 @@ define <2 x i64> @test_vchgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) {
; CHECK-LABEL: test_vchgs_all_store:
; CHECK-NOT: %r
; CHECK: vchgs %v24, %v24, %v26
-; CHECK-NEXT: {{jnhe|jne}} {{\.L*}}
+; CHECK-NEXT: {{bnher|bner}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b)
@@ -2143,7 +2143,7 @@ define <16 x i8> @test_vchlbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) {
; CHECK-LABEL: test_vchlbs_any_store:
; CHECK-NOT: %r
; CHECK: vchlbs %v24, %v24, %v26
-; CHECK-NEXT: {{jo|jnle}} {{\.L*}}
+; CHECK-NEXT: {{bor|bnler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b)
@@ -2192,7 +2192,7 @@ define <8 x i16> @test_vchlhs_notall_store(<8 x i16> %a, <8 x i16> %b,
; CHECK-LABEL: test_vchlhs_notall_store:
; CHECK-NOT: %r
; CHECK: vchlhs %v24, %v24, %v26
-; CHECK-NEXT: {{jhe|je}} {{\.L*}}
+; CHECK-NEXT: {{bher|ber}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b)
@@ -2241,7 +2241,7 @@ define <4 x i32> @test_vchlfs_none_store(<4 x i32> %a, <4 x i32> %b,
; CHECK-LABEL: test_vchlfs_none_store:
; CHECK-NOT: %r
; CHECK: vchlfs %v24, %v24, %v26
-; CHECK-NEXT: {{jno|jle}} {{\.L*}}
+; CHECK-NEXT: {{bnor|bler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b)
@@ -2290,7 +2290,7 @@ define <2 x i64> @test_vchlgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) {
; CHECK-LABEL: test_vchlgs_all_store:
; CHECK-NOT: %r
; CHECK: vchlgs %v24, %v24, %v26
-; CHECK-NEXT: {{jnhe|jne}} {{\.L*}}
+; CHECK-NEXT: {{bnher|bner}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b)
@@ -3166,7 +3166,7 @@ define <2 x i64> @test_vfcedbs_any_store(<2 x double> %a, <2 x double> %b,
; CHECK-LABEL: test_vfcedbs_any_store:
; CHECK-NOT: %r
; CHECK: vfcedbs %v24, %v24, %v26
-; CHECK-NEXT: {{jo|jnle}} {{\.L*}}
+; CHECK-NEXT: {{bor|bnler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a,
@@ -3218,7 +3218,7 @@ define <2 x i64> @test_vfchdbs_notall_store(<2 x double> %a, <2 x double> %b,
; CHECK-LABEL: test_vfchdbs_notall_store:
; CHECK-NOT: %r
; CHECK: vfchdbs %v24, %v24, %v26
-; CHECK-NEXT: {{jhe|je}} {{\.L*}}
+; CHECK-NEXT: {{bher|ber}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a,
@@ -3270,7 +3270,7 @@ define <2 x i64> @test_vfchedbs_none_store(<2 x double> %a, <2 x double> %b,
; CHECK-LABEL: test_vfchedbs_none_store:
; CHECK-NOT: %r
; CHECK: vfchedbs %v24, %v24, %v26
-; CHECK-NEXT: {{jno|jle}} {{\.L*}}
+; CHECK-NEXT: {{bnor|bler}} %r14
; CHECK: mvhi 0(%r2), 0
; CHECK: br %r14
%call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a,
diff --git a/test/CodeGen/SystemZ/vec-sub-01.ll b/test/CodeGen/SystemZ/vec-sub-01.ll
index 4afad8bef659..9829bd024332 100644
--- a/test/CodeGen/SystemZ/vec-sub-01.ll
+++ b/test/CodeGen/SystemZ/vec-sub-01.ll
@@ -52,7 +52,7 @@ define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) {
; CHECK-DAG: vrepf %v[[C2:[0-5]]], %v[[A2]], 2
; CHECK-DAG: vrepf %v[[D1:[0-5]]], %v[[A1]], 3
; CHECK-DAG: vrepf %v[[D2:[0-5]]], %v[[A2]], 3
-; CHECK-DAG: ler %f[[A1copy:[0-5]]], %f[[A1]]
+; CHECK-DAG: ldr %f[[A1copy:[0-5]]], %f[[A1]]
; CHECK-DAG: sebr %f[[A1copy]], %f[[A2]]
; CHECK-DAG: sebr %f[[B1]], %f[[B2]]
; CHECK-DAG: sebr %f[[C1]], %f[[C2]]
diff --git a/test/CodeGen/Thumb/2010-07-01-FuncAlign.ll b/test/CodeGen/Thumb/2010-07-01-FuncAlign.ll
index 8e09441feba4..c132a0a66222 100644
--- a/test/CodeGen/Thumb/2010-07-01-FuncAlign.ll
+++ b/test/CodeGen/Thumb/2010-07-01-FuncAlign.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s
; Radar 8143571: Function alignments were off by a power of two.
-; CHECK: .align 1
+; CHECK: .p2align 1
define void @test() {
ret void
}
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index 8ec4d5b9865b..2f8e36b66b87 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -8,10 +8,10 @@
@llvm.used = appending global [1 x i8*] [i8* bitcast (void (%0*, i32, i32)* @_Z19getClosestDiagonal3ii to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
define void @_Z19getClosestDiagonal3ii(%0* noalias sret, i32, i32) nounwind {
-; CHECK: blx ___muldf3
-; CHECK: blx ___muldf3
+; CHECK: bl ___muldf3
+; CHECK: bl ___muldf3
; CHECK: beq LBB0
-; CHECK: blx ___muldf3
+; CHECK: bl ___muldf3
; <label>:3
switch i32 %1, label %4 [
i32 0, label %5
@@ -51,9 +51,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!0 = !DILocation(line: 46, scope: !1)
!1 = distinct !DILexicalBlock(line: 44, column: 0, file: !101, scope: !2)
!2 = distinct !DILexicalBlock(line: 44, column: 0, file: !101, scope: !3)
-!3 = distinct !DISubprogram(name: "getClosestDiagonal3", linkageName: "_Z19getClosestDiagonal3ii", line: 44, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !101, scope: null, type: !6)
+!3 = distinct !DISubprogram(name: "getClosestDiagonal3", linkageName: "_Z19getClosestDiagonal3ii", line: 44, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !5, file: !101, scope: null, type: !6)
!4 = !DIFile(filename: "ggEdgeDiscrepancy.cc", directory: "/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src")
-!5 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", isOptimized: true, emissionKind: 0, file: !101, enums: !102, retainedTypes: !102, subprograms: !103)
+!5 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", isOptimized: true, emissionKind: FullDebug, file: !101, enums: !102, retainedTypes: !102)
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !22, !22}
!8 = !DICompositeType(tag: DW_TAG_structure_type, name: "ggVector3", line: 66, size: 192, align: 32, file: !99, elements: !10)
@@ -87,12 +87,12 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!34 = !DIDerivedType(tag: DW_TAG_const_type, size: 192, align: 32, file: !101, scope: !4, baseType: !8)
!35 = !DISubprogram(name: "y", linkageName: "_ZNK9ggVector31yEv", line: 83, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !31)
!36 = !DISubprogram(name: "z", linkageName: "_ZNK9ggVector31zEv", line: 84, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !31)
-!37 = distinct !DISubprogram(name: "x", linkageName: "_ZN9ggVector31xEv", line: 85, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !38)
+!37 = distinct !DISubprogram(name: "x", linkageName: "_ZN9ggVector31xEv", line: 85, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !5, file: !9, scope: !8, type: !38)
!38 = !DISubroutineType(types: !39)
!39 = !{!40, !19}
!40 = !DIDerivedType(tag: DW_TAG_reference_type, name: "double", size: 32, align: 32, file: !101, scope: !4, baseType: !13)
-!41 = distinct !DISubprogram(name: "y", linkageName: "_ZN9ggVector31yEv", line: 86, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !38)
-!42 = distinct !DISubprogram(name: "z", linkageName: "_ZN9ggVector31zEv", line: 87, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !38)
+!41 = distinct !DISubprogram(name: "y", linkageName: "_ZN9ggVector31yEv", line: 86, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !5, file: !9, scope: !8, type: !38)
+!42 = distinct !DISubprogram(name: "z", linkageName: "_ZN9ggVector31zEv", line: 87, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !5, file: !9, scope: !8, type: !38)
!43 = !DISubprogram(name: "SetX", linkageName: "_ZN9ggVector34SetXEd", line: 88, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !44)
!44 = !DISubroutineType(types: !45)
!45 = !{null, !19, !13}
@@ -127,7 +127,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!74 = !DISubprogram(name: "operator/=", linkageName: "_ZN9ggVector3dVEd", line: 324, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !72)
!75 = !DISubprogram(name: "length", linkageName: "_ZNK9ggVector36lengthEv", line: 121, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !31)
!76 = !DISubprogram(name: "squaredLength", linkageName: "_ZNK9ggVector313squaredLengthEv", line: 122, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !31)
-!77 = distinct !DISubprogram(name: "MakeUnitVector", linkageName: "_ZN9ggVector314MakeUnitVectorEv", line: 217, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !24)
+!77 = distinct !DISubprogram(name: "MakeUnitVector", linkageName: "_ZN9ggVector314MakeUnitVectorEv", line: 217, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !5, file: !9, scope: !8, type: !24)
!78 = !DISubprogram(name: "Perturb", linkageName: "_ZNK9ggVector37PerturbEdd", line: 126, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, file: !9, scope: !8, type: !79)
!79 = !DISubroutineType(types: !80)
!80 = !{!8, !33, !13, !13}
diff --git a/test/CodeGen/Thumb/and_neg.ll b/test/CodeGen/Thumb/and_neg.ll
new file mode 100644
index 000000000000..88217c7cb206
--- /dev/null
+++ b/test/CodeGen/Thumb/and_neg.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs
+; Just shouldn't crash, PR28348
+
+%C = type { i8* }
+
+define void @repro(%C* %this, i32 %a) {
+ %a_align1 = and i32 %a, -4096
+ %a_and = and i32 %a, 4095
+ %a_align2 = or i32 %a_and, 4096
+
+ call void @use(i32 %a_align1)
+
+ %C_field = getelementptr inbounds %C, %C* %this, i32 0, i32 0
+ %addptr = getelementptr inbounds i8, i8* null, i32 %a_align2
+ store i8* %addptr, i8** %C_field, align 4
+
+ ret void
+}
+
+declare void @use(i32)
diff --git a/test/CodeGen/Thumb/barrier.ll b/test/CodeGen/Thumb/barrier.ll
index 92d9bb2097ff..f6bc2ff822de 100644
--- a/test/CodeGen/Thumb/barrier.ll
+++ b/test/CodeGen/Thumb/barrier.ll
@@ -4,7 +4,7 @@
define void @t1() {
; V6-LABEL: t1:
-; V6: blx {{_*}}sync_synchronize
+; V6: bl {{_*}}sync_synchronize
; V6M-LABEL: t1:
; V6M: dmb sy
diff --git a/test/CodeGen/Thumb/bic_imm.ll b/test/CodeGen/Thumb/bic_imm.ll
new file mode 100644
index 000000000000..33d88b7b8ddd
--- /dev/null
+++ b/test/CodeGen/Thumb/bic_imm.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=cortex-m0 -verify-machineinstrs | FileCheck --check-prefix CHECK-T1 %s
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=cortex-m3 -verify-machineinstrs | FileCheck --check-prefix CHECK-T2 %s
+
+; CHECK-T1-LABEL: @i
+; CHECK-T2-LABEL: @i
+; CHECK-T1: movs r1, #255
+; CHECK-T1: adds r1, #20
+; CHECK-T1: bics r0, r1
+; CHECK-T2: movw r1, #275
+; CHECK-T2: bics r0, r1
+define i32 @i(i32 %a) {
+entry:
+ %and = and i32 %a, -276
+ ret i32 %and
+}
+
+; CHECK-T1-LABEL: @j
+; CHECK-T2-LABEL: @j
+; CHECK-T1: movs r1, #128
+; CHECK-T1: bics r0, r1
+; CHECK-T2: bic r0, r0, #128
+define i32 @j(i32 %a) {
+entry:
+ %and = and i32 %a, -129
+ ret i32 %and
+}
diff --git a/test/CodeGen/Thumb/constants.ll b/test/CodeGen/Thumb/constants.ll
new file mode 100644
index 000000000000..b1145d7b1d81
--- /dev/null
+++ b/test/CodeGen/Thumb/constants.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=cortex-m0 -verify-machineinstrs | FileCheck --check-prefix CHECK-T1 %s
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=cortex-m3 -verify-machineinstrs | FileCheck --check-prefix CHECK-T2 %s
+
+; CHECK-T1-LABEL: @mov_and_add
+; CHECK-T2-LABEL: @mov_and_add
+; CHECK-T1: movs r0, #255
+; CHECK-T1: adds r0, #12
+; CHECK-T2: movw r0, #267
+define i32 @mov_and_add() {
+ ret i32 267
+}
+
+; CHECK-T1-LABEL: @mov_and_add2
+; CHECK-T2-LABEL: @mov_and_add2
+; CHECK-T1: ldr r0,
+; CHECK-T2: movw r0, #511
+define i32 @mov_and_add2() {
+ ret i32 511
+}
diff --git a/test/CodeGen/Thumb/ldm-merge-struct.ll b/test/CodeGen/Thumb/ldm-merge-struct.ll
index 9815a9e505f1..707236803fb3 100644
--- a/test/CodeGen/Thumb/ldm-merge-struct.ll
+++ b/test/CodeGen/Thumb/ldm-merge-struct.ll
@@ -10,7 +10,7 @@ define i32 @f() {
entry:
; CHECK-LABEL: f:
; CHECK: ldm r[[BASE:[0-9]]],
-; CHECK-NEXT-NOT: subs r[[BASE]]
+; CHECK-NOT: subs r[[BASE]]
%0 = load i32, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 0), align 4
%1 = load i32, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1), align 4
%cmp = icmp sgt i32 %0, %1
diff --git a/test/CodeGen/Thumb/ldm-stm-postinc.ll b/test/CodeGen/Thumb/ldm-stm-postinc.ll
new file mode 100644
index 000000000000..f2e222bd5b9e
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-stm-postinc.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=thumbv7 -mcpu=cortex-m0 < %s -disable-lsr | FileCheck %s
+; FIXME: LSR mangles the last two testcases pretty badly. When this is fixed, remove
+; the -disable-lsr above.
+
+; CHECK-LABEL: @f
+; CHECK: ldm {{r[0-9]}}!, {r{{[0-9]}}}
+define i32 @f(i32* readonly %a, i32* readnone %b) {
+ %1 = icmp eq i32* %a, %b
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph, %0
+ %i.02 = phi i32 [ %3, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi i32* [ %4, %.lr.ph ], [ %a, %0 ]
+ %2 = load i32, i32* %.01, align 4
+ %3 = add nsw i32 %2, %i.02
+ %4 = getelementptr inbounds i32, i32* %.01, i32 1
+ %5 = icmp eq i32* %4, %b
+ br i1 %5, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %i.0.lcssa = phi i32 [ 0, %0 ], [ %3, %.lr.ph ]
+ ret i32 %i.0.lcssa
+}
+
+; CHECK-LABEL: @g
+; CHECK-NOT: ldm
+define i32 @g(i32* readonly %a, i32* readnone %b) {
+ %1 = icmp eq i32* %a, %b
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph, %0
+ %i.02 = phi i32 [ %3, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi i32* [ %4, %.lr.ph ], [ %a, %0 ]
+ %2 = load i32, i32* %.01, align 4
+ %3 = add nsw i32 %2, %i.02
+ %4 = getelementptr inbounds i32, i32* %.01, i32 2
+ %5 = icmp eq i32* %4, %b
+ br i1 %5, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %i.0.lcssa = phi i32 [ 0, %0 ], [ %3, %.lr.ph ]
+ ret i32 %i.0.lcssa
+}
+
+; CHECK-LABEL: @h
+; CHECK: stm {{r[0-9]}}!, {r{{[0-9]}}}
+define void @h(i32* %a, i32* readnone %b) {
+ %1 = icmp eq i32* %a, %b
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph, %0
+ %i.02 = phi i32 [ %2, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi i32* [ %3, %.lr.ph ], [ %a, %0 ]
+ %2 = add nsw i32 %i.02, 1
+ store i32 %i.02, i32* %.01, align 4
+ %3 = getelementptr inbounds i32, i32* %.01, i32 1
+ %4 = icmp eq i32* %3, %b
+ br i1 %4, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+; CHECK-LABEL: @j
+; CHECK-NOT: stm
+define void @j(i32* %a, i32* readnone %b) {
+ %1 = icmp eq i32* %a, %b
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph, %0
+ %i.02 = phi i32 [ %2, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi i32* [ %3, %.lr.ph ], [ %a, %0 ]
+ %2 = add nsw i32 %i.02, 1
+ store i32 %i.02, i32* %.01, align 4
+ %3 = getelementptr inbounds i32, i32* %.01, i32 2
+ %4 = icmp eq i32* %3, %b
+ br i1 %4, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
diff --git a/test/CodeGen/Thumb/segmented-stacks.ll b/test/CodeGen/Thumb/segmented-stacks.ll
index 251c29534727..7340842a42f4 100644
--- a/test/CodeGen/Thumb/segmented-stacks.ll
+++ b/test/CodeGen/Thumb/segmented-stacks.ll
@@ -32,7 +32,7 @@ define void @test_basic() #0 {
; Thumb-android: pop {r4, r5}
-; Thumb-android: .align 2
+; Thumb-android: .p2align 2
; Thumb-android: .LCPI0_0:
; Thumb-android-NEXT: .long __STACK_LIMIT
diff --git a/test/CodeGen/Thumb2/2009-09-01-PostRAProlog.ll b/test/CodeGen/Thumb2/2009-09-01-PostRAProlog.ll
index 84f69f4b6e0b..5b7107604e6f 100644
--- a/test/CodeGen/Thumb2/2009-09-01-PostRAProlog.ll
+++ b/test/CodeGen/Thumb2/2009-09-01-PostRAProlog.ll
@@ -102,5 +102,3 @@ bb7: ; preds = %bb5
}
declare noalias i8* @calloc(i32, i32) nounwind
-
-declare void @llvm.memset.i64(i8* nocapture, i8, i64, i32) nounwind
diff --git a/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll b/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
index e59e84d49ecf..b4248b81748d 100644
--- a/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
+++ b/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
@@ -8,7 +8,6 @@ entry:
; -- The loop following the load should only use a single add
; instruction.
; CHECK: vldr
-; CHECK: adds r{{[0-9]+.*}}#1
; CHECK-NOT: adds
; CHECK: subsections_via_symbols
diff --git a/test/CodeGen/Thumb2/2010-02-11-phi-cycle.ll b/test/CodeGen/Thumb2/2010-02-11-phi-cycle.ll
index c662620b19e2..fff83c546678 100644
--- a/test/CodeGen/Thumb2/2010-02-11-phi-cycle.ll
+++ b/test/CodeGen/Thumb2/2010-02-11-phi-cycle.ll
@@ -31,9 +31,9 @@ return: ; preds = %bb, %entry
define i32 @test_dead_cycle(i32 %n) nounwind {
; CHECK-LABEL: test_dead_cycle:
-; CHECK: blx
+; CHECK: bl
; CHECK-NOT: mov
-; CHECK: blx
+; CHECK: bl
entry:
%0 = icmp eq i32 %n, 1 ; <i1> [#uses=1]
br i1 %0, label %return, label %bb.nph
diff --git a/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll b/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll
index d3a44957a2eb..d02947fc3b1c 100644
--- a/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll
+++ b/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll
@@ -14,7 +14,7 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
%s5 = type { i32 }
; Make sure the cmp is not scheduled before the InlineAsm that clobbers cc.
-; CHECK: blx _f2
+; CHECK: bl _f2
; CHECK: cmp r0, #0
; CHECK-NOT: cmp
; CHECK: InlineAsm Start
diff --git a/test/CodeGen/Thumb2/aligned-constants.ll b/test/CodeGen/Thumb2/aligned-constants.ll
index 13cca113452c..df3b19dbb5cf 100644
--- a/test/CodeGen/Thumb2/aligned-constants.ll
+++ b/test/CodeGen/Thumb2/aligned-constants.ll
@@ -4,11 +4,11 @@ target triple = "thumbv7-apple-ios"
; The double in the constant pool is 8-byte aligned, forcing the function
; alignment.
-; CHECK: .align 3
+; CHECK: .p2align 3
; CHECK: func
;
; Constant pool with 8-byte entry before 4-byte entry:
-; CHECK: .align 3
+; CHECK: .p2align 3
; CHECK: LCPI
; CHECK: .long 2370821947
; CHECK: .long 1080815255
diff --git a/test/CodeGen/Thumb2/bicbfi.ll b/test/CodeGen/Thumb2/bicbfi.ll
new file mode 100644
index 000000000000..fcdb1225db5d
--- /dev/null
+++ b/test/CodeGen/Thumb2/bicbfi.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7--linux-gnueabihf"
+
+; CHECK-LABEL: f:
+; CHECK: bic
+define void @f(i32* nocapture %b, i32* nocapture %c, i32 %a) {
+ %1 = and i32 %a, -4096
+ store i32 %1, i32* %c, align 4
+ %2 = and i32 %a, 4095
+ %3 = or i32 %2, 4096
+ %4 = load i32, i32* %b, align 4
+ %5 = add nsw i32 %4, %3
+ store i32 %5, i32* %b, align 4
+ ret void
+}
\ No newline at end of file
diff --git a/test/CodeGen/Thumb2/carry.ll b/test/CodeGen/Thumb2/carry.ll
index 26622e23dd44..1e2b332be982 100644
--- a/test/CodeGen/Thumb2/carry.ll
+++ b/test/CodeGen/Thumb2/carry.ll
@@ -12,10 +12,10 @@ entry:
define i64 @f2(i64 %a, i64 %b) {
entry:
; CHECK-LABEL: f2:
-; CHECK: adds r0, r0, r0
-; CHECK: adcs r1, r1
-; CHECK: subs r0, r0, r2
-; CHECK: sbcs r1, r3
+; CHECK: lsls r1, r1, #1
+; CHECK: orr.w r1, r1, r0, lsr #31
+; CHECK: rsbs r0, r2, r0, lsl #1
+; CHECK: sbcs r1, r3
%tmp1 = shl i64 %a, 1
%tmp2 = sub i64 %tmp1, %b
ret i64 %tmp2
diff --git a/test/CodeGen/Thumb2/emit-unwinding.ll b/test/CodeGen/Thumb2/emit-unwinding.ll
index 1f1ea1b48af0..b77bb9e6b13c 100644
--- a/test/CodeGen/Thumb2/emit-unwinding.ll
+++ b/test/CodeGen/Thumb2/emit-unwinding.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple thumbv7em-apple-unknown-eabi-macho %s -o - -O0 | FileCheck %s
-; CHECK: add.w r11, sp, #{{[1-9]+}}
+; CHECK: add r7, sp, #{{[1-9]+}}
define void @foo1() {
call void asm sideeffect "", "~{r4}"()
diff --git a/test/CodeGen/Thumb2/ldr-str-imm12.ll b/test/CodeGen/Thumb2/ldr-str-imm12.ll
index d20eef0c8bb7..3e4bd02097ad 100644
--- a/test/CodeGen/Thumb2/ldr-str-imm12.ll
+++ b/test/CodeGen/Thumb2/ldr-str-imm12.ll
@@ -29,16 +29,16 @@ entry:
bb20: ; preds = %entry
switch i32 undef, label %bb1287 [
- i32 11, label %bb119
- i32 12, label %bb119
- i32 21, label %bb420
- i32 23, label %bb420
- i32 45, label %bb438
- i32 46, label %bb438
- i32 55, label %bb533
- i32 56, label %bb569
- i32 64, label %bb745
- i32 78, label %bb1098
+ i32 110, label %bb119
+ i32 120, label %bb119
+ i32 210, label %bb420
+ i32 230, label %bb420
+ i32 450, label %bb438
+ i32 460, label %bb438
+ i32 550, label %bb533
+ i32 560, label %bb569
+ i32 640, label %bb745
+ i32 780, label %bb1098
]
bb119: ; preds = %bb20, %bb20
diff --git a/test/CodeGen/Thumb2/thumb2-call.ll b/test/CodeGen/Thumb2/thumb2-call.ll
index 62b47a44b494..e06df642a93a 100644
--- a/test/CodeGen/Thumb2/thumb2-call.ll
+++ b/test/CodeGen/Thumb2/thumb2-call.ll
@@ -1,26 +1,20 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+thumb2 | FileCheck %s -check-prefix=DARWIN
-; RUN: llc < %s -mtriple=thumbv7-linux -mattr=+thumb2 | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-linux -mattr=+thumb2 | FileCheck %s
@t = weak global i32 ()* null ; <i32 ()**> [#uses=1]
declare void @g(i32, i32, i32, i32)
define void @f() {
-; DARWIN-LABEL: f:
-; DARWIN: blx _g
-
-; LINUX-LABEL: f:
-; LINUX: bl g
+; CHECK-LABEL: f:
+; CHECK: bl {{_?}}g
call void @g( i32 1, i32 2, i32 3, i32 4 )
ret void
}
define void @h() {
-; DARWIN-LABEL: h:
-; DARWIN: blx r0
-
-; LINUX-LABEL: h:
-; LINUX: blx r0
+; CHECK-LABEL: h:
+; CHECK: blx r0
%tmp = load i32 ()*, i32 ()** @t ; <i32 ()*> [#uses=1]
%tmp.upgrd.2 = call i32 %tmp( ) ; <i32> [#uses=0]
ret void
diff --git a/test/CodeGen/Thumb2/thumb2-cbnz.ll b/test/CodeGen/Thumb2/thumb2-cbnz.ll
index 8104dc714da0..c1a53825e3b1 100644
--- a/test/CodeGen/Thumb2/thumb2-cbnz.ll
+++ b/test/CodeGen/Thumb2/thumb2-cbnz.ll
@@ -5,6 +5,7 @@ declare double @foo(double) nounwind readnone
define void @t(i32 %c, double %b) {
entry:
+; CHECK: cmp r0, #0
%cmp1 = icmp ne i32 %c, 0
br i1 %cmp1, label %bb3, label %bb1
@@ -23,8 +24,7 @@ bb7: ; preds = %bb3
br i1 %cmp3, label %bb11, label %bb9
bb9: ; preds = %bb7
-; CHECK: cmp r0, #0
-; CHECK-NEXT: cbnz
+; CHECK: cbnz
%0 = tail call double @foo(double %b) nounwind readnone ; <double> [#uses=0]
br label %bb11
diff --git a/test/CodeGen/Thumb2/thumb2-cpsr-liveness.ll b/test/CodeGen/Thumb2/thumb2-cpsr-liveness.ll
new file mode 100644
index 000000000000..798785988938
--- /dev/null
+++ b/test/CodeGen/Thumb2/thumb2-cpsr-liveness.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -misched-postra=true
+
+define i32 @test_cpsr() {
+entry:
+ %a = alloca [10 x i32], align 4
+ %0 = bitcast [10 x i32]* %a to i8*
+ %arrayidx.gep = getelementptr [10 x i32], [10 x i32]* %a, i32 0, i32 0
+ br label %for.body
+
+for.cond.cleanup:
+ %c.1.reg2mem.0.lcssa = phi i32 [ %c.1.reg2mem.0, %for.inc ]
+ ret i32 %c.1.reg2mem.0.lcssa
+
+for.body:
+ %1 = phi i32 [ 0, %entry ], [ %.pre, %for.inc.for.body_crit_edge ]
+ %c.018.reg2mem.0 = phi i32 [ 0, %entry ], [ %c.1.reg2mem.0, %for.inc.for.body_crit_edge ]
+ %b.017.reg2mem.0 = phi double [ 0.000000e+00, %entry ], [ %b.1.reg2mem.0, %for.inc.for.body_crit_edge ]
+ %arrayidx.phi = phi i32* [ %arrayidx.gep, %entry ], [ %arrayidx.inc, %for.inc.for.body_crit_edge ]
+ %i.019 = phi i32 [ 0, %entry ], [ %inc, %for.inc.for.body_crit_edge ]
+ %cmp1 = icmp slt i32 %1, 10
+ %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+ br i1 %cmp1, label %for.inc, label %if.end
+
+if.end:
+ %conv = sitofp i32 %i.019 to double
+ %cmp2 = fcmp nsz ogt double %conv, %b.017.reg2mem.0
+ %selv = select i1 %cmp2, double %conv, double %b.017.reg2mem.0
+ %selv7 = select i1 %cmp2, i32 %i.019, i32 %c.018.reg2mem.0
+ br label %for.inc
+
+for.inc:
+ %b.1.reg2mem.0 = phi double [ %b.017.reg2mem.0, %for.body ], [ %selv, %if.end ]
+ %c.1.reg2mem.0 = phi i32 [ %c.018.reg2mem.0, %for.body ], [ %selv7, %if.end ]
+ %exitcond = icmp eq i32 %i.019, 9
+ br i1 %exitcond, label %for.cond.cleanup, label %for.inc.for.body_crit_edge
+
+for.inc.for.body_crit_edge:
+ %inc = add nuw nsw i32 %i.019, 1
+ %.pre = load i32, i32* %arrayidx.inc, align 4
+ br label %for.body
+}
diff --git a/test/CodeGen/Thumb2/thumb2-ldm.ll b/test/CodeGen/Thumb2/thumb2-ldm.ll
index 28903aca3267..e733d5c99262 100644
--- a/test/CodeGen/Thumb2/thumb2-ldm.ll
+++ b/test/CodeGen/Thumb2/thumb2-ldm.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s -check-prefix=ALL -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=ALL -check-prefix=CONSERVATIVE
@X = external global [0 x i32] ; <[0 x i32]*> [#uses=5]
define i32 @t1() {
-; CHECK-LABEL: t1:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t1:
+; ALL: push {r7, lr}
; CHECK: ldrd
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
%tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 0) ; <i32> [#uses=1]
%tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1) ; <i32> [#uses=1]
%tmp4 = call i32 @f1( i32 %tmp, i32 %tmp3 ) ; <i32> [#uses=1]
@@ -14,10 +17,12 @@ define i32 @t1() {
}
define i32 @t2() {
-; CHECK-LABEL: t2:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t2:
+; ALL: push {r7, lr}
; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
%tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2) ; <i32> [#uses=1]
%tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3) ; <i32> [#uses=1]
%tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 4) ; <i32> [#uses=1]
@@ -26,10 +31,12 @@ define i32 @t2() {
}
define i32 @t3() {
-; CHECK-LABEL: t3:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t3:
+; ALL: push {r7, lr}
; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
%tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1) ; <i32> [#uses=1]
%tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2) ; <i32> [#uses=1]
%tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3) ; <i32> [#uses=1]
@@ -37,6 +44,34 @@ define i32 @t3() {
ret i32 %tmp6
}
+@g = common global i32* null
+
+define void @t4(i32 %a0, i32 %a1, i32 %a2) {
+; ALL-LABEL: t4:
+; ALL: stm.w sp, {r0, r1, r2}
+; ALL: bl _ext
+; ALL: ldm.w sp, {r0, r1, r2}
+; ALL: bl _f2
+ %arr = alloca [4 x i32], align 4
+ %p0 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 0
+ %p1 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 1
+ %p2 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 2
+ store i32* %p0, i32** @g, align 8
+
+ store i32 %a0, i32* %p0, align 4
+ store i32 %a1, i32* %p1, align 4
+ store i32 %a2, i32* %p2, align 4
+ call void @ext()
+
+ %v0 = load i32, i32* %p0, align 4
+ %v1 = load i32, i32* %p1, align 4
+ %v2 = load i32, i32* %p2, align 4
+ call i32 @f2(i32 %v0, i32 %v1, i32 %v2)
+ ret void
+}
+
declare i32 @f1(i32, i32)
declare i32 @f2(i32, i32, i32)
+
+declare void @ext()
diff --git a/test/CodeGen/Thumb2/thumb2-tbb.ll b/test/CodeGen/Thumb2/thumb2-tbb.ll
index 758f792695fd..9e6285199135 100644
--- a/test/CodeGen/Thumb2/thumb2-tbb.ll
+++ b/test/CodeGen/Thumb2/thumb2-tbb.ll
@@ -7,7 +7,7 @@ entry:
; CHECK: tbb
; CHECK: .data_region jt8
; CHECK: .end_data_region
-; CHECK-NEXT: .align 1
+; CHECK-NEXT: .p2align 1
switch i32 %n.u, label %bb12 [i32 1, label %bb i32 2, label %bb6 i32 4, label %bb7 i32 5, label %bb8 i32 6, label %bb10 i32 7, label %bb1 i32 8, label %bb3 i32 9, label %bb4 i32 10, label %bb9 i32 11, label %bb2 i32 12, label %bb5 i32 13, label %bb11 ]
bb:
diff --git a/test/CodeGen/Thumb2/tls2.ll b/test/CodeGen/Thumb2/tls2.ll
index 8f05ceab19fc..98ae8e6d90d9 100644
--- a/test/CodeGen/Thumb2/tls2.ll
+++ b/test/CodeGen/Thumb2/tls2.ll
@@ -11,7 +11,7 @@ entry:
; CHECK-NOT-PIC: i(GOTTPOFF)
; CHECK-PIC-LABEL: f:
-; CHECK-PIC: bl __tls_get_addr(PLT)
+; CHECK-PIC: bl __tls_get_addr
%tmp1 = load i32, i32* @i ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -24,6 +24,6 @@ entry:
; CHECK-NOT-PIC: i(GOTTPOFF)
; CHECK-PIC-LABEL: g:
-; CHECK-PIC: bl __tls_get_addr(PLT)
+; CHECK-PIC: bl __tls_get_addr
ret i32* @i
}
diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll
index 6a7a7a0b0aa0..d8d60413cb0e 100644
--- a/test/CodeGen/Thumb2/v8_IT_5.ll
+++ b/test/CodeGen/Thumb2/v8_IT_5.ll
@@ -17,31 +17,13 @@
define i32 @t(i32 %type) optsize {
entry:
- br i1 undef, label %if.then, label %if.else
-
-if.then:
- unreachable
-
-if.else:
- br i1 undef, label %if.then15, label %if.else18
-
-if.then15:
- unreachable
-
-if.else18:
switch i32 %type, label %if.else173 [
- i32 3, label %if.then115
- i32 1, label %if.then102
+ i32 13, label %if.then115
+ i32 6, label %if.then102
]
if.then102:
- br i1 undef, label %cond.true10.i, label %t.exit
-
-cond.true10.i:
- br label %t.exit
-
-t.exit:
- unreachable
+ br label %if.then115
if.then115:
br i1 undef, label %if.else163, label %if.else145
@@ -62,4 +44,3 @@ if.else173:
}
declare hidden fastcc %struct.hc* @foo(%struct.hc* nocapture, i32) nounwind optsize
-
diff --git a/test/CodeGen/WebAssembly/address-offsets.ll b/test/CodeGen/WebAssembly/address-offsets.ll
new file mode 100644
index 000000000000..6403b3762992
--- /dev/null
+++ b/test/CodeGen/WebAssembly/address-offsets.ll
@@ -0,0 +1,672 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+
+; Test folding constant offsets and symbols into load and store addresses under
+; a variety of circumstances.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+@g = external global [0 x i32], align 4
+
+; CHECK-LABEL: load_test0:
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: i32.load $push1=, g+40($pop0){{$}}
+; CHECK-NEXT: return $pop1{{$}}
+define i32 @load_test0() {
+ %t = load i32, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @g, i32 0, i32 10), align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test0_noinbounds:
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: i32.load $push1=, g+40($pop0){{$}}
+; CHECK-NEXT: return $pop1{{$}}
+define i32 @load_test0_noinbounds() {
+ %t = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @g, i32 0, i32 10), align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test1:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, g+40($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test1(i32 %n) {
+ %add = add nsw i32 %n, 10
+ %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test2:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, g+40($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test2(i32 %n) {
+ %add = add nsw i32 10, %n
+ %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test3:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, g+40($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test3(i32 %n) {
+ %add.ptr = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %n
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 10
+ %t = load i32, i32* %add.ptr1, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test4:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, g+40($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test4(i32 %n) {
+ %add.ptr = getelementptr inbounds i32, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @g, i32 0, i32 10), i32 %n
+ %t = load i32, i32* %add.ptr, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test5:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, g+40($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test5(i32 %n) {
+ %add.ptr = getelementptr inbounds i32, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @g, i32 0, i32 10), i32 %n
+ %t = load i32, i32* %add.ptr, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test6:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, g+40($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test6(i32 %n) {
+ %add = add nsw i32 %n, 10
+ %add.ptr = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ %t = load i32, i32* %add.ptr, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test7:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, g+40($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test7(i32 %n) {
+ %add.ptr = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %n
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 10
+ %t = load i32, i32* %add.ptr1, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test8:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, g+40($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test8(i32 %n) {
+ %add = add nsw i32 10, %n
+ %add.ptr = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ %t = load i32, i32* %add.ptr, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test9:
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: i32.load $push1=, g-40($pop0){{$}}
+; CHECK-NEXT: return $pop1{{$}}
+define i32 @load_test9() {
+ %t = load i32, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @g, i32 0, i32 1073741814), align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test10:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.const $push2=, g-40{{$}}
+; CHECK-NEXT: i32.add $push3=, $pop1, $pop2{{$}}
+; CHECK-NEXT: i32.load $push4=, 0($pop3){{$}}
+; CHECK-NEXT: return $pop4{{$}}
+define i32 @load_test10(i32 %n) {
+ %add = add nsw i32 %n, -10
+ %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test11:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.load $push0=, 40($0){{$}}
+; CHECK-NEXT: return $pop0{{$}}
+define i32 @load_test11(i32* %p) {
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 10
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test11_noinbounds:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 40{{$}}
+; CHECK-NEXT: i32.add $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, 0($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test11_noinbounds(i32* %p) {
+ %arrayidx = getelementptr i32, i32* %p, i32 10
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test12:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
+; CHECK-NEXT: return $pop5{{$}}
+define i32 @load_test12(i32* %p, i32 %n) {
+ %add = add nsw i32 %n, 10
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 %add
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test13:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
+; CHECK-NEXT: return $pop5{{$}}
+define i32 @load_test13(i32* %p, i32 %n) {
+ %add = add nsw i32 10, %n
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 %add
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test14:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.load $push3=, 40($pop2){{$}}
+; CHECK-NEXT: return $pop3{{$}}
+define i32 @load_test14(i32* %p, i32 %n) {
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 %n
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 10
+ %t = load i32, i32* %add.ptr1, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test15:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
+; CHECK-NEXT: return $pop5{{$}}
+define i32 @load_test15(i32* %p, i32 %n) {
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 10
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %n
+ %t = load i32, i32* %add.ptr1, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test16:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
+; CHECK-NEXT: return $pop5{{$}}
+define i32 @load_test16(i32* %p, i32 %n) {
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 10
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %n
+ %t = load i32, i32* %add.ptr1, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test17:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
+; CHECK-NEXT: return $pop5{{$}}
+define i32 @load_test17(i32* %p, i32 %n) {
+ %add = add nsw i32 %n, 10
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 %add
+ %t = load i32, i32* %add.ptr, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test18:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.load $push3=, 40($pop2){{$}}
+; CHECK-NEXT: return $pop3{{$}}
+define i32 @load_test18(i32* %p, i32 %n) {
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 %n
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 10
+ %t = load i32, i32* %add.ptr1, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test19:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
+; CHECK-NEXT: return $pop5{{$}}
+define i32 @load_test19(i32* %p, i32 %n) {
+ %add = add nsw i32 10, %n
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 %add
+ %t = load i32, i32* %add.ptr, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test20:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, -40{{$}}
+; CHECK-NEXT: i32.add $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.load $push2=, 0($pop1){{$}}
+; CHECK-NEXT: return $pop2{{$}}
+define i32 @load_test20(i32* %p) {
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 -10
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: load_test21:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, -40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
+; CHECK-NEXT: return $pop5{{$}}
+define i32 @load_test21(i32* %p, i32 %n) {
+ %add = add nsw i32 %n, -10
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 %add
+ %t = load i32, i32* %arrayidx, align 4
+ ret i32 %t
+}
+
+; CHECK-LABEL: store_test0:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop0), $0{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test0(i32 %i) {
+ store i32 %i, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @g, i32 0, i32 10), align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test0_noinbounds:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop0), $0{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test0_noinbounds(i32 %i) {
+ store i32 %i, i32* getelementptr ([0 x i32], [0 x i32]* @g, i32 0, i32 10), align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test1:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test1(i32 %n, i32 %i) {
+ %add = add nsw i32 %n, 10
+ %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test2:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test2(i32 %n, i32 %i) {
+ %add = add nsw i32 10, %n
+ %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test3:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test3(i32 %n, i32 %i) {
+ %add.ptr = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %n
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 10
+ store i32 %i, i32* %add.ptr1, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test4:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test4(i32 %n, i32 %i) {
+ %add.ptr = getelementptr inbounds i32, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @g, i32 0, i32 10), i32 %n
+ store i32 %i, i32* %add.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test5:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test5(i32 %n, i32 %i) {
+ %add.ptr = getelementptr inbounds i32, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @g, i32 0, i32 10), i32 %n
+ store i32 %i, i32* %add.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test6:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test6(i32 %n, i32 %i) {
+ %add = add nsw i32 %n, 10
+ %add.ptr = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ store i32 %i, i32* %add.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test7:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test7(i32 %n, i32 %i) {
+ %add.ptr = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %n
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 10
+ store i32 %i, i32* %add.ptr1, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test8:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, g+40($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test8(i32 %n, i32 %i) {
+ %add = add nsw i32 10, %n
+ %add.ptr = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ store i32 %i, i32* %add.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test9:
+; CHECK-NEXT: param i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: i32.store $drop=, g-40($pop0), $0{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test9(i32 %i) {
+ store i32 %i, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @g, i32 0, i32 1073741814), align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test10:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.const $push2=, g-40{{$}}
+; CHECK-NEXT: i32.add $push3=, $pop1, $pop2{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop3), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test10(i32 %n, i32 %i) {
+ %add = add nsw i32 %n, -10
+ %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i32 0, i32 %add
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test11:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.store $drop=, 40($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test11(i32* %p, i32 %i) {
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 10
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test11_noinbounds:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 40{{$}}
+; CHECK-NEXT: i32.add $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test11_noinbounds(i32* %p, i32 %i) {
+ %arrayidx = getelementptr i32, i32* %p, i32 10
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test12:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop4), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test12(i32* %p, i32 %n, i32 %i) {
+ %add = add nsw i32 %n, 10
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 %add
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test13:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop4), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test13(i32* %p, i32 %n, i32 %i) {
+ %add = add nsw i32 10, %n
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 %add
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test14:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.store $drop=, 40($pop2), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test14(i32* %p, i32 %n, i32 %i) {
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 %n
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 10
+ store i32 %i, i32* %add.ptr1, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test15:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop4), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test15(i32* %p, i32 %n, i32 %i) {
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 10
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %n
+ store i32 %i, i32* %add.ptr1, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test16:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop4), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test16(i32* %p, i32 %n, i32 %i) {
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 10
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %n
+ store i32 %i, i32* %add.ptr1, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test17:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop4), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test17(i32* %p, i32 %n, i32 %i) {
+ %add = add nsw i32 %n, 10
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 %add
+ store i32 %i, i32* %add.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test18:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.store $drop=, 40($pop2), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test18(i32* %p, i32 %n, i32 %i) {
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 %n
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 10
+ store i32 %i, i32* %add.ptr1, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test19:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, 40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop4), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test19(i32* %p, i32 %n, i32 %i) {
+ %add = add nsw i32 10, %n
+ %add.ptr = getelementptr inbounds i32, i32* %p, i32 %add
+ store i32 %i, i32* %add.ptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test20:
+; CHECK-NEXT: param i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, -40{{$}}
+; CHECK-NEXT: i32.add $push1=, $0, $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop1), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test20(i32* %p, i32 %i) {
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 -10
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK-LABEL: store_test21:
+; CHECK-NEXT: param i32, i32, i32{{$}}
+; CHECK-NEXT: i32.const $push0=, 2{{$}}
+; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
+; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.const $push3=, -40{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($pop4), $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_test21(i32* %p, i32 %n, i32 %i) {
+ %add = add nsw i32 %n, -10
+ %arrayidx = getelementptr inbounds i32, i32* %p, i32 %add
+ store i32 %i, i32* %arrayidx, align 4
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/byval.ll b/test/CodeGen/WebAssembly/byval.ll
new file mode 100644
index 000000000000..ebbf50313a58
--- /dev/null
+++ b/test/CodeGen/WebAssembly/byval.ll
@@ -0,0 +1,131 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+%SmallStruct = type { i32 }
+%OddStruct = type { i32, i8, i32 }
+%AlignedStruct = type { double, double }
+%BigStruct = type { double, double, double, double, double, double, double, double, double, double, double, i8, i8, i8 }
+%EmptyStruct = type { }
+
+%BigArray = type { [33 x i8] }
+
+declare void @ext_func(%SmallStruct*)
+declare void @ext_func_empty(%EmptyStruct* byval)
+declare void @ext_byval_func(%SmallStruct* byval)
+declare void @ext_byval_func_align8(%SmallStruct* byval align 8)
+declare void @ext_byval_func_alignedstruct(%AlignedStruct* byval)
+declare void @ext_byval_func_bigarray(%BigArray* byval)
+declare void @ext_byval_func_empty(%EmptyStruct* byval)
+
+; CHECK-LABEL: byval_arg
+define void @byval_arg(%SmallStruct* %ptr) {
+ ; CHECK: .param i32
+ ; CHECK: i32.const $push[[L4:.+]]=, 0
+ ; Subtract 16 from SP (SP is 16-byte aligned)
+ ; CHECK: i32.const $push[[L1:.+]]=, 0
+ ; CHECK-NEXT: i32.load $push[[L2:.+]]=, __stack_pointer($pop[[L1]])
+ ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L10:.+]]=, $pop[[L2]], $pop[[L3]]
+ ; Ensure SP is stored back before the call
+ ; CHECK-NEXT: i32.store $push[[L12:.+]]=, __stack_pointer($pop[[L4]]), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: tee_local $push[[L11:.+]]=, $[[SP:.+]]=, $pop[[L12]]{{$}}
+ ; Copy the SmallStruct argument to the stack (SP+12, original SP-4)
+ ; CHECK-NEXT: i32.load $push[[L0:.+]]=, 0($0)
+ ; CHECK-NEXT: i32.store $drop=, 12($pop[[L11]]), $pop[[L0]]
+ ; Pass a pointer to the stack slot to the function
+ ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 12{{$}}
+ ; CHECK-NEXT: i32.add $push[[ARG:.+]]=, $[[SP]], $pop[[L5]]{{$}}
+ ; CHECK-NEXT: call ext_byval_func@FUNCTION, $pop[[ARG]]{{$}}
+ call void @ext_byval_func(%SmallStruct* byval %ptr)
+ ; Restore the stack
+ ; CHECK-NEXT: i32.const $push[[L7:.+]]=, 0
+ ; CHECK-NEXT: i32.const $push[[L6:.+]]=, 16
+ ; CHECK-NEXT: i32.add $push[[L8:.+]]=, $[[SP]], $pop[[L6]]
+ ; CHECK-NEXT: i32.store {{.*}}=, __stack_pointer($pop[[L7]]), $pop[[L8]]
+ ; CHECK-NEXT: return
+ ret void
+}
+
+; CHECK-LABEL: byval_arg_align8
+define void @byval_arg_align8(%SmallStruct* %ptr) {
+ ; CHECK: .param i32
+ ; Don't check the entire SP sequence, just enough to get the alignment.
+ ; CHECK: i32.const $push[[L1:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L10:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.store $push[[L12:.+]]=, __stack_pointer($pop{{.+}}), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: tee_local $push[[L11:.+]]=, $[[SP:.+]]=, $pop[[L12]]{{$}}
+ ; Copy the SmallStruct argument to the stack (SP+8, original SP-8)
+ ; CHECK-NEXT: i32.load $push[[L0:.+]]=, 0($0){{$}}
+ ; CHECK-NEXT: i32.store $drop=, 8($pop[[L11]]), $pop[[L0]]{{$}}
+ ; Pass a pointer to the stack slot to the function
+ ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 8{{$}}
+ ; CHECK-NEXT: i32.add $push[[ARG:.+]]=, $[[SP]], $pop[[L5]]{{$}}
+ ; CHECK-NEXT: call ext_byval_func_align8@FUNCTION, $pop[[ARG]]{{$}}
+ call void @ext_byval_func_align8(%SmallStruct* byval align 8 %ptr)
+ ret void
+}
+
+; CHECK-LABEL: byval_arg_double
+define void @byval_arg_double(%AlignedStruct* %ptr) {
+ ; CHECK: .param i32
+ ; Subtract 16 from SP (SP is 16-byte aligned)
+ ; CHECK: i32.const $push[[L1:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L12:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.store $push[[L15:.+]]=, {{.+}}, $pop[[L12]]
+ ; CHECK-NEXT: tee_local $push[[L14:.+]]=, $[[SP:.+]]=, $pop[[L15]]
+ ; Copy the AlignedStruct argument to the stack (SP+0, original SP-16)
+ ; Just check the last load/store pair of the memcpy
+ ; CHECK: i64.load $push[[L4:.+]]=, 0($0)
+ ; CHECK-NEXT: i64.store $drop=, 0($[[SP]]), $pop[[L4]]
+ ; Pass a pointer to the stack slot to the function
+ ; CHECK-NEXT: call ext_byval_func_alignedstruct@FUNCTION, $[[SP]]
+ tail call void @ext_byval_func_alignedstruct(%AlignedStruct* byval %ptr)
+ ret void
+}
+
+; CHECK-LABEL: byval_param
+define void @byval_param(%SmallStruct* byval align 32 %ptr) {
+ ; CHECK: .param i32
+ ; %ptr is just a pointer to a struct, so pass it directly through
+ ; CHECK: call ext_func@FUNCTION, $0
+ call void @ext_func(%SmallStruct* %ptr)
+ ret void
+}
+
+; CHECK-LABEL: byval_empty_caller
+define void @byval_empty_caller(%EmptyStruct* %ptr) {
+ ; CHECK: .param i32
+ ; CHECK: call ext_byval_func_empty@FUNCTION, $0
+ call void @ext_byval_func_empty(%EmptyStruct* byval %ptr)
+ ret void
+}
+
+; CHECK-LABEL: byval_empty_callee
+define void @byval_empty_callee(%EmptyStruct* byval %ptr) {
+ ; CHECK: .param i32
+ ; CHECK: call ext_func_empty@FUNCTION, $0
+ call void @ext_func_empty(%EmptyStruct* %ptr)
+ ret void
+}
+
+; Call memcpy for "big" byvals.
+; CHECK-LABEL: big_byval:
+; CHECK: i32.const $push[[L4:.+]]=, 0
+; CHECK: i32.const $push[[L1:.+]]=, 0
+; CHECK-NEXT: i32.load $push[[L2:.+]]=, __stack_pointer($pop[[L1]])
+; CHECK-NEXT: i32.const $push[[L3:.+]]=, 131072
+; CHECK-NEXT: i32.sub $push[[L8:.+]]=, $pop[[L2]], $pop[[L3]]
+; CHECK-NEXT: i32.store $push[[L12:.+]]=, __stack_pointer($pop[[L4]]), $pop[[L8]]{{$}}
+; CHECK-NEXT: i32.const $push[[L0:.+]]=, 131072
+; CHECK-NEXT: i32.call $push[[L11:.+]]=, memcpy@FUNCTION, $pop{{.+}}, ${{.+}}, $pop{{.+}}
+; CHECK-NEXT: tee_local $push[[L9:.+]]=, $[[SP:.+]]=, $pop[[L11]]{{$}}
+; CHECK-NEXT: call big_byval_callee@FUNCTION,
+%big = type [131072 x i8]
+declare void @big_byval_callee(%big* byval align 1)
+define void @big_byval(%big* byval align 1 %x) {
+ call void @big_byval_callee(%big* byval align 1 %x)
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/call.ll b/test/CodeGen/WebAssembly/call.ll
index 6d5542c89d3d..bd5f7b80edba 100644
--- a/test/CodeGen/WebAssembly/call.ll
+++ b/test/CodeGen/WebAssembly/call.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -fast-isel -fast-isel-abort=1 | FileCheck %s
; Test that basic call operations assemble as expected.
@@ -120,7 +121,7 @@ define void @coldcc_tail_call_void_nullary() {
ret void
}
-; FIXME test the following:
+; TODO: test the following:
; - More argument combinations.
; - Tail call.
; - Interesting returns (struct, multiple).
diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll
index f0e5f4471678..d46898b44703 100644
--- a/test/CodeGen/WebAssembly/cfg-stackify.ll
+++ b/test/CodeGen/WebAssembly/cfg-stackify.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -asm-verbose=false -disable-block-placement -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs | FileCheck -check-prefix=OPT %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s
; Test the CFG stackifier pass.
+; Explicitly disable fast-isel, since it gets implicitly enabled in the
+; optnone test.
+
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
@@ -12,19 +15,23 @@ declare void @something()
; CHECK-LABEL: test0:
; CHECK: loop
-; CHECK-NOT: br
-; CHECK: i32.add
-; CHECK-NEXT: i32.ge_s
+; CHECK-NEXT: block
+; CHECK-NEXT: i32.const
+; CHECK-NEXT: i32.add
+; CHECK: i32.lt_s
; CHECK-NEXT: br_if
-; CHECK-NOT: br
-; CHECK: call
-; CHECK: br 0{{$}}
-; CHECK: return{{$}}
+; CHECK-NEXT: return
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: end_block
+; CHECK-NEXT: call
+; CHECK-NEXT: br
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: end_loop
; OPT-LABEL: test0:
; OPT: loop
-; OPT-NOT: br
-; OPT: i32.add
-; OPT-NEXT: i32.ge_s
+; OPT-NEXT: i32.const
+; OPT-NEXT: i32.add
+; OPT: i32.ge_s
; OPT-NEXT: br_if
; OPT-NOT: br
; OPT: call
@@ -53,19 +60,23 @@ back:
; CHECK-LABEL: test1:
; CHECK: loop
-; CHECK-NOT: br
-; CHECK: i32.add
-; CHECK-NEXT: i32.ge_s
+; CHECK-NEXT: block
+; CHECK-NEXT: i32.const
+; CHECK-NEXT: i32.add
+; CHECK: i32.lt_s
; CHECK-NEXT: br_if
-; CHECK-NOT: br
-; CHECK: call
-; CHECK: br 0{{$}}
-; CHECK: return{{$}}
+; CHECK-NEXT: return
+; CHECK-NEXT: .LBB1_3:
+; CHECK-NEXT: end_block
+; CHECK-NEXT: call
+; CHECK-NEXT: br
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: end_loop
; OPT-LABEL: test1:
; OPT: loop
-; OPT-NOT: br
-; OPT: i32.add
-; OPT-NEXT: i32.ge_s
+; OPT-NEXT: i32.const
+; OPT-NEXT: i32.add
+; OPT: i32.ge_s
; OPT-NEXT: br_if
; OPT-NOT: br
; OPT: call
@@ -95,18 +106,24 @@ back:
; CHECK-LABEL: test2:
; CHECK-NOT: local
; CHECK: block{{$}}
-; CHECK: br_if {{[^,]*}}, 0{{$}}
-; CHECK: .LBB2_1:
-; CHECK: br_if ${{[0-9]+}}, 0{{$}}
-; CHECK: .LBB2_2:
+; CHECK: br_if 0, {{[^,]+}}{{$}}
+; CHECK: .LBB2_{{[0-9]+}}:
+; CHECK: loop
+; CHECK: br_if 0, $pop{{[0-9]+}}{{$}}
+; CHECK: .LBB2_{{[0-9]+}}:
+; CHECK: end_loop
+; CHECK: end_block
; CHECK: return{{$}}
; OPT-LABEL: test2:
; OPT-NOT: local
; OPT: block{{$}}
-; OPT: br_if {{[^,]*}}, 0{{$}}
-; OPT: .LBB2_1:
-; OPT: br_if ${{[0-9]+}}, 0{{$}}
-; OPT: .LBB2_2:
+; OPT: br_if 0, {{[^,]+}}{{$}}
+; OPT: .LBB2_{{[0-9]+}}:
+; OPT: loop
+; OPT: br_if 0, $pop{{[0-9]+}}{{$}}
+; OPT: .LBB2_{{[0-9]+}}:
+; OPT: end_loop
+; OPT: end_block
; OPT: return{{$}}
define void @test2(double* nocapture %p, i32 %n) {
entry:
@@ -136,28 +153,33 @@ for.end:
; CHECK-LABEL: doublediamond:
; CHECK: block{{$}}
; CHECK-NEXT: block{{$}}
-; CHECK: br_if ${{[^,]*}}, 0{{$}}
+; CHECK: br_if 0, ${{[^,]+}}{{$}}
; CHECK: br 1{{$}}
; CHECK: .LBB3_2:
; CHECK-NEXT: end_block{{$}}
; CHECK: block{{$}}
-; CHECK: br_if ${{[^,]*}}, 0{{$}}
+; CHECK: br_if 0, ${{[^,]+}}{{$}}
; CHECK: br 1{{$}}
; CHECK: .LBB3_4:
; CHECK-NEXT: end_block{{$}}
; CHECK: .LBB3_5:
; CHECK-NEXT: end_block{{$}}
-; CHECK: return ${{[0-9]+}}{{$}}
+; CHECK: i32.const $push{{[0-9]+}}=, 0{{$}}
+; CHECK-NEXT: return $pop{{[0-9]+}}{{$}}
; OPT-LABEL: doublediamond:
-; OPT: block{{$}}
+; OPT: block{{$}}
; OPT-NEXT: block{{$}}
-; OPT: br_if ${{[^,]*}}, 0{{$}}
-; OPT: block{{$}}
-; OPT: br_if ${{[^,]*}}, 0{{$}}
-; OPT: br 1{{$}}
-; OPT: .LBB3_4:
-; OPT: .LBB3_5:
-; OPT: return ${{[0-9]+}}{{$}}
+; OPT-NEXT: block{{$}}
+; OPT: br_if 0, ${{[^,]+}}{{$}}
+; OPT: br_if 1, ${{[^,]+}}{{$}}
+; OPT: br 2{{$}}
+; OPT-NEXT: .LBB3_3:
+; OPT-NEXT: end_block
+; OPT: br 1{{$}}
+; OPT-NEXT: .LBB3_4:
+; OPT: .LBB3_5:
+; OPT-NEXT: end_block
+; OPT: return $pop{{[0-9]+}}{{$}}
define i32 @doublediamond(i32 %a, i32 %b, i32* %p) {
entry:
%c = icmp eq i32 %a, 0
@@ -183,12 +205,12 @@ exit:
; CHECK-LABEL: triangle:
; CHECK: block{{$}}
-; CHECK: br_if $1, 0{{$}}
+; CHECK: br_if 0, $1{{$}}
; CHECK: .LBB4_2:
; CHECK: return ${{[0-9]+}}{{$}}
; OPT-LABEL: triangle:
; OPT: block{{$}}
-; OPT: br_if $1, 0{{$}}
+; OPT: br_if 0, $1{{$}}
; OPT: .LBB4_2:
; OPT: return ${{[0-9]+}}{{$}}
define i32 @triangle(i32* %p, i32 %a) {
@@ -207,19 +229,21 @@ exit:
; CHECK-LABEL: diamond:
; CHECK: block{{$}}
; CHECK: block{{$}}
-; CHECK: br_if $1, 0{{$}}
+; CHECK: br_if 0, $1{{$}}
; CHECK: br 1{{$}}
; CHECK: .LBB5_2:
; CHECK: .LBB5_3:
-; CHECK: return ${{[0-9]+}}{{$}}
+; CHECK: i32.const $push{{[0-9]+}}=, 0{{$}}
+; CHECK-NEXT: return $pop{{[0-9]+}}{{$}}
; OPT-LABEL: diamond:
; OPT: block{{$}}
; OPT: block{{$}}
-; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT: br 1{{$}}
; OPT: .LBB5_2:
; OPT: .LBB5_3:
-; OPT: return ${{[0-9]+}}{{$}}
+; OPT: i32.const $push{{[0-9]+}}=, 0{{$}}
+; OPT-NEXT: return $pop{{[0-9]+}}{{$}}
define i32 @diamond(i32* %p, i32 %a) {
entry:
%c = icmp eq i32 %a, 0
@@ -251,13 +275,13 @@ entry:
; CHECK-LABEL: minimal_loop:
; CHECK-NOT: br
; CHECK: .LBB7_1:
-; CHECK: i32.store $discard=, 0($0), $pop{{[0-9]+}}{{$}}
+; CHECK: i32.store $drop=, 0($0), $pop{{[0-9]+}}{{$}}
; CHECK: br 0{{$}}
; CHECK: .LBB7_2:
; OPT-LABEL: minimal_loop:
; OPT-NOT: br
; OPT: .LBB7_1:
-; OPT: i32.store $discard=, 0($0), $pop{{[0-9]+}}{{$}}
+; OPT: i32.store $drop=, 0($0), $pop{{[0-9]+}}{{$}}
; OPT: br 0{{$}}
; OPT: .LBB7_2:
define i32 @minimal_loop(i32* %p) {
@@ -273,16 +297,18 @@ loop:
; CHECK-NOT: br
; CHECK: .LBB8_1:
; CHECK: loop{{$}}
-; CHECK: br_if $pop{{[0-9]+}}, 0{{$}}
+; CHECK: br_if 0, $pop{{[0-9]+}}{{$}}
; CHECK-NEXT: end_loop{{$}}
-; CHECK: return ${{[0-9]+}}{{$}}
+; CHECK: i32.const $push{{[0-9]+}}=, 0{{$}}
+; CHECK-NEXT: return $pop{{[0-9]+}}{{$}}
; OPT-LABEL: simple_loop:
; OPT-NOT: br
; OPT: .LBB8_1:
; OPT: loop{{$}}
-; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT-NEXT: end_loop{{$}}
-; OPT: return ${{[0-9]+}}{{$}}
+; OPT: i32.const $push{{[0-9]+}}=, 0{{$}}
+; OPT-NEXT: return $pop{{[0-9]+}}{{$}}
define i32 @simple_loop(i32* %p, i32 %a) {
entry:
%c = icmp eq i32 %a, 0
@@ -298,17 +324,17 @@ exit:
; CHECK-LABEL: doubletriangle:
; CHECK: block{{$}}
-; CHECK: br_if $0, 0{{$}}
+; CHECK: br_if 0, $0{{$}}
; CHECK: block{{$}}
-; CHECK: br_if $1, 0{{$}}
+; CHECK: br_if 0, $1{{$}}
; CHECK: .LBB9_3:
; CHECK: .LBB9_4:
; CHECK: return ${{[0-9]+}}{{$}}
; OPT-LABEL: doubletriangle:
; OPT: block{{$}}
-; OPT: br_if $0, 0{{$}}
+; OPT: br_if 0, $0{{$}}
; OPT: block{{$}}
-; OPT: br_if $1, 0{{$}}
+; OPT: br_if 0, $1{{$}}
; OPT: .LBB9_3:
; OPT: .LBB9_4:
; OPT: return ${{[0-9]+}}{{$}}
@@ -335,21 +361,23 @@ exit:
; CHECK-LABEL: ifelse_earlyexits:
; CHECK: block{{$}}
; CHECK: block{{$}}
-; CHECK: br_if $0, 0{{$}}
+; CHECK: br_if 0, $0{{$}}
; CHECK: br 1{{$}}
; CHECK: .LBB10_2:
-; CHECK: br_if $1, 0{{$}}
+; CHECK: br_if 0, $1{{$}}
; CHECK: .LBB10_4:
-; CHECK: return ${{[0-9]+}}{{$}}
+; CHECK: i32.const $push{{[0-9]+}}=, 0{{$}}
+; CHECK-NEXT: return $pop{{[0-9]+}}{{$}}
; OPT-LABEL: ifelse_earlyexits:
; OPT: block{{$}}
; OPT: block{{$}}
-; OPT: br_if {{[^,]*}}, 0{{$}}
-; OPT: br_if $1, 1{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
+; OPT: br_if 1, $1{{$}}
; OPT: br 1{{$}}
; OPT: .LBB10_3:
; OPT: .LBB10_4:
-; OPT: return ${{[0-9]+}}{{$}}
+; OPT: i32.const $push{{[0-9]+}}=, 0{{$}}
+; OPT-NEXT: return $pop{{[0-9]+}}{{$}}
define i32 @ifelse_earlyexits(i32 %a, i32 %b, i32* %p) {
entry:
%c = icmp eq i32 %a, 0
@@ -374,36 +402,32 @@ exit:
; CHECK: .LBB11_1:
; CHECK: loop{{$}}
; CHECK: block{{$}}
-; CHECK: block{{$}}
-; CHECK: br_if $0, 0{{$}}
+; CHECK: br_if 0, $0{{$}}
; CHECK: br 1{{$}}
; CHECK: .LBB11_3:
+; CHECK: end_block{{$}}
; CHECK: block{{$}}
-; CHECK: br_if $1, 0{{$}}
+; CHECK: br_if 0, $1{{$}}
; CHECK: br 1{{$}}
; CHECK: .LBB11_5:
-; CHECK: .LBB11_6:
; CHECK: br 0{{$}}
-; CHECK: .LBB11_7:
+; CHECK: .LBB11_6:
; CHECK-NEXT: end_loop{{$}}
; OPT-LABEL: doublediamond_in_a_loop:
-; OPT: .LBB11_1:
-; OPT: loop{{$}}
-; OPT: block{{$}}
-; OPT: block{{$}}
-; OPT: br_if {{[^,]*}}, 0{{$}}
-; OPT: block{{$}}
-; OPT: br_if {{[^,]*}}, 0{{$}}
-; OPT: br 2{{$}}
-; OPT: .LBB11_4:
-; OPT-NEXT: end_block{{$}}
-; OPT: br 1{{$}}
-; OPT: .LBB11_5:
+; OPT: .LBB11_1:
+; OPT: loop{{$}}
+; OPT: block{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
+; OPT: block{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
+; OPT: br 2{{$}}
+; OPT-NEXT: .LBB11_4:
; OPT-NEXT: end_block{{$}}
-; OPT: .LBB11_6:
+; OPT: br 1{{$}}
+; OPT: .LBB11_5:
; OPT-NEXT: end_block{{$}}
-; OPT: br 0{{$}}
-; OPT: .LBB11_7:
+; OPT: br 0{{$}}
+; OPT: .LBB11_6:
; OPT-NEXT: end_loop{{$}}
define i32 @doublediamond_in_a_loop(i32 %a, i32 %b, i32* %p) {
entry:
@@ -438,10 +462,26 @@ exit:
; CHECK-NEXT: .LBB{{[0-9]+}}_{{[0-9]+}}:
; CHECK-NEXT: loop
; OPT-LABEL: test3:
-; OPT: loop
+; OPT: block
+; OPT: br_if
+; OPT: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; OPT-NEXT: loop
+; OPT-NEXT: block
+; OPT-NEXT: block
; OPT-NEXT: br_if
; OPT-NEXT: .LBB{{[0-9]+}}_{{[0-9]+}}:
; OPT-NEXT: loop
+; OPT: br_if
+; OPT-NEXT: br
+; OPT-NEXT: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; OPT-NEXT: end_loop
+; OPT-NEXT: end_block
+; OPT-NEXT: unreachable
+; OPT-NEXT: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; OPT-NEXT: end_block
+; OPT: br
+; OPT-NEXT: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; OPT-NEXT: end_loop
declare void @bar()
define void @test3(i32 %w) {
entry:
@@ -475,44 +515,36 @@ if.end:
; CHECK-NEXT: .param i32{{$}}
; CHECK: block{{$}}
; CHECK-NEXT: block{{$}}
-; CHECK-NEXT: block{{$}}
-; CHECK: br_if $pop{{[0-9]*}}, 0{{$}}
-; CHECK-NEXT: block{{$}}
-; CHECK: br_if $pop{{[0-9]*}}, 0{{$}}
-; CHECK: br_if $pop{{[0-9]*}}, 2{{$}}
+; CHECK: br_if 0, $pop{{[0-9]+}}{{$}}
+; CHECK: br_if 1, $pop{{[0-9]+}}{{$}}
+; CHECK: br 1{{$}}
; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: end_block{{$}}
-; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: .LBB13_4:
-; CHECK: br_if $pop{{[0-9]*}}, 1{{$}}
-; CHECK: br_if $pop{{[0-9]*}}, 0{{$}}
-; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: .LBB13_7:
+; CHECK-NEXT: block{{$}}
+; CHECK: br_if 0, $pop{{[0-9]+}}{{$}}
+; CHECK: br_if 1, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: .LBB13_5:
; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: .LBB13_8:
+; CHECK-NEXT: .LBB13_6:
; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return{{$}}
; OPT-LABEL: test4:
; OPT-NEXT: .param i32{{$}}
; OPT: block{{$}}
; OPT-NEXT: block{{$}}
-; OPT-NEXT: block{{$}}
-; OPT: br_if $pop{{[0-9]*}}, 0{{$}}
-; OPT-NEXT: block{{$}}
-; OPT: br_if $pop{{[0-9]*}}, 0{{$}}
-; OPT: br_if $pop{{[0-9]*}}, 2{{$}}
+; OPT: br_if 0, $pop{{[0-9]+}}{{$}}
+; OPT: br_if 1, $pop{{[0-9]+}}{{$}}
+; OPT: br 1{{$}}
; OPT-NEXT: .LBB13_3:
; OPT-NEXT: end_block{{$}}
-; OPT-NEXT: return{{$}}
-; OPT-NEXT: .LBB13_4:
-; OPT: br_if $pop{{[0-9]*}}, 1{{$}}
-; OPT: br_if $pop{{[0-9]*}}, 0{{$}}
-; OPT-NEXT: return{{$}}
-; OPT-NEXT: .LBB13_7:
+; OPT-NEXT: block{{$}}
+; OPT: br_if 0, $pop{{[0-9]+}}{{$}}
+; OPT: br_if 1, $pop{{[0-9]+}}{{$}}
+; OPT-NEXT: .LBB13_5:
; OPT-NEXT: end_block{{$}}
; OPT-NEXT: return{{$}}
-; OPT-NEXT: .LBB13_8:
+; OPT-NEXT: .LBB13_6:
; OPT-NEXT: end_block{{$}}
; OPT-NEXT: return{{$}}
define void @test4(i32 %t) {
@@ -544,8 +576,8 @@ default:
; CHECK: .LBB14_1:
; CHECK-NEXT: block{{$}}
; CHECK-NEXT: loop{{$}}
-; CHECK: br_if {{[^,]*}}, 2{{$}}
-; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK: br_if 2, {{[^,]+}}{{$}}
+; CHECK: br_if 0, {{[^,]+}}{{$}}
; CHECK-NEXT: end_loop{{$}}
; CHECK: return{{$}}
; CHECK-NEXT: .LBB14_4:
@@ -554,8 +586,8 @@ default:
; OPT: .LBB14_1:
; OPT-NEXT: block{{$}}
; OPT-NEXT: loop{{$}}
-; OPT: br_if {{[^,]*}}, 2{{$}}
-; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if 2, {{[^,]+}}{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT-NEXT: end_loop{{$}}
; OPT: return{{$}}
; OPT-NEXT: .LBB14_4:
@@ -591,11 +623,11 @@ return:
; CHECK-NEXT: block{{$}}
; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 3{{$}}
+; CHECK: br_if 3, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 2{{$}}
+; CHECK: br_if 2, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK: br_if 0, {{[^,]+}}{{$}}
; CHECK-NEXT: end_loop{{$}}
; CHECK-NOT: block
; CHECK: return{{$}}
@@ -612,11 +644,11 @@ return:
; OPT-NEXT: block{{$}}
; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 3{{$}}
+; OPT: br_if 3, {{[^,]+}}{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 2{{$}}
+; OPT: br_if 2, {{[^,]+}}{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT-NEXT: end_loop{{$}}
; OPT-NOT: block
; OPT: return{{$}}
@@ -664,34 +696,38 @@ second:
; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
; CHECK: block{{$}}
-; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK: br_if 0, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK: br_if 1, {{[^,]+}}{{$}}
; CHECK-NOT: block
; CHECK: unreachable
; CHECK-NEXT: .LBB16_4:
; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK: br_if 0, {{[^,]+}}{{$}}
; CHECK-NEXT: end_loop{{$}}
; CHECK-NOT: block
; CHECK: unreachable
; OPT-LABEL: test7:
; OPT: .LBB16_1:
+; OPT-NEXT: block
; OPT-NEXT: loop{{$}}
; OPT-NOT: block
; OPT: block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT: br_if 1, {{[^,]+}}{{$}}
+; OPT: br 3{{$}}
+; OPT-NEXT: .LBB16_3:
+; OPT-NEXT: end_block
; OPT-NOT: block
-; OPT: unreachable
-; OPT-NEXT: .LBB16_4:
-; OPT-NEXT: end_block{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
+; OPT-NEXT: end_loop
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 0{{$}}
-; OPT-NEXT: end_loop{{$}}
+; OPT: unreachable
+; OPT-NEXT: .LBB16_5:
+; OPT-NEXT: end_block
; OPT-NOT: block
; OPT: unreachable
define void @test7(i1 %tobool2, i1 %tobool9) {
@@ -725,31 +761,19 @@ u1:
; CHECK-LABEL: test8:
; CHECK: .LBB17_1:
; CHECK-NEXT: loop{{$}}
-; CHECK-NEXT: block{{$}}
-; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 0{{$}}
-; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 1{{$}}
-; CHECK-NEXT: .LBB17_3:
-; CHECK-NEXT: end_block{{$}}
-; CHECK-NEXT: loop{{$}}
-; CHECK-NEXT: br_if {{[^,]*}}, 0{{$}}
-; CHECK-NEXT: br 2{{$}}
-; CHECK-NEXT: .LBB17_4:
+; CHECK-NEXT: i32.const $push{{[^,]+}}, 0{{$}}
+; CHECK-NEXT: br_if 0, {{[^,]+}}{{$}}
+; CHECK-NEXT: br 0{{$}}
+; CHECK-NEXT: .LBB17_2:
+; CHECK-NEXT: end_loop{{$}}
; OPT-LABEL: test8:
; OPT: .LBB17_1:
; OPT-NEXT: loop{{$}}
-; OPT-NEXT: block{{$}}
-; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 0{{$}}
-; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 1{{$}}
-; OPT-NEXT: .LBB17_3:
-; OPT-NEXT: end_block{{$}}
-; OPT-NEXT: loop{{$}}
-; OPT-NEXT: br_if {{[^,]*}}, 0{{$}}
-; OPT-NEXT: br 2{{$}}
-; OPT-NEXT: .LBB17_4:
+; OPT-NEXT: i32.const $push{{[^,]+}}, 0{{$}}
+; OPT-NEXT: br_if 0, {{[^,]+}}{{$}}
+; OPT-NEXT: br 0{{$}}
+; OPT-NEXT: .LBB17_2:
+; OPT-NEXT: end_loop{{$}}
define i32 @test8() {
bb:
br label %bb1
@@ -774,21 +798,21 @@ bb3:
; CHECK: .LBB18_1:
; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK: br_if 1, {{[^,]+}}{{$}}
; CHECK-NEXT: .LBB18_2:
; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
; CHECK: block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK: br_if 0, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 1{{$}}
-; CHECK-NEXT: br 3{{$}}
+; CHECK: br_if 3, {{[^,]+}}{{$}}
+; CHECK-NEXT: br 1{{$}}
; CHECK-NEXT: .LBB18_4:
; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 0{{$}}
-; CHECK-NEXT: br 2{{$}}
+; CHECK: br_if 2, {{[^,]+}}{{$}}
+; CHECK-NEXT: br 0{{$}}
; CHECK-NEXT: .LBB18_5:
; CHECK-NOT: block
; CHECK: return{{$}}
@@ -796,20 +820,20 @@ bb3:
; OPT: .LBB18_1:
; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT: br_if 1, {{[^,]+}}{{$}}
; OPT-NEXT: .LBB18_2:
; OPT-NEXT: loop{{$}}
; OPT-NOT: block
; OPT: block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT: br_if 1, {{[^,]+}}{{$}}
; OPT-NEXT: br 3{{$}}
; OPT-NEXT: .LBB18_4:
; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT-NEXT: br 2{{$}}
; OPT-NEXT: .LBB18_5:
; OPT-NOT: block
@@ -852,50 +876,50 @@ end:
; CHECK: .LBB19_1:
; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 0{{$}}
-; CHECK-NEXT: .LBB19_2:
+; CHECK: br_if 0, {{[^,]+}}{{$}}
+; CHECK: .LBB19_3:
; CHECK-NEXT: block{{$}}
; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: .LBB19_3:
+; CHECK: .LBB19_4:
; CHECK-NEXT: loop{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 5{{$}}
+; CHECK: br_if 5, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: tableswitch {{[^,]*}}, 0, 0, 1, 5, 2, 4{{$}}
-; CHECK-NEXT: .LBB19_5:
+; CHECK: br_table {{[^,]+}}, 0, 1, 5, 2, 4, 0{{$}}
+; CHECK-NEXT: .LBB19_6:
; CHECK-NEXT: end_loop{{$}}
; CHECK-NEXT: end_loop{{$}}
; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: .LBB19_6:
+; CHECK-NEXT: .LBB19_7:
; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
; CHECK: br 0{{$}}
-; CHECK-NEXT: .LBB19_7:
+; CHECK-NEXT: .LBB19_8:
; OPT-LABEL: test10:
; OPT: .LBB19_1:
; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 0{{$}}
-; OPT-NEXT: .LBB19_2:
+; OPT: br_if 0, {{[^,]+}}{{$}}
+; OPT: .LBB19_3:
; OPT-NEXT: block{{$}}
; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: .LBB19_3:
+; OPT: .LBB19_4:
; OPT-NEXT: loop{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 5{{$}}
+; OPT: br_if 5, {{[^,]+}}{{$}}
; OPT-NOT: block
-; OPT: tableswitch {{[^,]*}}, 0, 0, 1, 5, 2, 4{{$}}
-; OPT-NEXT: .LBB19_5:
+; OPT: br_table {{[^,]+}}, 0, 1, 5, 2, 4, 0{{$}}
+; OPT-NEXT: .LBB19_6:
; OPT-NEXT: end_loop{{$}}
; OPT-NEXT: end_loop{{$}}
; OPT-NEXT: return{{$}}
-; OPT-NEXT: .LBB19_6:
+; OPT-NEXT: .LBB19_7:
; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
; OPT: br 0{{$}}
-; OPT-NEXT: .LBB19_7:
+; OPT-NEXT: .LBB19_8:
define void @test10() {
bb0:
br label %bb1
@@ -938,12 +962,12 @@ bb6:
; CHECK-NEXT: block{{$}}
; CHECK-NEXT: block{{$}}
; CHECK-NEXT: block{{$}}
-; CHECK-NEXT: br_if {{[^,]*}}, 0{{$}}
-; CHECK-NEXT: block{{$}}
+; CHECK: br_if 0, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK: block{{$}}
+; CHECK-NEXT: br_if 0, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 2{{$}}
+; CHECK: br_if 2, {{[^,]+}}{{$}}
; CHECK-NEXT: .LBB20_3:
; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
@@ -951,9 +975,9 @@ bb6:
; CHECK-NEXT: .LBB20_4:
; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 2{{$}}
+; CHECK: br_if 1, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK: br_if 2, {{[^,]+}}{{$}}
; CHECK-NEXT: .LBB20_6:
; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
@@ -969,12 +993,12 @@ bb6:
; OPT-LABEL: test11:
; OPT: block{{$}}
; OPT-NEXT: block{{$}}
-; OPT-NEXT: br_if $0, 0{{$}}
-; OPT-NEXT: block{{$}}
+; OPT: br_if 0, $pop{{[0-9]+}}{{$}}
; OPT-NOT: block
-; OPT: br_if $0, 0{{$}}
+; OPT: block{{$}}
+; OPT-NEXT: br_if 0, $0{{$}}
; OPT-NOT: block
-; OPT: br_if $0, 2{{$}}
+; OPT: br_if 2, {{[^,]+}}{{$}}
; OPT-NEXT: .LBB20_3:
; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
@@ -984,13 +1008,13 @@ bb6:
; OPT-NOT: block
; OPT: block{{$}}
; OPT-NOT: block
-; OPT: br_if $pop9, 0{{$}}
+; OPT: br_if 0, $pop{{[0-9]+}}{{$}}
; OPT-NOT: block
; OPT: return{{$}}
; OPT-NEXT: .LBB20_6:
; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br_if $0, 0{{$}}
+; OPT: br_if 0, $pop{{[0-9]+}}{{$}}
; OPT-NOT: block
; OPT: return{{$}}
; OPT-NEXT: .LBB20_8:
@@ -1033,54 +1057,49 @@ bb8:
; CHECK-NOT: block
; CHECK: block{{$}}
; CHECK-NEXT: block{{$}}
-; CHECK-NEXT: block{{$}}
-; CHECK: br_if {{[^,]*}}, 0{{$}}
+; CHECK: br_if 0, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 2{{$}}
+; CHECK: br_if 1, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 2{{$}}
-; CHECK-NEXT: br 1{{$}}
+; CHECK: br_if 1, {{[^,]+}}{{$}}
+; CHECK-NEXT: br 3{{$}}
; CHECK-NEXT: .LBB21_4:
; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK: br_if 0, {{[^,]+}}{{$}}
; CHECK-NOT: block
-; CHECK: br_if {{[^,]*}}, 1{{$}}
+; CHECK: br_if 2, {{[^,]+}}{{$}}
; CHECK-NEXT: .LBB21_6:
; CHECK-NEXT: end_block{{$}}
-; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: .LBB21_7:
-; CHECK-NEXT: end_block{{$}}
; CHECK-NOT: block
; CHECK: br 0{{$}}
-; CHECK-NEXT: .LBB21_8:
+; CHECK-NEXT: .LBB21_7:
+; CHECK-NEXT: end_loop{{$}}
+; CHECK-NEXT: return{{$}}
; OPT-LABEL: test12:
; OPT: .LBB21_1:
; OPT-NEXT: loop{{$}}
; OPT-NOT: block
; OPT: block{{$}}
; OPT-NEXT: block{{$}}
-; OPT-NEXT: block{{$}}
-; OPT: br_if {{[^,]*}}, 0{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 2{{$}}
+; OPT: br_if 1, {{[^,]+}}{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 2{{$}}
-; OPT-NEXT: br 1{{$}}
+; OPT: br_if 1, {{[^,]+}}{{$}}
+; OPT-NEXT: br 3{{$}}
; OPT-NEXT: .LBB21_4:
; OPT-NEXT: end_block{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT: br_if 0, {{[^,]+}}{{$}}
; OPT-NOT: block
-; OPT: br_if {{[^,]*}}, 1{{$}}
+; OPT: br_if 2, {{[^,]+}}{{$}}
; OPT-NEXT: .LBB21_6:
; OPT-NEXT: end_block{{$}}
-; OPT-NEXT: return{{$}}
-; OPT-NEXT: .LBB21_7:
-; OPT-NEXT: end_block{{$}}
-; OPT-NOT: block
; OPT: br 0{{$}}
-; OPT-NEXT: .LBB21_8:
+; OPT-NEXT: .LBB21_7:
+; OPT-NEXT: end_loop{{$}}
+; OPT-NEXT: return{{$}}
define void @test12(i8* %arg) {
bb:
br label %bb1
@@ -1109,33 +1128,37 @@ bb7:
; optnone to disable optimizations to test this case.
; CHECK-LABEL: test13:
-; CHECK-NEXT: local i32{{$}}
+; CHECK-NEXT: .local i32{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: block{{$}}
+; CHECK: br_if 0, $pop0{{$}}
; CHECK: block{{$}}
-; CHECK: br_if $pop4, 0{{$}}
-; CHECK-NEXT: return{{$}}
-; CHECK-NEXT: .LBB22_2:
+; CHECK: br_if 0, $pop3{{$}}
+; CHECK: .LBB22_3:
; CHECK-NEXT: end_block{{$}}
-; CHECK: block{{$}}
-; CHECK-NEXT: br_if $0, 0{{$}}
-; CHECK: .LBB22_4:
+; CHECK: br_if 1, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: br 1{{$}}
+; CHECK-NEXT: .LBB22_4:
; CHECK-NEXT: end_block{{$}}
-; CHECK: block{{$}}
-; CHECK: br_if $pop6, 0{{$}}
+; CHECK-NEXT: return{{$}}
+; CHECK-NEXT: .LBB22_5:
; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: unreachable{{$}}
; OPT-LABEL: test13:
-; OPT-NEXT: local i32{{$}}
-; OPT: block{{$}}
-; OPT: br_if $pop4, 0{{$}}
-; OPT-NEXT: return{{$}}
-; OPT-NEXT: .LBB22_2:
-; OPT-NEXT: end_block{{$}}
+; OPT-NEXT: .local i32{{$}}
+; OPT-NEXT: block{{$}}
+; OPT-NEXT: block{{$}}
+; OPT: br_if 0, $pop0{{$}}
; OPT: block{{$}}
-; OPT-NEXT: br_if $0, 0{{$}}
-; OPT: .LBB22_4:
+; OPT: br_if 0, $pop3{{$}}
+; OPT: .LBB22_3:
; OPT-NEXT: end_block{{$}}
-; OPT: block{{$}}
-; OPT: br_if $pop6, 0{{$}}
+; OPT: br_if 1, $pop{{[0-9]+}}{{$}}
+; OPT-NEXT: br 1{{$}}
+; OPT-NEXT: .LBB22_4:
+; OPT-NEXT: end_block
+; OPT-NEXT: return
+; OPT-NEXT: .LBB22_5:
; OPT-NEXT: end_block{{$}}
; OPT-NEXT: unreachable{{$}}
define void @test13() noinline optnone {
@@ -1159,15 +1182,15 @@ bb5:
; before the loop for the second.
; CHECK-LABEL: test14:
-; CHECK-NEXT: local i32{{$}}
-; CHECK-NEXT: i32.const $0=, 0{{$}}
; CHECK-NEXT: .LBB23_1:{{$}}
; CHECK-NEXT: loop{{$}}
-; CHECK-NEXT: br_if $0, 0{{$}}
-; CHECK-NEXT: .LBB23_2:{{$}}
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: br_if 0, $pop0{{$}}
; CHECK-NEXT: end_loop{{$}}
+; CHECK-NEXT: .LBB23_3:{{$}}
; CHECK-NEXT: loop{{$}}
-; CHECK-NEXT: br_if $0, 0{{$}}
+; CHECK-NEXT: i32.const $push1=, 0{{$}}
+; CHECK-NEXT: br_if 0, $pop1{{$}}
; CHECK-NEXT: end_loop{{$}}
; CHECK-NEXT: return{{$}}
define void @test14() {
@@ -1215,3 +1238,81 @@ bb48:
bb50:
ret void
}
+
+; Test that a block boundary which ends one block, begins another block, and
+; also begins a loop has the markers placed in the correct order.
+
+; CHECK-LABEL: test15:
+; CHECK: block
+; CHECK-NEXT: block
+; CHECK: br_if 0, $pop{{.*}}{{$}}
+; CHECK: .LBB24_2:
+; CHECK-NEXT: block{{$}}
+; CHECK-NEXT: loop{{$}}
+; CHECK: br_if 1, $pop{{.*}}{{$}}
+; CHECK: br_if 0, ${{.*}}{{$}}
+; CHECK-NEXT: br 2{{$}}
+; CHECK-NEXT: .LBB24_4:
+; CHECK-NEXT: end_loop{{$}}
+; CHECK: .LBB24_5:
+; CHECK-NEXT: end_block{{$}}
+; CHECK: br_if 1, $pop{{.*}}{{$}}
+; CHECK: return{{$}}
+; CHECK: .LBB24_7:
+; CHECK-NEXT: end_block{{$}}
+; CHECK: .LBB24_8:
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: return{{$}}
+; OPT-LABEL: test15:
+; OPT: block
+; OPT: block
+; OPT-NEXT: i32.const $push
+; OPT-NEXT: i32.eqz $push{{.*}}=, $pop{{.*}}{{$}}
+; OPT-NEXT: br_if 0, $pop{{.*}}{{$}}
+; OPT-NEXT: call test15_callee1@FUNCTION{{$}}
+; OPT-NEXT: br 1{{$}}
+; OPT-NEXT: .LBB24_2:
+; OPT-NEXT: end_block
+; OPT-NEXT: i32.const
+; OPT-NEXT: .LBB24_3:
+; OPT-NEXT: block
+; OPT-NEXT: loop
+%0 = type { i8, i32 }
+declare void @test15_callee0()
+declare void @test15_callee1()
+define void @test15() {
+bb:
+ %tmp1 = icmp eq i8 1, 0
+ br i1 %tmp1, label %bb2, label %bb14
+
+bb2:
+ %tmp3 = phi %0** [ %tmp6, %bb5 ], [ null, %bb ]
+ %tmp4 = icmp eq i32 0, 11
+ br i1 %tmp4, label %bb5, label %bb8
+
+bb5:
+ %tmp = bitcast i8* null to %0**
+ %tmp6 = getelementptr %0*, %0** %tmp3, i32 1
+ %tmp7 = icmp eq %0** %tmp6, null
+ br i1 %tmp7, label %bb10, label %bb2
+
+bb8:
+ %tmp9 = icmp eq %0** null, undef
+ br label %bb10
+
+bb10:
+ %tmp11 = phi %0** [ null, %bb8 ], [ %tmp, %bb5 ]
+ %tmp12 = icmp eq %0** null, %tmp11
+ br i1 %tmp12, label %bb15, label %bb13
+
+bb13:
+ call void @test15_callee0()
+ ret void
+
+bb14:
+ call void @test15_callee1()
+ ret void
+
+bb15:
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/comparisons_f32.ll b/test/CodeGen/WebAssembly/comparisons_f32.ll
index 2d324f7f2083..10e037d57a7a 100644
--- a/test/CodeGen/WebAssembly/comparisons_f32.ll
+++ b/test/CodeGen/WebAssembly/comparisons_f32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic 32-bit floating-point comparison operations assemble as
; expected.
diff --git a/test/CodeGen/WebAssembly/comparisons_f64.ll b/test/CodeGen/WebAssembly/comparisons_f64.ll
index 22fbc1ae4c1f..7d038a09ccbf 100644
--- a/test/CodeGen/WebAssembly/comparisons_f64.ll
+++ b/test/CodeGen/WebAssembly/comparisons_f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic 64-bit floating-point comparison operations assemble as
; expected.
diff --git a/test/CodeGen/WebAssembly/comparisons_i32.ll b/test/CodeGen/WebAssembly/comparisons_i32.ll
index db81ef36e270..d2ba73f79a3d 100644
--- a/test/CodeGen/WebAssembly/comparisons_i32.ll
+++ b/test/CodeGen/WebAssembly/comparisons_i32.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -fast-isel -fast-isel-abort=1 | FileCheck %s
; Test that basic 32-bit integer comparison operations assemble as expected.
diff --git a/test/CodeGen/WebAssembly/comparisons_i64.ll b/test/CodeGen/WebAssembly/comparisons_i64.ll
index 19e5cf8603bf..80950ae5cd9a 100644
--- a/test/CodeGen/WebAssembly/comparisons_i64.ll
+++ b/test/CodeGen/WebAssembly/comparisons_i64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -fast-isel -fast-isel-abort=1 | FileCheck %s
; Test that basic 64-bit integer comparison operations assemble as expected.
diff --git a/test/CodeGen/WebAssembly/conv.ll b/test/CodeGen/WebAssembly/conv.ll
index 1a4bd72d72d6..27cebb117dd4 100644
--- a/test/CodeGen/WebAssembly/conv.ll
+++ b/test/CodeGen/WebAssembly/conv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic conversion operations assemble as expected.
diff --git a/test/CodeGen/WebAssembly/cpus.ll b/test/CodeGen/WebAssembly/cpus.ll
index 51856fcd12c2..78aee0f59d92 100644
--- a/test/CodeGen/WebAssembly/cpus.ll
+++ b/test/CodeGen/WebAssembly/cpus.ll
@@ -9,8 +9,8 @@
; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
-; CHECK-NOT: {{.*}} is not a recognized processor for this target
-; INVALID: {{.*}} is not a recognized processor for this target
+; CHECK-NOT: is not a recognized processor for this target
+; INVALID: {{.+}} is not a recognized processor for this target
define i32 @f(i32 %i_like_the_web) {
ret i32 %i_like_the_web
diff --git a/test/CodeGen/WebAssembly/dead-vreg.ll b/test/CodeGen/WebAssembly/dead-vreg.ll
index 29a41990961d..190a08564001 100644
--- a/test/CodeGen/WebAssembly/dead-vreg.ll
+++ b/test/CodeGen/WebAssembly/dead-vreg.ll
@@ -8,7 +8,7 @@ target triple = "wasm32-unknown-unknown"
define void @foo(i32* nocapture %a, i32 %w, i32 %h) {
; CHECK-LABEL: foo:
; CHECK-NEXT: .param i32, i32, i32{{$}}
-; CHECK-NEXT: .local i32, i32, i32, i32, i32, i32, i32{{$}}
+; CHECK-NEXT: .local i32, i32, i32, i32, i32, i32{{$}}
entry:
%cmp.19 = icmp sgt i32 %h, 0
br i1 %cmp.19, label %for.cond.1.preheader.lr.ph, label %for.end.7
diff --git a/test/CodeGen/WebAssembly/divrem-constant.ll b/test/CodeGen/WebAssembly/divrem-constant.ll
new file mode 100644
index 000000000000..6150cab4d4fd
--- /dev/null
+++ b/test/CodeGen/WebAssembly/divrem-constant.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+; Test that integer div and rem by constant are optimized appropriately.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: test_udiv_2:
+; CHECK: i32.shr_u
+define i32 @test_udiv_2(i32 %x) {
+ %t = udiv i32 %x, 2
+ ret i32 %t
+}
+
+; CHECK-LABEL: test_udiv_5:
+; CHECK: i32.div_u
+define i32 @test_udiv_5(i32 %x) {
+ %t = udiv i32 %x, 5
+ ret i32 %t
+}
+
+; CHECK-LABEL: test_sdiv_2:
+; CHECK: i32.div_s
+define i32 @test_sdiv_2(i32 %x) {
+ %t = sdiv i32 %x, 2
+ ret i32 %t
+}
+
+; CHECK-LABEL: test_sdiv_5:
+; CHECK: i32.div_s
+define i32 @test_sdiv_5(i32 %x) {
+ %t = sdiv i32 %x, 5
+ ret i32 %t
+}
+
+; CHECK-LABEL: test_urem_2:
+; CHECK: i32.and
+define i32 @test_urem_2(i32 %x) {
+ %t = urem i32 %x, 2
+ ret i32 %t
+}
+
+; CHECK-LABEL: test_urem_5:
+; CHECK: i32.rem_u
+define i32 @test_urem_5(i32 %x) {
+ %t = urem i32 %x, 5
+ ret i32 %t
+}
+
+; CHECK-LABEL: test_srem_2:
+; CHECK: i32.rem_s
+define i32 @test_srem_2(i32 %x) {
+ %t = srem i32 %x, 2
+ ret i32 %t
+}
+
+; CHECK-LABEL: test_srem_5:
+; CHECK: i32.rem_s
+define i32 @test_srem_5(i32 %x) {
+ %t = srem i32 %x, 5
+ ret i32 %t
+}
diff --git a/test/CodeGen/WebAssembly/f32.ll b/test/CodeGen/WebAssembly/f32.ll
index c32a7c3dc7d9..1c1d8191a987 100644
--- a/test/CodeGen/WebAssembly/f32.ll
+++ b/test/CodeGen/WebAssembly/f32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic 32-bit floating-point operations assemble as expected.
diff --git a/test/CodeGen/WebAssembly/f64.ll b/test/CodeGen/WebAssembly/f64.ll
index 92284999cbf7..670f3f0b6978 100644
--- a/test/CodeGen/WebAssembly/f64.ll
+++ b/test/CodeGen/WebAssembly/f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic 64-bit floating-point operations assemble as expected.
diff --git a/test/CodeGen/WebAssembly/fast-isel.ll b/test/CodeGen/WebAssembly/fast-isel.ll
index 7f9f20fa7083..d3ee77632bca 100644
--- a/test/CodeGen/WebAssembly/fast-isel.ll
+++ b/test/CodeGen/WebAssembly/fast-isel.ll
@@ -18,3 +18,31 @@ define float @immediate_f32() {
define double @immediate_f64() {
ret double 2.5
}
+
+; CHECK-LABEL: bitcast_i32_f32:
+; CHECK: i32.reinterpret/f32 $push{{[0-9]+}}=, $0{{$}}
+define i32 @bitcast_i32_f32(float %x) {
+ %y = bitcast float %x to i32
+ ret i32 %y
+}
+
+; CHECK-LABEL: bitcast_f32_i32:
+; CHECK: f32.reinterpret/i32 $push{{[0-9]+}}=, $0{{$}}
+define float @bitcast_f32_i32(i32 %x) {
+ %y = bitcast i32 %x to float
+ ret float %y
+}
+
+; CHECK-LABEL: bitcast_i64_f64:
+; CHECK: i64.reinterpret/f64 $push{{[0-9]+}}=, $0{{$}}
+define i64 @bitcast_i64_f64(double %x) {
+ %y = bitcast double %x to i64
+ ret i64 %y
+}
+
+; CHECK-LABEL: bitcast_f64_i64:
+; CHECK: f64.reinterpret/i64 $push{{[0-9]+}}=, $0{{$}}
+define double @bitcast_f64_i64(i64 %x) {
+ %y = bitcast i64 %x to double
+ ret double %y
+}
diff --git a/test/CodeGen/WebAssembly/frem.ll b/test/CodeGen/WebAssembly/frem.ll
index b8c80fbe6997..b8745224ab82 100644
--- a/test/CodeGen/WebAssembly/frem.ll
+++ b/test/CodeGen/WebAssembly/frem.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that the frem instruction works.
diff --git a/test/CodeGen/WebAssembly/func.ll b/test/CodeGen/WebAssembly/func.ll
index 9857dadee414..71c00a46de86 100644
--- a/test/CodeGen/WebAssembly/func.ll
+++ b/test/CodeGen/WebAssembly/func.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic functions assemble as expected.
@@ -44,7 +44,8 @@ define void @f3(i32 %p1, float %p2) {
; CHECK-LABEL: f4:
; CHECK-NEXT: .param i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: local
+; CHECK-NOT: local
+; CHECK: .size f4,
define i32 @f4(i32 %x) {
entry:
%c = trunc i32 %x to i1
diff --git a/test/CodeGen/WebAssembly/global.ll b/test/CodeGen/WebAssembly/global.ll
index 85fe5c896565..1d24035d8dd4 100644
--- a/test/CodeGen/WebAssembly/global.ll
+++ b/test/CodeGen/WebAssembly/global.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that globals assemble as expected.
@@ -21,8 +21,8 @@ define i32 @foo() {
; CHECK-LABEL: call_memcpy:
; CHECK-NEXT: .param i32, i32, i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: call memcpy@FUNCTION, $0, $1, $2{{$}}
-; CHECK-NEXT: return $0{{$}}
+; CHECK-NEXT: i32.call $push0=, memcpy@FUNCTION, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop0{{$}}
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
@@ -30,7 +30,7 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
}
; CHECK: .type .Lg,@object
-; CHECK: .align 2{{$}}
+; CHECK: .p2align 2{{$}}
; CHECK-NEXT: .Lg:
; CHECK-NEXT: .int32 1337{{$}}
; CHECK-NEXT: .size .Lg, 4{{$}}
@@ -50,28 +50,28 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
@z = internal global i32 0
; CHECK-NEXT: .type one,@object
-; CHECK-NEXT: .align 2{{$}}
+; CHECK-NEXT: .p2align 2{{$}}
; CHECK-NEXT: one:
; CHECK-NEXT: .int32 1{{$}}
; CHECK-NEXT: .size one, 4{{$}}
@one = internal global i32 1
; CHECK: .type answer,@object
-; CHECK: .align 2{{$}}
+; CHECK: .p2align 2{{$}}
; CHECK-NEXT: answer:
; CHECK-NEXT: .int32 42{{$}}
; CHECK-NEXT: .size answer, 4{{$}}
@answer = internal global i32 42
; CHECK: .type u32max,@object
-; CHECK: .align 2{{$}}
+; CHECK: .p2align 2{{$}}
; CHECK-NEXT: u32max:
; CHECK-NEXT: .int32 4294967295{{$}}
; CHECK-NEXT: .size u32max, 4{{$}}
@u32max = internal global i32 -1
; CHECK: .type ud64,@object
-; CHECK: .align 3{{$}}
+; CHECK: .p2align 3{{$}}
; CHECK-NEXT: ud64:
; CHECK-NEXT: .skip 8{{$}}
; CHECK-NEXT: .size ud64, 8{{$}}
@@ -86,21 +86,21 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
@z64 = internal global i64 0
; CHECK: .type twoP32,@object
-; CHECK: .align 3{{$}}
+; CHECK: .p2align 3{{$}}
; CHECK-NEXT: twoP32:
; CHECK-NEXT: .int64 4294967296{{$}}
; CHECK-NEXT: .size twoP32, 8{{$}}
@twoP32 = internal global i64 4294967296
; CHECK: .type u64max,@object
-; CHECK: .align 3{{$}}
+; CHECK: .p2align 3{{$}}
; CHECK-NEXT: u64max:
; CHECK-NEXT: .int64 -1{{$}}
; CHECK-NEXT: .size u64max, 8{{$}}
@u64max = internal global i64 -1
; CHECK: .type f32ud,@object
-; CHECK: .align 2{{$}}
+; CHECK: .p2align 2{{$}}
; CHECK-NEXT: f32ud:
; CHECK-NEXT: .skip 4{{$}}
; CHECK-NEXT: .size f32ud, 4{{$}}
@@ -115,21 +115,21 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
@f32z = internal global float 0.0
; CHECK: .type f32nz,@object
-; CHECK: .align 2{{$}}
+; CHECK: .p2align 2{{$}}
; CHECK: f32nz:
; CHECK: .int32 2147483648{{$}}
; CHECK: .size f32nz, 4{{$}}
@f32nz = internal global float -0.0
; CHECK: .type f32two,@object
-; CHECK: .align 2{{$}}
+; CHECK: .p2align 2{{$}}
; CHECK-NEXT: f32two:
; CHECK-NEXT: .int32 1073741824{{$}}
; CHECK-NEXT: .size f32two, 4{{$}}
@f32two = internal global float 2.0
; CHECK: .type f64ud,@object
-; CHECK: .align 3{{$}}
+; CHECK: .p2align 3{{$}}
; CHECK-NEXT: f64ud:
; CHECK-NEXT: .skip 8{{$}}
; CHECK-NEXT: .size f64ud, 8{{$}}
@@ -144,14 +144,14 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
@f64z = internal global double 0.0
; CHECK: .type f64nz,@object
-; CHECK: .align 3{{$}}
+; CHECK: .p2align 3{{$}}
; CHECK-NEXT: f64nz:
; CHECK-NEXT: .int64 -9223372036854775808{{$}}
; CHECK-NEXT: .size f64nz, 8{{$}}
@f64nz = internal global double -0.0
; CHECK: .type f64two,@object
-; CHECK: .align 3{{$}}
+; CHECK: .p2align 3{{$}}
; CHECK-NEXT: f64two:
; CHECK-NEXT: .int64 4611686018427387904{{$}}
; CHECK-NEXT: .size f64two, 8{{$}}
@@ -170,8 +170,22 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
; CHECK: .type rom,@object{{$}}
; CHECK: .section .rodata,"a",@progbits{{$}}
; CHECK: .globl rom{{$}}
-; CHECK: .align 4{{$}}
+; CHECK: .p2align 4{{$}}
; CHECK: rom:
; CHECK: .skip 512{{$}}
; CHECK: .size rom, 512{{$}}
@rom = constant [128 x i32] zeroinitializer, align 16
+
+; CHECK: .type array,@object
+; CHECK-NEXT: array:
+; CHECK-NEXT: .skip 8
+; CHECK-NEXT: .size array, 8
+; CHECK: .type pointer_to_array,@object
+; CHECK-NEXT: .section .data.rel.ro,"aw",@progbits
+; CHECK-NEXT: .globl pointer_to_array
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: pointer_to_array:
+; CHECK-NEXT: .int32 array+4
+; CHECK-NEXT: .size pointer_to_array, 4
+@array = internal constant [8 x i8] zeroinitializer, align 1
+@pointer_to_array = constant i8* getelementptr inbounds ([8 x i8], [8 x i8]* @array, i32 0, i32 4), align 4
diff --git a/test/CodeGen/WebAssembly/i128.ll b/test/CodeGen/WebAssembly/i128.ll
new file mode 100644
index 000000000000..29bf787863d5
--- /dev/null
+++ b/test/CodeGen/WebAssembly/i128.ll
@@ -0,0 +1,280 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+
+; Test that basic 128-bit integer operations assemble as expected.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare i128 @llvm.ctlz.i128(i128, i1)
+declare i128 @llvm.cttz.i128(i128, i1)
+declare i128 @llvm.ctpop.i128(i128)
+
+; CHECK-LABEL: add128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.add
+; CHECK: i64.store
+; CHECK: i64.add
+; CHECK: i64.store
+; CHECK-NEXT: return{{$}}
+define i128 @add128(i128 %x, i128 %y) {
+ %a = add i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: sub128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.sub
+; CHECK: i64.store
+; CHECK: i64.sub
+; CHECK: i64.store
+; CHECK-NEXT: return{{$}}
+define i128 @sub128(i128 %x, i128 %y) {
+ %a = sub i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: mul128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __multi3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @mul128(i128 %x, i128 %y) {
+ %a = mul i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: sdiv128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __divti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @sdiv128(i128 %x, i128 %y) {
+ %a = sdiv i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: udiv128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __udivti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @udiv128(i128 %x, i128 %y) {
+ %a = udiv i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: srem128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __modti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @srem128(i128 %x, i128 %y) {
+ %a = srem i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: urem128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __umodti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @urem128(i128 %x, i128 %y) {
+ %a = urem i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: and128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.and
+; CHECK: i64.store
+; CHECK: i64.and
+; CHECK: i64.store
+; CHECK-NEXT: return{{$}}
+define i128 @and128(i128 %x, i128 %y) {
+ %a = and i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: or128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.or
+; CHECK: i64.store
+; CHECK: i64.or
+; CHECK: i64.store
+; CHECK-NEXT: return{{$}}
+define i128 @or128(i128 %x, i128 %y) {
+ %a = or i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: xor128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.xor
+; CHECK: i64.store
+; CHECK: i64.xor
+; CHECK: i64.store
+; CHECK-NEXT: return{{$}}
+define i128 @xor128(i128 %x, i128 %y) {
+ %a = xor i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: shl128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @shl128(i128 %x, i128 %y) {
+ %a = shl i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: shr128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @shr128(i128 %x, i128 %y) {
+ %a = lshr i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: sar128:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __ashrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @sar128(i128 %x, i128 %y) {
+ %a = ashr i128 %x, %y
+ ret i128 %a
+}
+
+; CHECK-LABEL: clz128:
+; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.clz
+; CHECK: i64.clz
+; CHECK: return{{$}}
+define i128 @clz128(i128 %x) {
+ %a = call i128 @llvm.ctlz.i128(i128 %x, i1 false)
+ ret i128 %a
+}
+
+; CHECK-LABEL: clz128_zero_undef:
+; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.clz
+; CHECK: i64.clz
+; CHECK: return{{$}}
+define i128 @clz128_zero_undef(i128 %x) {
+ %a = call i128 @llvm.ctlz.i128(i128 %x, i1 true)
+ ret i128 %a
+}
+
+; CHECK-LABEL: ctz128:
+; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.ctz
+; CHECK: i64.ctz
+; CHECK: return{{$}}
+define i128 @ctz128(i128 %x) {
+ %a = call i128 @llvm.cttz.i128(i128 %x, i1 false)
+ ret i128 %a
+}
+
+; CHECK-LABEL: ctz128_zero_undef:
+; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.ctz
+; CHECK: i64.ctz
+; CHECK: return{{$}}
+define i128 @ctz128_zero_undef(i128 %x) {
+ %a = call i128 @llvm.cttz.i128(i128 %x, i1 true)
+ ret i128 %a
+}
+
+; CHECK-LABEL: popcnt128:
+; CHECK-NEXT: .param i32, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: i64.popcnt
+; CHECK: i64.popcnt
+; CHECK: return{{$}}
+define i128 @popcnt128(i128 %x) {
+ %a = call i128 @llvm.ctpop.i128(i128 %x)
+ ret i128 %a
+}
+
+; CHECK-LABEL: eqz128:
+; CHECK-NEXT: .param i64, i64{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK: i64.or
+; CHECK: i64.eqz
+; CHECK: return $
+define i32 @eqz128(i128 %x) {
+ %a = icmp eq i128 %x, 0
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: rotl:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @rotl(i128 %x, i128 %y) {
+ %z = sub i128 128, %y
+ %b = shl i128 %x, %y
+ %c = lshr i128 %x, %z
+ %d = or i128 %b, %c
+ ret i128 %d
+}
+
+; CHECK-LABEL: masked_rotl:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @masked_rotl(i128 %x, i128 %y) {
+ %a = and i128 %y, 127
+ %z = sub i128 128, %a
+ %b = shl i128 %x, %a
+ %c = lshr i128 %x, %z
+ %d = or i128 %b, %c
+ ret i128 %d
+}
+
+; CHECK-LABEL: rotr:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @rotr(i128 %x, i128 %y) {
+ %z = sub i128 128, %y
+ %b = lshr i128 %x, %y
+ %c = shl i128 %x, %z
+ %d = or i128 %b, %c
+ ret i128 %d
+}
+
+; CHECK-LABEL: masked_rotr:
+; CHECK-NEXT: .param i32, i64, i64, i64, i64{{$}}
+; CHECK-NOT: .result
+; CHECK: call __lshrti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: call __ashlti3@FUNCTION, ${{.+}}, ${{.+}}, ${{.+}}, ${{.+}}{{$}}
+; CHECK: return{{$}}
+define i128 @masked_rotr(i128 %x, i128 %y) {
+ %a = and i128 %y, 127
+ %z = sub i128 128, %a
+ %b = lshr i128 %x, %a
+ %c = shl i128 %x, %z
+ %d = or i128 %b, %c
+ ret i128 %d
+}
diff --git a/test/CodeGen/WebAssembly/i32-load-store-alignment.ll b/test/CodeGen/WebAssembly/i32-load-store-alignment.ll
new file mode 100644
index 000000000000..b254413d380f
--- /dev/null
+++ b/test/CodeGen/WebAssembly/i32-load-store-alignment.ll
@@ -0,0 +1,212 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+
+; Test loads and stores with custom alignment values.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
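+; The p2align attribute encodes the alignment as its base-2 logarithm:
+; align 1 is p2align=0 and align 2 is p2align=1. The attribute is omitted
+; when the alignment matches the access's natural alignment.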
+; CHECK-LABEL: ldi32_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i32 @ldi32_a1(i32 *%p) {
+ %v = load i32, i32* %p, align 1
+ ret i32 %v
+}
+
+; CHECK-LABEL: ldi32_a2:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0):p2align=1{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i32 @ldi32_a2(i32 *%p) {
+ %v = load i32, i32* %p, align 2
+ ret i32 %v
+}
+
+; 4 is the default alignment for i32 so no attribute is needed.
+
+; CHECK-LABEL: ldi32_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i32 @ldi32_a4(i32 *%p) {
+ %v = load i32, i32* %p, align 4
+ ret i32 %v
+}
+
+; The default alignment in LLVM is the same as the default alignment in wasm.
+
+; CHECK-LABEL: ldi32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i32 @ldi32(i32 *%p) {
+ %v = load i32, i32* %p
+ ret i32 %v
+}
+
+; 8 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: ldi32_a8:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i32 @ldi32_a8(i32 *%p) {
+ %v = load i32, i32* %p, align 8
+ ret i32 %v
+}
+
+; Extending loads.
+
+; CHECK-LABEL: ldi8_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load8_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i8 @ldi8_a1(i8 *%p) {
+ %v = load i8, i8* %p, align 1
+ ret i8 %v
+}
+
+; CHECK-LABEL: ldi8_a2:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load8_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i8 @ldi8_a2(i8 *%p) {
+ %v = load i8, i8* %p, align 2
+ ret i8 %v
+}
+
+; CHECK-LABEL: ldi16_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load16_u $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i16 @ldi16_a1(i16 *%p) {
+ %v = load i16, i16* %p, align 1
+ ret i16 %v
+}
+
+; CHECK-LABEL: ldi16_a2:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load16_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i16 @ldi16_a2(i16 *%p) {
+ %v = load i16, i16* %p, align 2
+ ret i16 %v
+}
+
+; CHECK-LABEL: ldi16_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.load16_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i16 @ldi16_a4(i16 *%p) {
+ %v = load i16, i16* %p, align 4
+ ret i16 %v
+}
+
+; Stores.
+
+; CHECK-LABEL: sti32_a1:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32_a1(i32 *%p, i32 %v) {
+ store i32 %v, i32* %p, align 1
+ ret void
+}
+
+; CHECK-LABEL: sti32_a2:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0):p2align=1, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32_a2(i32 *%p, i32 %v) {
+ store i32 %v, i32* %p, align 2
+ ret void
+}
+
+; 4 is the default alignment for i32 so no attribute is needed.
+
+; CHECK-LABEL: sti32_a4:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32_a4(i32 *%p, i32 %v) {
+ store i32 %v, i32* %p, align 4
+ ret void
+}
+
+; The default alignment in LLVM is the same as the default alignment in wasm.
+
+; CHECK-LABEL: sti32:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32(i32 *%p, i32 %v) {
+ store i32 %v, i32* %p
+ ret void
+}
+
+; CHECK-LABEL: sti32_a8:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32_a8(i32 *%p, i32 %v) {
+ store i32 %v, i32* %p, align 8
+ ret void
+}
+
+; Truncating stores.
+
+; CHECK-LABEL: sti8_a1:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store8 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti8_a1(i8 *%p, i8 %v) {
+ store i8 %v, i8* %p, align 1
+ ret void
+}
+
+; CHECK-LABEL: sti8_a2:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store8 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti8_a2(i8 *%p, i8 %v) {
+ store i8 %v, i8* %p, align 2
+ ret void
+}
+
+; CHECK-LABEL: sti16_a1:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store16 $drop=, 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti16_a1(i16 *%p, i16 %v) {
+ store i16 %v, i16* %p, align 1
+ ret void
+}
+
+; CHECK-LABEL: sti16_a2:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store16 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti16_a2(i16 *%p, i16 %v) {
+ store i16 %v, i16* %p, align 2
+ ret void
+}
+
+; CHECK-LABEL: sti16_a4:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: i32.store16 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti16_a4(i16 *%p, i16 %v) {
+ store i16 %v, i16* %p, align 4
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/i32.ll b/test/CodeGen/WebAssembly/i32.ll
index 10d97ad9e6d1..a07dd02beced 100644
--- a/test/CodeGen/WebAssembly/i32.ll
+++ b/test/CodeGen/WebAssembly/i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic 32-bit integer operations assemble as expected.
@@ -188,3 +188,68 @@ define i32 @popcnt32(i32 %x) {
%a = call i32 @llvm.ctpop.i32(i32 %x)
ret i32 %a
}
+
+; CHECK-LABEL: eqz32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.eqz $push0=, $0{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+define i32 @eqz32(i32 %x) {
+ %a = icmp eq i32 %x, 0
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: rotl:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.rotl $push0=, $0, $1
+; CHECK-NEXT: return $pop0{{$}}
+define i32 @rotl(i32 %x, i32 %y) {
+ %z = sub i32 32, %y
+ %b = shl i32 %x, %y
+ %c = lshr i32 %x, %z
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+; CHECK-LABEL: masked_rotl:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.rotl $push0=, $0, $1
+; CHECK-NEXT: return $pop0{{$}}
+define i32 @masked_rotl(i32 %x, i32 %y) {
+ %a = and i32 %y, 31
+ %z = sub i32 32, %a
+ %b = shl i32 %x, %a
+ %c = lshr i32 %x, %z
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+; CHECK-LABEL: rotr:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.rotr $push0=, $0, $1
+; CHECK-NEXT: return $pop0{{$}}
+define i32 @rotr(i32 %x, i32 %y) {
+ %z = sub i32 32, %y
+ %b = lshr i32 %x, %y
+ %c = shl i32 %x, %z
+ %d = or i32 %b, %c
+ ret i32 %d
+}
+
+; CHECK-LABEL: masked_rotr:
+; CHECK-NEXT: .param i32, i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.rotr $push0=, $0, $1
+; CHECK-NEXT: return $pop0{{$}}
+define i32 @masked_rotr(i32 %x, i32 %y) {
+ %a = and i32 %y, 31
+ %z = sub i32 32, %a
+ %b = lshr i32 %x, %a
+ %c = shl i32 %x, %z
+ %d = or i32 %b, %c
+ ret i32 %d
+}
diff --git a/test/CodeGen/WebAssembly/i64-load-store-alignment.ll b/test/CodeGen/WebAssembly/i64-load-store-alignment.ll
new file mode 100644
index 000000000000..b2fb96290391
--- /dev/null
+++ b/test/CodeGen/WebAssembly/i64-load-store-alignment.ll
@@ -0,0 +1,325 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+
+; Test loads and stores with custom alignment values.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: ldi64_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi64_a1(i64 *%p) {
+ %v = load i64, i64* %p, align 1
+ ret i64 %v
+}
+
+; CHECK-LABEL: ldi64_a2:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0):p2align=1{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi64_a2(i64 *%p) {
+ %v = load i64, i64* %p, align 2
+ ret i64 %v
+}
+
+; CHECK-LABEL: ldi64_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi64_a4(i64 *%p) {
+ %v = load i64, i64* %p, align 4
+ ret i64 %v
+}
+
+; 8 is the default alignment for i64 so no attribute is needed.
+
+; CHECK-LABEL: ldi64_a8:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi64_a8(i64 *%p) {
+ %v = load i64, i64* %p, align 8
+ ret i64 %v
+}
+
+; The default alignment in LLVM is the same as the default alignment in wasm.
+
+; CHECK-LABEL: ldi64:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi64(i64 *%p) {
+ %v = load i64, i64* %p
+ ret i64 %v
+}
+
+; 16 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: ldi64_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi64_a16(i64 *%p) {
+ %v = load i64, i64* %p, align 16
+ ret i64 %v
+}
+
+; Extending loads.
+
+; CHECK-LABEL: ldi8_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load8_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi8_a1(i8 *%p) {
+ %v = load i8, i8* %p, align 1
+ %w = zext i8 %v to i64
+ ret i64 %w
+}
+
+; CHECK-LABEL: ldi8_a2:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load8_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi8_a2(i8 *%p) {
+ %v = load i8, i8* %p, align 2
+ %w = zext i8 %v to i64
+ ret i64 %w
+}
+
+; CHECK-LABEL: ldi16_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load16_u $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi16_a1(i16 *%p) {
+ %v = load i16, i16* %p, align 1
+ %w = zext i16 %v to i64
+ ret i64 %w
+}
+
+; CHECK-LABEL: ldi16_a2:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load16_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi16_a2(i16 *%p) {
+ %v = load i16, i16* %p, align 2
+ %w = zext i16 %v to i64
+ ret i64 %w
+}
+
+; CHECK-LABEL: ldi16_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load16_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi16_a4(i16 *%p) {
+ %v = load i16, i16* %p, align 4
+ %w = zext i16 %v to i64
+ ret i64 %w
+}
+
+; CHECK-LABEL: ldi32_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load32_u $push[[NUM:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi32_a1(i32 *%p) {
+ %v = load i32, i32* %p, align 1
+ %w = zext i32 %v to i64
+ ret i64 %w
+}
+
+; CHECK-LABEL: ldi32_a2:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load32_u $push[[NUM:[0-9]+]]=, 0($0):p2align=1{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi32_a2(i32 *%p) {
+ %v = load i32, i32* %p, align 2
+ %w = zext i32 %v to i64
+ ret i64 %w
+}
+
+; CHECK-LABEL: ldi32_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load32_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi32_a4(i32 *%p) {
+ %v = load i32, i32* %p, align 4
+ %w = zext i32 %v to i64
+ ret i64 %w
+}
+
+; CHECK-LABEL: ldi32_a8:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.load32_u $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define i64 @ldi32_a8(i32 *%p) {
+ %v = load i32, i32* %p, align 8
+ %w = zext i32 %v to i64
+ ret i64 %w
+}
+
+; Stores.
+
+; CHECK-LABEL: sti64_a1:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store $drop=, 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti64_a1(i64 *%p, i64 %v) {
+ store i64 %v, i64* %p, align 1
+ ret void
+}
+
+; CHECK-LABEL: sti64_a2:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store $drop=, 0($0):p2align=1, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti64_a2(i64 *%p, i64 %v) {
+ store i64 %v, i64* %p, align 2
+ ret void
+}
+
+; CHECK-LABEL: sti64_a4:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store $drop=, 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti64_a4(i64 *%p, i64 %v) {
+ store i64 %v, i64* %p, align 4
+ ret void
+}
+
+; 8 is the default alignment for i64 so no attribute is needed.
+
+; CHECK-LABEL: sti64_a8:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti64_a8(i64 *%p, i64 %v) {
+ store i64 %v, i64* %p, align 8
+ ret void
+}
+
+; The default alignment in LLVM is the same as the default alignment in wasm.
+
+; CHECK-LABEL: sti64:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti64(i64 *%p, i64 %v) {
+ store i64 %v, i64* %p
+ ret void
+}
+
+; CHECK-LABEL: sti64_a16:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti64_a16(i64 *%p, i64 %v) {
+ store i64 %v, i64* %p, align 16
+ ret void
+}
+
+; Truncating stores.
+
+; CHECK-LABEL: sti8_a1:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store8 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti8_a1(i8 *%p, i64 %w) {
+ %v = trunc i64 %w to i8
+ store i8 %v, i8* %p, align 1
+ ret void
+}
+
+; CHECK-LABEL: sti8_a2:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store8 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti8_a2(i8 *%p, i64 %w) {
+ %v = trunc i64 %w to i8
+ store i8 %v, i8* %p, align 2
+ ret void
+}
+
+; CHECK-LABEL: sti16_a1:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store16 $drop=, 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti16_a1(i16 *%p, i64 %w) {
+ %v = trunc i64 %w to i16
+ store i16 %v, i16* %p, align 1
+ ret void
+}
+
+; CHECK-LABEL: sti16_a2:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store16 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti16_a2(i16 *%p, i64 %w) {
+ %v = trunc i64 %w to i16
+ store i16 %v, i16* %p, align 2
+ ret void
+}
+
+; CHECK-LABEL: sti16_a4:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store16 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti16_a4(i16 *%p, i64 %w) {
+ %v = trunc i64 %w to i16
+ store i16 %v, i16* %p, align 4
+ ret void
+}
+
+; CHECK-LABEL: sti32_a1:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store32 $drop=, 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32_a1(i32 *%p, i64 %w) {
+ %v = trunc i64 %w to i32
+ store i32 %v, i32* %p, align 1
+ ret void
+}
+
+; CHECK-LABEL: sti32_a2:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store32 $drop=, 0($0):p2align=1, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32_a2(i32 *%p, i64 %w) {
+ %v = trunc i64 %w to i32
+ store i32 %v, i32* %p, align 2
+ ret void
+}
+
+; CHECK-LABEL: sti32_a4:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store32 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32_a4(i32 *%p, i64 %w) {
+ %v = trunc i64 %w to i32
+ store i32 %v, i32* %p, align 4
+ ret void
+}
+
+; CHECK-LABEL: sti32_a8:
+; CHECK-NEXT: .param i32, i64{{$}}
+; CHECK-NEXT: i64.store32 $drop=, 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @sti32_a8(i32 *%p, i64 %w) {
+ %v = trunc i64 %w to i32
+ store i32 %v, i32* %p, align 8
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/i64.ll b/test/CodeGen/WebAssembly/i64.ll
index 6dd46a91fad0..93e32bfc0e1d 100644
--- a/test/CodeGen/WebAssembly/i64.ll
+++ b/test/CodeGen/WebAssembly/i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic 64-bit integer operations assemble as expected.
@@ -188,3 +188,68 @@ define i64 @popcnt64(i64 %x) {
%a = call i64 @llvm.ctpop.i64(i64 %x)
ret i64 %a
}
+
+; CHECK-LABEL: eqz64:
+; CHECK-NEXT: .param i64{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i64.eqz $push0=, $0{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+define i32 @eqz64(i64 %x) {
+ %a = icmp eq i64 %x, 0
+ %b = zext i1 %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: rotl:
+; CHECK-NEXT: .param i64, i64{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.rotl $push0=, $0, $1
+; CHECK-NEXT: return $pop0{{$}}
+define i64 @rotl(i64 %x, i64 %y) {
+ %z = sub i64 64, %y
+ %b = shl i64 %x, %y
+ %c = lshr i64 %x, %z
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+; CHECK-LABEL: masked_rotl:
+; CHECK-NEXT: .param i64, i64{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.rotl $push0=, $0, $1
+; CHECK-NEXT: return $pop0{{$}}
+define i64 @masked_rotl(i64 %x, i64 %y) {
+ %a = and i64 %y, 63
+ %z = sub i64 64, %a
+ %b = shl i64 %x, %a
+ %c = lshr i64 %x, %z
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+; CHECK-LABEL: rotr:
+; CHECK-NEXT: .param i64, i64{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.rotr $push0=, $0, $1
+; CHECK-NEXT: return $pop0{{$}}
+define i64 @rotr(i64 %x, i64 %y) {
+ %z = sub i64 64, %y
+ %b = lshr i64 %x, %y
+ %c = shl i64 %x, %z
+ %d = or i64 %b, %c
+ ret i64 %d
+}
+
+; CHECK-LABEL: masked_rotr:
+; CHECK-NEXT: .param i64, i64{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.rotr $push0=, $0, $1
+; CHECK-NEXT: return $pop0{{$}}
+define i64 @masked_rotr(i64 %x, i64 %y) {
+ %a = and i64 %y, 63
+ %z = sub i64 64, %a
+ %b = lshr i64 %x, %a
+ %c = shl i64 %x, %z
+ %d = or i64 %b, %c
+ ret i64 %d
+}
diff --git a/test/CodeGen/WebAssembly/immediates.ll b/test/CodeGen/WebAssembly/immediates.ll
index 735b386b4fc0..3d11f9410a79 100644
--- a/test/CodeGen/WebAssembly/immediates.ll
+++ b/test/CodeGen/WebAssembly/immediates.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic immediates assemble as expected.
@@ -133,6 +133,25 @@ define float @neginf_f32() {
ret float 0xFFF0000000000000
}
+; CHECK-LABEL: custom_nan_f32:
+; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, -nan:0x6bcdef{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define float @custom_nan_f32() {
+ ret float 0xFFFD79BDE0000000
+}
+
+; TODO: LLVM's MC layer stores f32 operands as host doubles, requiring a
+; conversion, so the bits of the NaN are not fully preserved.
+
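+; Concretely, the signaling payload 0x2bcdef picks up the quiet bit
+; (bit 22, 0x400000) in that round trip: 0x2bcdef | 0x400000 = 0x6bcdef,
+; so the same -nan:0x6bcdef value is checked below as for custom_nan_f32.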
+; CHECK-LABEL: custom_nans_f32:
+; CHECK-NEXT: .result f32{{$}}
+; CHECK-NEXT: f32.const $push[[NUM:[0-9]+]]=, -nan:0x6bcdef{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define float @custom_nans_f32() {
+ ret float 0xFFF579BDE0000000
+}
+
; CHECK-LABEL: negzero_f64:
; CHECK-NEXT: .result f64{{$}}
; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -0x0p0{{$}}
@@ -196,3 +215,19 @@ define double @inf_f64() {
define double @neginf_f64() {
ret double 0xFFF0000000000000
}
+
+; CHECK-LABEL: custom_nan_f64:
+; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -nan:0xabcdef0123456{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define double @custom_nan_f64() {
+ ret double 0xFFFABCDEF0123456
+}
+
+; CHECK-LABEL: custom_nans_f64:
+; CHECK-NEXT: .result f64{{$}}
+; CHECK-NEXT: f64.const $push[[NUM:[0-9]+]]=, -nan:0x2bcdef0123456{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+define double @custom_nans_f64() {
+ ret double 0xFFF2BCDEF0123456
+}
diff --git a/test/CodeGen/WebAssembly/indirect-import.ll b/test/CodeGen/WebAssembly/indirect-import.ll
new file mode 100644
index 000000000000..1bde65bcbbba
--- /dev/null
+++ b/test/CodeGen/WebAssembly/indirect-import.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel | FileCheck %s
+
+; ModuleID = 'test/dot_s/indirect-import.c'
+source_filename = "test/dot_s/indirect-import.c"
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32"
+
+%struct.big = type { float, double, i32 }
+
+; Function Attrs: nounwind
+; CHECK: bar:
+define hidden i32 @bar() #0 {
+entry:
+ %fd = alloca float (double)*, align 4
+ %vj = alloca void (i64)*, align 4
+ %v = alloca void ()*, align 4
+ %ijidf = alloca i32 (i64, i32, double, float)*, align 4
+ %vs = alloca void (%struct.big*)*, align 4
+ %s = alloca void (%struct.big*)*, align 4
+
+; CHECK: i32.const {{.+}}=, extern_fd@FUNCTION
+ store float (double)* @extern_fd, float (double)** %fd, align 4
+; CHECK: i32.const {{.+}}=, extern_vj@FUNCTION
+ store void (i64)* @extern_vj, void (i64)** %vj, align 4
+ %0 = load void (i64)*, void (i64)** %vj, align 4
+ call void %0(i64 1)
+
+; CHECK: i32.const {{.+}}=, extern_v@FUNCTION
+ store void ()* @extern_v, void ()** %v, align 4
+ %1 = load void ()*, void ()** %v, align 4
+ call void %1()
+
+; CHECK: i32.const {{.+}}=, extern_ijidf@FUNCTION
+ store i32 (i64, i32, double, float)* @extern_ijidf, i32 (i64, i32, double, float)** %ijidf, align 4
+ %2 = load i32 (i64, i32, double, float)*, i32 (i64, i32, double, float)** %ijidf, align 4
+ %call = call i32 %2(i64 1, i32 2, double 3.000000e+00, float 4.000000e+00)
+
+; CHECK: i32.const {{.+}}=, extern_struct@FUNCTION
+ store void (%struct.big*)* @extern_struct, void (%struct.big*)** %vs, align 4
+
+; CHECK: i32.const {{.+}}=, extern_sret@FUNCTION
+ store void (%struct.big*)* @extern_sret, void (%struct.big*)** %s, align 4
+ %3 = load float (double)*, float (double)** %fd, align 4
+ %4 = ptrtoint float (double)* %3 to i32
+ ret i32 %4
+}
+
+declare float @extern_fd(double) #1
+
+declare void @extern_vj(i64) #1
+
+declare void @extern_v() #1
+
+declare i32 @extern_ijidf(i64, i32, double, float) #1
+
+declare void @extern_struct(%struct.big* byval align 8) #1
+
+declare void @extern_sret(%struct.big* sret) #1
+
+declare i128 @extern_i128ret(i64) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
+; CHECK: .functype extern_fd, f32, f64
+; CHECK: .functype extern_vj, void, i64
+; CHECK: .functype extern_v, void
+; CHECK: .functype extern_ijidf, i32, i64, i32, f64, f32
+; CHECK: .functype extern_struct, void, i32
+; CHECK: .functype extern_sret, void, i32
+; CHECK: .functype extern_i128ret, void, i32, i64
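+; Note that extern_i128ret's i128 return appears as "void, i32, i64": the
+; i128 result is returned indirectly through an extra i32 pointer
+; parameter, and the i64 argument follows it.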
diff --git a/test/CodeGen/WebAssembly/inline-asm.ll b/test/CodeGen/WebAssembly/inline-asm.ll
index f35042e64f86..d36c32b546d3 100644
--- a/test/CodeGen/WebAssembly/inline-asm.ll
+++ b/test/CodeGen/WebAssembly/inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -no-integrated-as | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -no-integrated-as | FileCheck %s
; Test basic inline assembly. Pass -no-integrated-as since these aren't
; actually valid assembly syntax.
@@ -59,7 +59,7 @@ entry:
; CHECK-LABEL: X_i16:
; CHECK: foo $1{{$}}
-; CHECK: i32.store16 $discard=, 0($0), $1{{$}}
+; CHECK: i32.store16 $drop=, 0($0), $1{{$}}
define void @X_i16(i16 * %t) {
call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16* %t)
ret void
@@ -67,7 +67,7 @@ define void @X_i16(i16 * %t) {
; CHECK-LABEL: X_ptr:
; CHECK: foo $1{{$}}
-; CHECK: i32.store $discard=, 0($0), $1{{$}}
+; CHECK: i32.store $drop=, 0($0), $1{{$}}
define void @X_ptr(i16 ** %t) {
call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16** %t)
ret void
diff --git a/test/CodeGen/WebAssembly/irreducible-cfg.ll b/test/CodeGen/WebAssembly/irreducible-cfg.ll
new file mode 100644
index 000000000000..8fe7d10c5f31
--- /dev/null
+++ b/test/CodeGen/WebAssembly/irreducible-cfg.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-block-placement | FileCheck %s
+
+; Test irreducible CFG handling.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; A simple loop with two entries.
+
+; CHECK-LABEL: test0:
+; CHECK: f64.load
+; CHECK: i32.const $[[REG:[^,]+]]=, 0{{$}}
+; CHECK: br_table $[[REG]],
+define void @test0(double* %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+bb:
+ %tmp = icmp eq i32 %arg2, 0
+ br i1 %tmp, label %bb6, label %bb3
+
+bb3:
+ %tmp4 = getelementptr double, double* %arg, i32 %arg3
+ %tmp5 = load double, double* %tmp4, align 4
+ br label %bb13
+
+bb6:
+ %tmp7 = phi i32 [ %tmp18, %bb13 ], [ 0, %bb ]
+ %tmp8 = icmp slt i32 %tmp7, %arg1
+ br i1 %tmp8, label %bb9, label %bb19
+
+bb9:
+ %tmp10 = getelementptr double, double* %arg, i32 %tmp7
+ %tmp11 = load double, double* %tmp10, align 4
+ %tmp12 = fmul double %tmp11, 2.300000e+00
+ store double %tmp12, double* %tmp10, align 4
+ br label %bb13
+
+bb13:
+ %tmp14 = phi double [ %tmp5, %bb3 ], [ %tmp12, %bb9 ]
+ %tmp15 = phi i32 [ undef, %bb3 ], [ %tmp7, %bb9 ]
+ %tmp16 = getelementptr double, double* %arg, i32 %tmp15
+ %tmp17 = fadd double %tmp14, 1.300000e+00
+ store double %tmp17, double* %tmp16, align 4
+ %tmp18 = add nsw i32 %tmp15, 1
+ br label %bb6
+
+bb19:
+ ret void
+}
+
+; A simple loop with two entries and an inner natural loop.
+
+; CHECK-LABEL: test1:
+; CHECK: f64.load
+; CHECK: i32.const $[[REG:[^,]+]]=, 0{{$}}
+; CHECK: br_table $[[REG]],
+define void @test1(double* %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+bb:
+ %tmp = icmp eq i32 %arg2, 0
+ br i1 %tmp, label %bb6, label %bb3
+
+bb3:
+ %tmp4 = getelementptr double, double* %arg, i32 %arg3
+ %tmp5 = load double, double* %tmp4, align 4
+ br label %bb13
+
+bb6:
+ %tmp7 = phi i32 [ %tmp18, %bb13 ], [ 0, %bb ]
+ %tmp8 = icmp slt i32 %tmp7, %arg1
+ br i1 %tmp8, label %bb9, label %bb19
+
+bb9:
+ %tmp10 = getelementptr double, double* %arg, i32 %tmp7
+ %tmp11 = load double, double* %tmp10, align 4
+ %tmp12 = fmul double %tmp11, 2.300000e+00
+ store double %tmp12, double* %tmp10, align 4
+ br label %bb10
+
+bb10:
+ %p = phi i32 [ 0, %bb9 ], [ %pn, %bb10 ]
+ %pn = add i32 %p, 1
+ %c = icmp slt i32 %pn, 256
+ br i1 %c, label %bb10, label %bb13
+
+bb13:
+ %tmp14 = phi double [ %tmp5, %bb3 ], [ %tmp12, %bb10 ]
+ %tmp15 = phi i32 [ undef, %bb3 ], [ %tmp7, %bb10 ]
+ %tmp16 = getelementptr double, double* %arg, i32 %tmp15
+ %tmp17 = fadd double %tmp14, 1.300000e+00
+ store double %tmp17, double* %tmp16, align 4
+ %tmp18 = add nsw i32 %tmp15, 1
+ br label %bb6
+
+bb19:
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/legalize.ll b/test/CodeGen/WebAssembly/legalize.ll
index 5feb2e8c8c75..5cbfb8ace9ed 100644
--- a/test/CodeGen/WebAssembly/legalize.ll
+++ b/test/CodeGen/WebAssembly/legalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test various types and operators that need to be legalized.
diff --git a/test/CodeGen/WebAssembly/load-ext.ll b/test/CodeGen/WebAssembly/load-ext.ll
index d52df3361a38..48a7ce7c4bd2 100644
--- a/test/CodeGen/WebAssembly/load-ext.ll
+++ b/test/CodeGen/WebAssembly/load-ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that extending loads are assembled properly.
diff --git a/test/CodeGen/WebAssembly/load-store-i1.ll b/test/CodeGen/WebAssembly/load-store-i1.ll
index 47e2e8cb254f..2a2318fde10e 100644
--- a/test/CodeGen/WebAssembly/load-store-i1.ll
+++ b/test/CodeGen/WebAssembly/load-store-i1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that i1 extending loads and truncating stores are assembled properly.
@@ -15,10 +15,11 @@ define i32 @load_u_i1_i32(i1* %p) {
}
; CHECK-LABEL: load_s_i1_i32:
-; CHECK: i32.const $[[NUM1:[0-9]+]]=, 31{{$}}
-; CHECK-NEXT: i32.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}}
-; CHECK-NEXT: shl $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $[[NUM1]]{{$}}
-; CHECK-NEXT: shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $[[NUM1]]{{$}}
+; CHECK: i32.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i32.const $push[[NUM1:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: shl $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM4:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $pop[[NUM4]]{{$}}
; CHECK-NEXT: return $pop[[NUM3]]{{$}}
define i32 @load_s_i1_i32(i1* %p) {
%v = load i1, i1* %p
@@ -36,10 +37,11 @@ define i64 @load_u_i1_i64(i1* %p) {
}
; CHECK-LABEL: load_s_i1_i64:
-; CHECK: i64.const $[[NUM1:[0-9]+]]=, 63{{$}}
-; CHECK-NEXT: i64.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}}
-; CHECK-NEXT: shl $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $[[NUM1]]{{$}}
-; CHECK-NEXT: shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $[[NUM1]]{{$}}
+; CHECK: i64.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i64.const $push[[NUM1:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: shl $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
+; CHECK-NEXT: i64.const $push[[NUM4:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $pop[[NUM4]]{{$}}
; CHECK-NEXT: return $pop[[NUM3]]{{$}}
define i64 @load_s_i1_i64(i1* %p) {
%v = load i1, i1* %p
@@ -50,7 +52,7 @@ define i64 @load_s_i1_i64(i1* %p) {
; CHECK-LABEL: store_i32_i1:
; CHECK: i32.const $push[[NUM0:[0-9]+]]=, 1{{$}}
; CHECK-NEXT: i32.and $push[[NUM1:[0-9]+]]=, $1, $pop[[NUM0]]{{$}}
-; CHECK-NEXT: i32.store8 $discard=, 0($0), $pop[[NUM1]]{{$}}
+; CHECK-NEXT: i32.store8 $drop=, 0($0), $pop[[NUM1]]{{$}}
define void @store_i32_i1(i1* %p, i32 %v) {
%t = trunc i32 %v to i1
store i1 %t, i1* %p
@@ -60,7 +62,7 @@ define void @store_i32_i1(i1* %p, i32 %v) {
; CHECK-LABEL: store_i64_i1:
; CHECK: i64.const $push[[NUM0:[0-9]+]]=, 1{{$}}
; CHECK-NEXT: i64.and $push[[NUM1:[0-9]+]]=, $1, $pop[[NUM0]]{{$}}
-; CHECK-NEXT: i64.store8 $discard=, 0($0), $pop[[NUM1]]{{$}}
+; CHECK-NEXT: i64.store8 $drop=, 0($0), $pop[[NUM1]]{{$}}
define void @store_i64_i1(i1* %p, i64 %v) {
%t = trunc i64 %v to i1
store i1 %t, i1* %p
diff --git a/test/CodeGen/WebAssembly/load.ll b/test/CodeGen/WebAssembly/load.ll
index 243fa9d50ad6..a8e174e914e1 100644
--- a/test/CodeGen/WebAssembly/load.ll
+++ b/test/CodeGen/WebAssembly/load.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -fast-isel -fast-isel-abort=1 | FileCheck %s
; Test that basic loads are assembled properly.
diff --git a/test/CodeGen/WebAssembly/loop-idiom.ll b/test/CodeGen/WebAssembly/loop-idiom.ll
deleted file mode 100644
index 2a233c406900..000000000000
--- a/test/CodeGen/WebAssembly/loop-idiom.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: opt -loop-idiom -S < %s -march=wasm32 | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
-
-
-; Make sure loop-idiom doesn't create memcpy or memset. These aren't well
-; supported in WebAssembly for now.
-;
-; TODO Check the patterns are recognized once memcpy / memset are supported.
-
-; CHECK-LABEL: @cpy(
-; CHECK-NOT: llvm.memcpy
-; CHECK: load
-; CHECK: store
-define void @cpy(i64 %Size) {
-bb.nph:
- %Base = alloca i8, i32 10000
- %Dest = alloca i8, i32 10000
- br label %for.body
-
-for.body:
- %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
- %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
- %DestI = getelementptr i8, i8* %Dest, i64 %indvar
- %V = load i8, i8* %I.0.014, align 1
- store i8 %V, i8* %DestI, align 1
- %indvar.next = add i64 %indvar, 1
- %exitcond = icmp eq i64 %indvar.next, %Size
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-; CHECK-LABEL: @set(
-; CHECK-NOT: llvm.memset
-; CHECK: store
-define void @set(i8* %Base, i64 %Size) {
-bb.nph:
- br label %for.body
-
-for.body:
- %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
- %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
- store i8 0, i8* %I.0.014, align 1
- %indvar.next = add i64 %indvar, 1
- %exitcond = icmp eq i64 %indvar.next, %Size
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
diff --git a/test/CodeGen/WebAssembly/mem-intrinsics.ll b/test/CodeGen/WebAssembly/mem-intrinsics.ll
new file mode 100644
index 000000000000..71787feb77dc
--- /dev/null
+++ b/test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -0,0 +1,140 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+
+; Test memcpy, memmove, and memset intrinsics.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
+
+; Test that return values are optimized.
+
+; CHECK-LABEL: copy_yes:
+; CHECK: i32.call $push0=, memcpy@FUNCTION, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+define i8* @copy_yes(i8* %dst, i8* %src, i32 %len) {
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %len, i32 1, i1 false)
+ ret i8* %dst
+}
+
+; CHECK-LABEL: copy_no:
+; CHECK: i32.call $drop=, memcpy@FUNCTION, $0, $1, $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @copy_no(i8* %dst, i8* %src, i32 %len) {
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %len, i32 1, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: move_yes:
+; CHECK: i32.call $push0=, memmove@FUNCTION, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+define i8* @move_yes(i8* %dst, i8* %src, i32 %len) {
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %len, i32 1, i1 false)
+ ret i8* %dst
+}
+
+; CHECK-LABEL: move_no:
+; CHECK: i32.call $drop=, memmove@FUNCTION, $0, $1, $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @move_no(i8* %dst, i8* %src, i32 %len) {
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %len, i32 1, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: set_yes:
+; CHECK: i32.call $push0=, memset@FUNCTION, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+define i8* @set_yes(i8* %dst, i8 %src, i32 %len) {
+ call void @llvm.memset.p0i8.i32(i8* %dst, i8 %src, i32 %len, i32 1, i1 false)
+ ret i8* %dst
+}
+
+; CHECK-LABEL: set_no:
+; CHECK: i32.call $drop=, memset@FUNCTION, $0, $1, $2{{$}}
+; CHECK-NEXT: return{{$}}
+define void @set_no(i8* %dst, i8 %src, i32 %len) {
+ call void @llvm.memset.p0i8.i32(i8* %dst, i8 %src, i32 %len, i32 1, i1 false)
+ ret void
+}
+
+
+; CHECK-LABEL: frame_index:
+; CHECK: i32.call $drop=, memset@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK: i32.call $push{{[0-9]+}}=, memset@FUNCTION, ${{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK: return{{$}}
+define void @frame_index() {
+entry:
+ %a = alloca [2048 x i8], align 16
+ %b = alloca [2048 x i8], align 16
+ %0 = getelementptr inbounds [2048 x i8], [2048 x i8]* %a, i32 0, i32 0
+ %1 = getelementptr inbounds [2048 x i8], [2048 x i8]* %b, i32 0, i32 0
+ call void @llvm.memset.p0i8.i32(i8* %0, i8 256, i32 1024, i32 16, i1 false)
+ call void @llvm.memset.p0i8.i32(i8* %1, i8 256, i32 1024, i32 16, i1 false)
+ ret void
+}
+
+; If the result value of memset doesn't get stackified, it should be marked
+; $drop. Note that we use a call to prevent tail dup so that we can test
+; this specific functionality.
+
+; CHECK-LABEL: drop_result:
+; CHECK: i32.call $drop=, memset@FUNCTION, $0, $1, $2
+declare i8* @def()
+declare void @block_tail_dup()
+define i8* @drop_result(i8* %arg, i8 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
+bb:
+ %tmp = icmp eq i32 %arg3, 0
+ br i1 %tmp, label %bb5, label %bb9
+
+bb5:
+ %tmp6 = icmp eq i32 %arg4, 0
+ br i1 %tmp6, label %bb7, label %bb8
+
+bb7:
+ call void @llvm.memset.p0i8.i32(i8* %arg, i8 %arg1, i32 %arg2, i32 1, i1 false)
+ br label %bb11
+
+bb8:
+ br label %bb11
+
+bb9:
+ %tmp10 = call i8* @def()
+ br label %bb11
+
+bb11:
+ %tmp12 = phi i8* [ %arg, %bb7 ], [ %arg, %bb8 ], [ %tmp10, %bb9 ]
+ call void @block_tail_dup()
+ ret i8* %tmp12
+}
+
+; This is the same as drop_result, except we let tail dup happen, so the
+; result of the memset *is* stackified.
+
+; CHECK-LABEL: tail_dup_to_reuse_result:
+; CHECK: i32.call $push{{[0-9]+}}=, memset@FUNCTION, $0, $1, $2
+define i8* @tail_dup_to_reuse_result(i8* %arg, i8 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
+bb:
+ %tmp = icmp eq i32 %arg3, 0
+ br i1 %tmp, label %bb5, label %bb9
+
+bb5:
+ %tmp6 = icmp eq i32 %arg4, 0
+ br i1 %tmp6, label %bb7, label %bb8
+
+bb7:
+ call void @llvm.memset.p0i8.i32(i8* %arg, i8 %arg1, i32 %arg2, i32 1, i1 false)
+ br label %bb11
+
+bb8:
+ br label %bb11
+
+bb9:
+ %tmp10 = call i8* @def()
+ br label %bb11
+
+bb11:
+ %tmp12 = phi i8* [ %arg, %bb7 ], [ %arg, %bb8 ], [ %tmp10, %bb9 ]
+ ret i8* %tmp12
+}
diff --git a/test/CodeGen/WebAssembly/memory-addr32.ll b/test/CodeGen/WebAssembly/memory-addr32.ll
index e6c15633fd63..583201b15f99 100644
--- a/test/CodeGen/WebAssembly/memory-addr32.ll
+++ b/test/CodeGen/WebAssembly/memory-addr32.ll
@@ -1,19 +1,19 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic memory operations assemble as expected with 32-bit addresses.
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
-declare i32 @llvm.wasm.memory.size.i32() nounwind readonly
+declare i32 @llvm.wasm.current.memory.i32() nounwind readonly
declare void @llvm.wasm.grow.memory.i32(i32) nounwind
-; CHECK-LABEL: memory_size:
+; CHECK-LABEL: current_memory:
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: memory_size $push0={{$}}
+; CHECK-NEXT: current_memory $push0={{$}}
; CHECK-NEXT: return $pop0{{$}}
-define i32 @memory_size() {
- %a = call i32 @llvm.wasm.memory.size.i32()
+define i32 @current_memory() {
+ %a = call i32 @llvm.wasm.current.memory.i32()
ret i32 %a
}
diff --git a/test/CodeGen/WebAssembly/memory-addr64.ll b/test/CodeGen/WebAssembly/memory-addr64.ll
index d504c277f306..dc6da6121718 100644
--- a/test/CodeGen/WebAssembly/memory-addr64.ll
+++ b/test/CodeGen/WebAssembly/memory-addr64.ll
@@ -1,19 +1,19 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that basic memory operations assemble as expected with 64-bit addresses.
target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
target triple = "wasm64-unknown-unknown"
-declare i64 @llvm.wasm.memory.size.i64() nounwind readonly
+declare i64 @llvm.wasm.current.memory.i64() nounwind readonly
declare void @llvm.wasm.grow.memory.i64(i64) nounwind
-; CHECK-LABEL: memory_size:
+; CHECK-LABEL: current_memory:
; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: memory_size $push0={{$}}
+; CHECK-NEXT: current_memory $push0={{$}}
; CHECK-NEXT: return $pop0{{$}}
-define i64 @memory_size() {
- %a = call i64 @llvm.wasm.memory.size.i64()
+define i64 @current_memory() {
+ %a = call i64 @llvm.wasm.current.memory.i64()
ret i64 %a
}
diff --git a/test/CodeGen/WebAssembly/non-executable-stack.ll b/test/CodeGen/WebAssembly/non-executable-stack.ll
new file mode 100644
index 000000000000..b81063724e9c
--- /dev/null
+++ b/test/CodeGen/WebAssembly/non-executable-stack.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+; Test that we don't emit anything declaring a non-executable stack,
+; because wasm's stack is always non-executable.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-NOT: .note.GNU-stack
diff --git a/test/CodeGen/WebAssembly/offset-folding.ll b/test/CodeGen/WebAssembly/offset-folding.ll
index 159a25eba358..863549fc20fc 100644
--- a/test/CodeGen/WebAssembly/offset-folding.ll
+++ b/test/CodeGen/WebAssembly/offset-folding.ll
@@ -5,10 +5,7 @@
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
-; FIXME: make this 'external' and make sure it still works. WebAssembly
-; currently only supports linking single files, so 'external' makes
-; little sense.
-@x = global [0 x i32] zeroinitializer
+@x = external global [0 x i32]
@y = global [50 x i32] zeroinitializer
; Test basic constant offsets of both defined and external symbols.
@@ -46,3 +43,21 @@ define i32* @test2() {
define i32* @test3() {
ret i32* getelementptr ([50 x i32], [50 x i32]* @y, i32 0, i32 0)
}
+
+; Test negative offsets.
+
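+; (-47 i32 elements is -47 * 4 = -188 bytes, hence the x-188 and y-188
+; operands below.)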
+; CHECK-LABEL: test4:
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, x-188{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+define i32* @test4() {
+ ret i32* getelementptr ([0 x i32], [0 x i32]* @x, i32 0, i32 -47)
+}
+
+; CHECK-LABEL: test5:
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.const $push0=, y-188{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+define i32* @test5() {
+ ret i32* getelementptr ([50 x i32], [50 x i32]* @y, i32 0, i32 -47)
+}
diff --git a/test/CodeGen/WebAssembly/offset.ll b/test/CodeGen/WebAssembly/offset.ll
index 828f40206a96..fcd8b49758ed 100644
--- a/test/CodeGen/WebAssembly/offset.ll
+++ b/test/CodeGen/WebAssembly/offset.ll
@@ -125,10 +125,21 @@ define i64 @load_i64_with_unfolded_gep_offset(i64* %p) {
ret i64 %t
}
+; CHECK-LABEL: load_i32_with_folded_or_offset:
+; CHECK: i32.load8_s $push{{[0-9]+}}=, 2($pop{{[0-9]+}}){{$}}
+define i32 @load_i32_with_folded_or_offset(i32 %x) {
+ %and = and i32 %x, -4
+ %t0 = inttoptr i32 %and to i8*
+ %arrayidx = getelementptr inbounds i8, i8* %t0, i32 2
+ %t1 = load i8, i8* %arrayidx, align 1
+ %conv = sext i8 %t1 to i32
+ ret i32 %conv
+}
+
; Same as above but with store.
; CHECK-LABEL: store_i32_with_folded_offset:
-; CHECK: i32.store $discard=, 24($0), $pop0{{$}}
+; CHECK: i32.store $drop=, 24($0), $pop0{{$}}
define void @store_i32_with_folded_offset(i32* %p) {
%q = ptrtoint i32* %p to i32
%r = add nuw i32 %q, 24
@@ -140,7 +151,7 @@ define void @store_i32_with_folded_offset(i32* %p) {
; Same as above but with store.
; CHECK-LABEL: store_i32_with_folded_gep_offset:
-; CHECK: i32.store $discard=, 24($0), $pop0{{$}}
+; CHECK: i32.store $drop=, 24($0), $pop0{{$}}
define void @store_i32_with_folded_gep_offset(i32* %p) {
%s = getelementptr inbounds i32, i32* %p, i32 6
store i32 0, i32* %s
@@ -152,7 +163,7 @@ define void @store_i32_with_folded_gep_offset(i32* %p) {
; CHECK-LABEL: store_i32_with_unfolded_gep_negative_offset:
; CHECK: i32.const $push0=, -24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
-; CHECK: i32.store $discard=, 0($pop1), $pop2{{$}}
+; CHECK: i32.store $drop=, 0($pop1), $pop2{{$}}
define void @store_i32_with_unfolded_gep_negative_offset(i32* %p) {
%s = getelementptr inbounds i32, i32* %p, i32 -6
store i32 0, i32* %s
@@ -164,7 +175,7 @@ define void @store_i32_with_unfolded_gep_negative_offset(i32* %p) {
; CHECK-LABEL: store_i32_with_unfolded_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
-; CHECK: i32.store $discard=, 0($pop1), $pop2{{$}}
+; CHECK: i32.store $drop=, 0($pop1), $pop2{{$}}
define void @store_i32_with_unfolded_offset(i32* %p) {
%q = ptrtoint i32* %p to i32
%r = add nsw i32 %q, 24
@@ -178,7 +189,7 @@ define void @store_i32_with_unfolded_offset(i32* %p) {
; CHECK-LABEL: store_i32_with_unfolded_gep_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
-; CHECK: i32.store $discard=, 0($pop1), $pop2{{$}}
+; CHECK: i32.store $drop=, 0($pop1), $pop2{{$}}
define void @store_i32_with_unfolded_gep_offset(i32* %p) {
%s = getelementptr i32, i32* %p, i32 6
store i32 0, i32* %s
@@ -188,7 +199,7 @@ define void @store_i32_with_unfolded_gep_offset(i32* %p) {
; Same as above but with store with i64.
; CHECK-LABEL: store_i64_with_folded_offset:
-; CHECK: i64.store $discard=, 24($0), $pop0{{$}}
+; CHECK: i64.store $drop=, 24($0), $pop0{{$}}
define void @store_i64_with_folded_offset(i64* %p) {
%q = ptrtoint i64* %p to i32
%r = add nuw i32 %q, 24
@@ -200,7 +211,7 @@ define void @store_i64_with_folded_offset(i64* %p) {
; Same as above but with store with i64.
; CHECK-LABEL: store_i64_with_folded_gep_offset:
-; CHECK: i64.store $discard=, 24($0), $pop0{{$}}
+; CHECK: i64.store $drop=, 24($0), $pop0{{$}}
define void @store_i64_with_folded_gep_offset(i64* %p) {
%s = getelementptr inbounds i64, i64* %p, i32 3
store i64 0, i64* %s
@@ -212,7 +223,7 @@ define void @store_i64_with_folded_gep_offset(i64* %p) {
; CHECK-LABEL: store_i64_with_unfolded_gep_negative_offset:
; CHECK: i32.const $push0=, -24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
-; CHECK: i64.store $discard=, 0($pop1), $pop2{{$}}
+; CHECK: i64.store $drop=, 0($pop1), $pop2{{$}}
define void @store_i64_with_unfolded_gep_negative_offset(i64* %p) {
%s = getelementptr inbounds i64, i64* %p, i32 -3
store i64 0, i64* %s
@@ -224,7 +235,7 @@ define void @store_i64_with_unfolded_gep_negative_offset(i64* %p) {
; CHECK-LABEL: store_i64_with_unfolded_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
-; CHECK: i64.store $discard=, 0($pop1), $pop2{{$}}
+; CHECK: i64.store $drop=, 0($pop1), $pop2{{$}}
define void @store_i64_with_unfolded_offset(i64* %p) {
%q = ptrtoint i64* %p to i32
%r = add nsw i32 %q, 24
@@ -238,13 +249,23 @@ define void @store_i64_with_unfolded_offset(i64* %p) {
; CHECK-LABEL: store_i64_with_unfolded_gep_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
-; CHECK: i64.store $discard=, 0($pop1), $pop2{{$}}
+; CHECK: i64.store $drop=, 0($pop1), $pop2{{$}}
define void @store_i64_with_unfolded_gep_offset(i64* %p) {
%s = getelementptr i64, i64* %p, i32 3
store i64 0, i64* %s
ret void
}
+; CHECK-LABEL: store_i32_with_folded_or_offset:
+; CHECK: i32.store8 $drop=, 2($pop{{[0-9]+}}), $pop{{[0-9]+}}{{$}}
+define void @store_i32_with_folded_or_offset(i32 %x) {
+ %and = and i32 %x, -4
+ %t0 = inttoptr i32 %and to i8*
+ %arrayidx = getelementptr inbounds i8, i8* %t0, i32 2
+ store i8 0, i8* %arrayidx, align 1
+ ret void
+}
+
; When loading from a fixed address, materialize a zero.
; CHECK-LABEL: load_i32_from_numeric_address
@@ -266,8 +287,9 @@ define i32 @load_i32_from_global_address() {
}
; CHECK-LABEL: store_i32_to_numeric_address:
-; CHECK: i32.const $0=, 0{{$}}
-; CHECK: i32.store $discard=, 42($0), $0{{$}}
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: i32.const $push1=, 0{{$}}
+; CHECK-NEXT: i32.store $drop=, 42($pop0), $pop1{{$}}
define void @store_i32_to_numeric_address() {
%s = inttoptr i32 42 to i32*
store i32 0, i32* %s
@@ -275,8 +297,9 @@ define void @store_i32_to_numeric_address() {
}
; CHECK-LABEL: store_i32_to_global_address:
-; CHECK: i32.const $0=, 0{{$}}
-; CHECK: i32.store $discard=, gv($0), $0{{$}}
+; CHECK: i32.const $push0=, 0{{$}}
+; CHECK: i32.const $push1=, 0{{$}}
+; CHECK: i32.store $drop=, gv($pop0), $pop1{{$}}
define void @store_i32_to_global_address() {
store i32 0, i32* @gv
ret void
@@ -333,7 +356,7 @@ define i32 @load_i8_u_with_folded_gep_offset(i8* %p) {
; Fold an offset into a truncating store.
; CHECK-LABEL: store_i8_with_folded_offset:
-; CHECK: i32.store8 $discard=, 24($0), $pop0{{$}}
+; CHECK: i32.store8 $drop=, 24($0), $pop0{{$}}
define void @store_i8_with_folded_offset(i8* %p) {
%q = ptrtoint i8* %p to i32
%r = add nuw i32 %q, 24
@@ -345,7 +368,7 @@ define void @store_i8_with_folded_offset(i8* %p) {
; Fold a gep offset into a truncating store.
; CHECK-LABEL: store_i8_with_folded_gep_offset:
-; CHECK: i32.store8 $discard=, 24($0), $pop0{{$}}
+; CHECK: i32.store8 $drop=, 24($0), $pop0{{$}}
define void @store_i8_with_folded_gep_offset(i8* %p) {
%s = getelementptr inbounds i8, i8* %p, i32 24
store i8 0, i8* %s
@@ -359,10 +382,10 @@ define void @store_i8_with_folded_gep_offset(i8* %p) {
; CHECK: i32.load $3=, 4($0){{$}}
; CHECK: i32.load $4=, 8($0){{$}}
; CHECK: i32.load $push0=, 12($0){{$}}
-; CHECK: i32.store $discard=, 12($1), $pop0{{$}}
-; CHECK: i32.store $discard=, 8($1), $4{{$}}
-; CHECK: i32.store $discard=, 4($1), $3{{$}}
-; CHECK: i32.store $discard=, 0($1), $2{{$}}
+; CHECK: i32.store $drop=, 12($1), $pop0{{$}}
+; CHECK: i32.store $drop=, 8($1), $4{{$}}
+; CHECK: i32.store $drop=, 4($1), $3{{$}}
+; CHECK: i32.store $drop=, 0($1), $2{{$}}
define void @aggregate_load_store({i32,i32,i32,i32}* %p, {i32,i32,i32,i32}* %q) {
; volatile so that things stay in order for the tests above
%t = load volatile {i32,i32,i32,i32}, {i32, i32,i32,i32}* %p
@@ -370,14 +393,27 @@ define void @aggregate_load_store({i32,i32,i32,i32}* %p, {i32,i32,i32,i32}* %q)
ret void
}
-; Fold the offsets when lowering aggregate return values.
+; Fold the offsets when lowering aggregate return values. The stores get
+; merged into i64 stores.
; CHECK-LABEL: aggregate_return:
-; CHECK: i32.const $push0=, 0{{$}}
-; CHECK: i32.store $push1=, 12($0), $pop0{{$}}
-; CHECK: i32.store $push2=, 8($0), $pop1{{$}}
-; CHECK: i32.store $push3=, 4($0), $pop2{{$}}
-; CHECK: i32.store $discard=, 0($0), $pop3{{$}}
+; CHECK: i64.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK: i64.store $push[[L1:[0-9]+]]=, 8($0):p2align=2, $pop[[L0]]{{$}}
+; CHECK: i64.store $drop=, 0($0):p2align=2, $pop[[L1]]{{$}}
define {i32,i32,i32,i32} @aggregate_return() {
ret {i32,i32,i32,i32} zeroinitializer
}
+
+; Fold the offsets when lowering aggregate return values. The stores are not
+; merged.
+
+; CHECK-LABEL: aggregate_return_without_merge:
+; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK: i32.store8 $push[[L1:[0-9]+]]=, 14($0), $pop[[L0]]{{$}}
+; CHECK: i32.store16 $push[[L2:[0-9]+]]=, 12($0), $pop[[L1]]{{$}}
+; CHECK: i32.store $drop=, 8($0), $pop[[L2]]{{$}}
+; CHECK: i64.const $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK: i64.store $drop=, 0($0), $pop[[L3]]{{$}}
+define {i64,i32,i16,i8} @aggregate_return_without_merge() {
+ ret {i64,i32,i16,i8} zeroinitializer
+}
diff --git a/test/CodeGen/WebAssembly/phi.ll b/test/CodeGen/WebAssembly/phi.ll
index 00e5859b75cf..747ae5cb15d4 100644
--- a/test/CodeGen/WebAssembly/phi.ll
+++ b/test/CodeGen/WebAssembly/phi.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
; Test that phis are lowered.
diff --git a/test/CodeGen/WebAssembly/reg-stackify.ll b/test/CodeGen/WebAssembly/reg-stackify.ll
index f8cae7f92404..23cbd03aa080 100644
--- a/test/CodeGen/WebAssembly/reg-stackify.ll
+++ b/test/CodeGen/WebAssembly/reg-stackify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
; Test the register stackifier pass.
@@ -28,7 +28,7 @@ define i32 @no1(i32* %p, i32* dereferenceable(4) %q) {
; Yes because of invariant load and no side effects.
; CHECK-LABEL: yes0:
-; CHECK: return $pop0{{$}}
+; CHECK: return $pop{{[0-9]+}}{{$}}
define i32 @yes0(i32* %p, i32* dereferenceable(4) %q) {
%t = load i32, i32* %q, !invariant.load !0
store i32 0, i32* %p
@@ -44,32 +44,67 @@ define i32 @yes1(i32* %q) {
ret i32 %t
}
+; Yes because undefined behavior can be sunk past a store.
+
+; CHECK-LABEL: sink_trap:
+; CHECK: return $pop{{[0-9]+}}{{$}}
+define i32 @sink_trap(i32 %x, i32 %y, i32* %p) {
+ %t = sdiv i32 %x, %y
+ store volatile i32 0, i32* %p
+ ret i32 %t
+}
+
+; Yes because the call is readnone.
+
+; CHECK-LABEL: sink_readnone_call:
+; CHECK: return $pop0{{$}}
+declare i32 @readnone_callee() readnone nounwind
+define i32 @sink_readnone_call(i32 %x, i32 %y, i32* %p) {
+ %t = call i32 @readnone_callee()
+ store volatile i32 0, i32* %p
+ ret i32 %t
+}
+
+; No because the call is readonly and there's an intervening store.
+
+; CHECK-LABEL: no_sink_readonly_call:
+; CHECK: return ${{[0-9]+}}{{$}}
+declare i32 @readonly_callee() readonly nounwind
+define i32 @no_sink_readonly_call(i32 %x, i32 %y, i32* %p) {
+ %t = call i32 @readonly_callee()
+ store i32 0, i32* %p
+ ret i32 %t
+}
+
; Don't schedule stack uses into the stack. To reduce register pressure, the
; scheduler might be tempted to move the definition of $2 down. However, this
; would risk getting incorrect liveness if the instructions are later
; rearranged to make the stack contiguous.
; CHECK-LABEL: stack_uses:
-; CHECK-NEXT: .param i32, i32, i32, i32{{$}}
+; CHECK: .param i32, i32, i32, i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: .local i32, i32{{$}}
-; CHECK-NEXT: i32.const $5=, 2{{$}}
-; CHECK-NEXT: i32.const $4=, 1{{$}}
; CHECK-NEXT: block{{$}}
-; CHECK-NEXT: i32.lt_s $push0=, $0, $4{{$}}
-; CHECK-NEXT: i32.lt_s $push1=, $1, $5{{$}}
-; CHECK-NEXT: i32.xor $push4=, $pop0, $pop1{{$}}
-; CHECK-NEXT: i32.lt_s $push2=, $2, $4{{$}}
-; CHECK-NEXT: i32.lt_s $push3=, $3, $5{{$}}
-; CHECK-NEXT: i32.xor $push5=, $pop2, $pop3{{$}}
-; CHECK-NEXT: i32.xor $push6=, $pop4, $pop5{{$}}
-; CHECK-NEXT: i32.ne $push7=, $pop6, $4{{$}}
-; CHECK-NEXT: br_if $pop7, 0{{$}}
-; CHECK-NEXT: i32.const $push8=, 0{{$}}
-; CHECK-NEXT: return $pop8{{$}}
-; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: i32.const $push[[L13:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.lt_s $push[[L0:[0-9]+]]=, $0, $pop[[L13]]{{$}}
+; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 2{{$}}
+; CHECK-NEXT: i32.lt_s $push[[L2:[0-9]+]]=, $1, $pop[[L1]]{{$}}
+; CHECK-NEXT: i32.xor $push[[L5:[0-9]+]]=, $pop[[L0]], $pop[[L2]]{{$}}
+; CHECK-NEXT: i32.const $push[[L12:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.lt_s $push[[L3:[0-9]+]]=, $2, $pop[[L12]]{{$}}
+; CHECK-NEXT: i32.const $push[[L11:[0-9]+]]=, 2{{$}}
+; CHECK-NEXT: i32.lt_s $push[[L4:[0-9]+]]=, $3, $pop[[L11]]{{$}}
+; CHECK-NEXT: i32.xor $push[[L6:[0-9]+]]=, $pop[[L3]], $pop[[L4]]{{$}}
+; CHECK-NEXT: i32.xor $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; CHECK-NEXT: i32.const $push10=, 1{{$}}
+; CHECK-NEXT: i32.ne $push8=, $pop7, $pop10{{$}}
+; CHECK-NEXT: br_if 0, $pop8{{$}}
+; CHECK-NEXT: i32.const $push9=, 0{{$}}
+; CHECK-NEXT: return $pop9{{$}}
+; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: end_block{{$}}
-; CHECK-NEXT: return $4{{$}}
+; CHECK-NEXT: i32.const $push14=, 1{{$}}
+; CHECK-NEXT: return $pop14{{$}}
define i32 @stack_uses(i32 %x, i32 %y, i32 %z, i32 %w) {
entry:
%c = icmp sle i32 %x, 0
@@ -87,19 +122,20 @@ false:
}
; Test an interesting case where the load has multiple uses and cannot
-; be trivially stackified.
+; be trivially stackified. However, it can be stackified with a tee_local.
; CHECK-LABEL: multiple_uses:
-; CHECK-NEXT: .param i32, i32, i32{{$}}
+; CHECK: .param i32, i32, i32{{$}}
; CHECK-NEXT: .local i32{{$}}
-; CHECK-NEXT: i32.load $3=, 0($2){{$}}
; CHECK-NEXT: block{{$}}
-; CHECK-NEXT: i32.ge_u $push0=, $3, $1{{$}}
-; CHECK-NEXT: br_if $pop0, 0{{$}}
-; CHECK-NEXT: i32.lt_u $push1=, $3, $0{{$}}
-; CHECK-NEXT: br_if $pop1, 0{{$}}
-; CHECK-NEXT: i32.store $discard=, 0($2), $3{{$}}
-; CHECK-NEXT: .LBB5_3:
+; CHECK-NEXT: i32.load $push[[NUM0:[0-9]+]]=, 0($2){{$}}
+; CHECK-NEXT: tee_local $push[[NUM1:[0-9]+]]=, $3=, $pop[[NUM0]]{{$}}
+; CHECK-NEXT: i32.ge_u $push[[NUM2:[0-9]+]]=, $pop[[NUM1]], $1{{$}}
+; CHECK-NEXT: br_if 0, $pop[[NUM2]]{{$}}
+; CHECK-NEXT: i32.lt_u $push[[NUM3:[0-9]+]]=, $3, $0{{$}}
+; CHECK-NEXT: br_if 0, $pop[[NUM3]]{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($2), $3{{$}}
+; CHECK-NEXT: .LBB8_3:
; CHECK-NEXT: end_block{{$}}
; CHECK-NEXT: return{{$}}
define void @multiple_uses(i32* %arg0, i32* %arg1, i32* %arg2) nounwind {
@@ -125,4 +161,314 @@ return:
ret void
}
-!0 = !{}
+; Don't stackify stores across other instructions with side effects.
+
+; CHECK: side_effects:
+; CHECK: store
+; CHECK-NEXT: call
+; CHECK-NEXT: store
+; CHECK-NEXT: call
+declare void @evoke_side_effects()
+define hidden void @stackify_store_across_side_effects(double* nocapture %d) {
+entry:
+ store double 2.0, double* %d
+ call void @evoke_side_effects()
+ store double 2.0, double* %d
+ call void @evoke_side_effects()
+ ret void
+}
+
+; Div instructions have side effects and can't be reordered, but this entire
+; function should still be able to be stackified because it's already in
+; tree order.
+
+; CHECK-LABEL: div_tree:
+; CHECK: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.div_s $push[[L0:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: i32.div_s $push[[L1:[0-9]+]]=, $2, $3{{$}}
+; CHECK-NEXT: i32.div_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: i32.div_s $push[[L3:[0-9]+]]=, $4, $5{{$}}
+; CHECK-NEXT: i32.div_s $push[[L4:[0-9]+]]=, $6, $7{{$}}
+; CHECK-NEXT: i32.div_s $push[[L5:[0-9]+]]=, $pop[[L3]], $pop[[L4]]{{$}}
+; CHECK-NEXT: i32.div_s $push[[L6:[0-9]+]]=, $pop[[L2]], $pop[[L5]]{{$}}
+; CHECK-NEXT: i32.div_s $push[[L7:[0-9]+]]=, $8, $9{{$}}
+; CHECK-NEXT: i32.div_s $push[[L8:[0-9]+]]=, $10, $11{{$}}
+; CHECK-NEXT: i32.div_s $push[[L9:[0-9]+]]=, $pop[[L7]], $pop[[L8]]{{$}}
+; CHECK-NEXT: i32.div_s $push[[L10:[0-9]+]]=, $12, $13{{$}}
+; CHECK-NEXT: i32.div_s $push[[L11:[0-9]+]]=, $14, $15{{$}}
+; CHECK-NEXT: i32.div_s $push[[L12:[0-9]+]]=, $pop[[L10]], $pop[[L11]]{{$}}
+; CHECK-NEXT: i32.div_s $push[[L13:[0-9]+]]=, $pop[[L9]], $pop[[L12]]{{$}}
+; CHECK-NEXT: i32.div_s $push[[L14:[0-9]+]]=, $pop[[L6]], $pop[[L13]]{{$}}
+; CHECK-NEXT: return $pop[[L14]]{{$}}
+define i32 @div_tree(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) {
+entry:
+ %div = sdiv i32 %a, %b
+ %div1 = sdiv i32 %c, %d
+ %div2 = sdiv i32 %div, %div1
+ %div3 = sdiv i32 %e, %f
+ %div4 = sdiv i32 %g, %h
+ %div5 = sdiv i32 %div3, %div4
+ %div6 = sdiv i32 %div2, %div5
+ %div7 = sdiv i32 %i, %j
+ %div8 = sdiv i32 %k, %l
+ %div9 = sdiv i32 %div7, %div8
+ %div10 = sdiv i32 %m, %n
+ %div11 = sdiv i32 %o, %p
+ %div12 = sdiv i32 %div10, %div11
+ %div13 = sdiv i32 %div9, %div12
+ %div14 = sdiv i32 %div6, %div13
+ ret i32 %div14
+}
+
+; A simple multiple-use case.
+
+; CHECK-LABEL: simple_multiple_use:
+; CHECK: .param i32, i32{{$}}
+; CHECK-NEXT: i32.mul $push[[NUM0:[0-9]+]]=, $1, $0{{$}}
+; CHECK-NEXT: tee_local $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
+; CHECK-NEXT: call use_a@FUNCTION, $pop[[NUM1]]{{$}}
+; CHECK-NEXT: call use_b@FUNCTION, $[[NUM2]]{{$}}
+; CHECK-NEXT: return{{$}}
+declare void @use_a(i32)
+declare void @use_b(i32)
+define void @simple_multiple_use(i32 %x, i32 %y) {
+ %mul = mul i32 %y, %x
+ call void @use_a(i32 %mul)
+ call void @use_b(i32 %mul)
+ ret void
+}
+
+; Multiple uses of the same value in one instruction.
+
+; CHECK-LABEL: multiple_uses_in_same_insn:
+; CHECK: .param i32, i32{{$}}
+; CHECK-NEXT: i32.mul $push[[NUM0:[0-9]+]]=, $1, $0{{$}}
+; CHECK-NEXT: tee_local $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
+; CHECK-NEXT: call use_2@FUNCTION, $pop[[NUM1]], $[[NUM2]]{{$}}
+; CHECK-NEXT: return{{$}}
+declare void @use_2(i32, i32)
+define void @multiple_uses_in_same_insn(i32 %x, i32 %y) {
+ %mul = mul i32 %y, %x
+ call void @use_2(i32 %mul, i32 %mul)
+ ret void
+}
+
+; Commute operands to achieve better stackifying.
+
+; CHECK-LABEL: commute:
+; CHECK-NOT: param
+; CHECK: .result i32{{$}}
+; CHECK-NEXT: i32.call $push0=, red@FUNCTION{{$}}
+; CHECK-NEXT: i32.call $push1=, green@FUNCTION{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop0, $pop1{{$}}
+; CHECK-NEXT: i32.call $push3=, blue@FUNCTION{{$}}
+; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
+; CHECK-NEXT: return $pop4{{$}}
+declare i32 @red()
+declare i32 @green()
+declare i32 @blue()
+define i32 @commute() {
+ %call = call i32 @red()
+ %call1 = call i32 @green()
+ %add = add i32 %call1, %call
+ %call2 = call i32 @blue()
+ %add3 = add i32 %add, %call2
+ ret i32 %add3
+}
+
+; Don't stackify a register when it would move the def of the register past
+; an implicit get_local for the register.
+
+; CHECK-LABEL: no_stackify_past_use:
+; CHECK: i32.call $1=, callee@FUNCTION, $0
+; CHECK-NEXT: i32.const $push0=, 1
+; CHECK-NEXT: i32.add $push1=, $0, $pop0
+; CHECK-NEXT: i32.call $push2=, callee@FUNCTION, $pop1
+; CHECK-NEXT: i32.sub $push3=, $pop2, $1
+; CHECK-NEXT: i32.div_s $push4=, $pop3, $1
+; CHECK-NEXT: return $pop4
+declare i32 @callee(i32)
+define i32 @no_stackify_past_use(i32 %arg) {
+ %tmp1 = call i32 @callee(i32 %arg)
+ %tmp2 = add i32 %arg, 1
+ %tmp3 = call i32 @callee(i32 %tmp2)
+ %tmp5 = sub i32 %tmp3, %tmp1
+ %tmp6 = sdiv i32 %tmp5, %tmp1
+ ret i32 %tmp6
+}
+
+; This is the same as no_stackify_past_use, except using a commutative operator,
+; so we can reorder the operands and stackify.
+
+; CHECK-LABEL: commute_to_fix_ordering:
+; CHECK: i32.call $push[[L0:.+]]=, callee@FUNCTION, $0
+; CHECK: tee_local $push[[L1:.+]]=, $1=, $pop[[L0]]
+; CHECK: i32.const $push0=, 1
+; CHECK: i32.add $push1=, $0, $pop0
+; CHECK: i32.call $push2=, callee@FUNCTION, $pop1
+; CHECK: i32.add $push3=, $1, $pop2
+; CHECK: i32.mul $push4=, $pop[[L1]], $pop3
+; CHECK: return $pop4
+define i32 @commute_to_fix_ordering(i32 %arg) {
+ %tmp1 = call i32 @callee(i32 %arg)
+ %tmp2 = add i32 %arg, 1
+ %tmp3 = call i32 @callee(i32 %tmp2)
+ %tmp5 = add i32 %tmp3, %tmp1
+ %tmp6 = mul i32 %tmp5, %tmp1
+ ret i32 %tmp6
+}
+
+; Stackify individual defs of virtual registers with multiple defs.
+
+; CHECK-LABEL: multiple_defs:
+; CHECK: f64.add $push[[NUM0:[0-9]+]]=, ${{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: tee_local $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
+; CHECK-NEXT: f64.select $push{{[0-9]+}}=, $pop{{[0-9]+}}, $pop[[NUM1]], ${{[0-9]+}}{{$}}
+; CHECK: $[[NUM2]]=,
+define void @multiple_defs(i32 %arg, i32 %arg1, i1 %arg2, i1 %arg3, i1 %arg4) {
+bb:
+ br label %bb5
+
+bb5: ; preds = %bb21, %bb
+ %tmp = phi double [ 0.000000e+00, %bb ], [ %tmp22, %bb21 ]
+ %tmp6 = phi double [ 0.000000e+00, %bb ], [ %tmp23, %bb21 ]
+ %tmp7 = fcmp olt double %tmp6, 2.323450e+01
+ br i1 %tmp7, label %bb8, label %bb21
+
+bb8: ; preds = %bb17, %bb5
+ %tmp9 = phi double [ %tmp19, %bb17 ], [ %tmp, %bb5 ]
+ %tmp10 = fadd double %tmp6, -1.000000e+00
+ %tmp11 = select i1 %arg2, double -1.135357e+04, double %tmp10
+ %tmp12 = fadd double %tmp11, %tmp9
+ br i1 %arg3, label %bb17, label %bb13
+
+bb13: ; preds = %bb8
+ %tmp14 = or i32 %arg1, 2
+ %tmp15 = icmp eq i32 %tmp14, 14
+ %tmp16 = select i1 %tmp15, double -1.135357e+04, double 0xBFCE147AE147B000
+ br label %bb17
+
+bb17: ; preds = %bb13, %bb8
+ %tmp18 = phi double [ %tmp16, %bb13 ], [ %tmp10, %bb8 ]
+ %tmp19 = fadd double %tmp18, %tmp12
+ %tmp20 = fcmp olt double %tmp6, 2.323450e+01
+ br i1 %tmp20, label %bb8, label %bb21
+
+bb21: ; preds = %bb17, %bb5
+ %tmp22 = phi double [ %tmp, %bb5 ], [ %tmp9, %bb17 ]
+ %tmp23 = fadd double %tmp6, 1.000000e+00
+ br label %bb5
+}
+
+; Don't move calls past loads
+; CHECK-LABEL: no_stackify_call_past_load:
+; CHECK: i32.call $0=, red
+; CHECK: i32.const $push0=, 0
+; CHECK: i32.load $1=, count($pop0)
+@count = hidden global i32 0, align 4
+define i32 @no_stackify_call_past_load() {
+ %a = call i32 @red()
+ %b = load i32, i32* @count, align 4
+ call i32 @callee(i32 %a)
+ ret i32 %b
+ ; use of a
+}
+
+; Don't move stores past loads if there may be aliasing
+; CHECK-LABEL: no_stackify_store_past_load
+; CHECK: i32.store $[[L0:[0-9]+]]=, 0($1), $0
+; CHECK: i32.load {{.*}}, 0($2)
+; CHECK: i32.call {{.*}}, callee@FUNCTION, $[[L0]]{{$}}
+define i32 @no_stackify_store_past_load(i32 %a, i32* %p1, i32* %p2) {
+ store i32 %a, i32* %p1
+ %b = load i32, i32* %p2, align 4
+ call i32 @callee(i32 %a)
+ ret i32 %b
+}
+
+; Can still stackify past invariant loads.
+; CHECK-LABEL: store_past_invar_load
+; CHECK: i32.store $push{{.*}}, 0($1), $0
+; CHECK: i32.call {{.*}}, callee@FUNCTION, $pop
+; CHECK: i32.load $push{{.*}}, 0($2)
+; CHECK: return $pop
+define i32 @store_past_invar_load(i32 %a, i32* %p1, i32* dereferenceable(4) %p2) {
+ store i32 %a, i32* %p1
+ %b = load i32, i32* %p2, !invariant.load !0
+ call i32 @callee(i32 %a)
+ ret i32 %b
+}
+
+; CHECK-LABEL: ignore_dbg_value:
+; CHECK-NEXT: .Lfunc_begin
+; CHECK-NEXT: unreachable
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+define void @ignore_dbg_value() {
+ call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !7, metadata !9), !dbg !10
+ unreachable
+}
+
+; Don't stackify an expression that might use the stack into a return, since we
+; might insert a prologue before the return.
+
+; CHECK-LABEL: no_stackify_past_epilogue:
+; CHECK: return ${{[0-9]+}}{{$}}
+declare i32 @use_memory(i32*)
+define i32 @no_stackify_past_epilogue() {
+ %x = alloca i32
+ %call = call i32 @use_memory(i32* %x)
+ ret i32 %call
+}
+
+; Stackify a loop induction variable into a loop comparison.
+
+; CHECK-LABEL: stackify_indvar:
+; CHECK: i32.const $push[[L5:.+]]=, 1{{$}}
+; CHECK-NEXT: i32.add $push[[L4:.+]]=, $[[R0:.+]], $pop[[L5]]{{$}}
+; CHECK-NEXT: tee_local $push[[L3:.+]]=, $[[R0]]=, $pop[[L4]]{{$}}
+; CHECK-NEXT: i32.ne $push[[L2:.+]]=, $0, $pop[[L3]]{{$}}
+define void @stackify_indvar(i32 %tmp, i32* %v) #0 {
+bb:
+ br label %bb3
+
+bb3: ; preds = %bb3, %bb2
+ %tmp4 = phi i32 [ %tmp7, %bb3 ], [ 0, %bb ]
+ %tmp5 = load volatile i32, i32* %v, align 4
+ %tmp6 = add nsw i32 %tmp5, %tmp4
+ store volatile i32 %tmp6, i32* %v, align 4
+ %tmp7 = add nuw nsw i32 %tmp4, 1
+ %tmp8 = icmp eq i32 %tmp7, %tmp
+ br i1 %tmp8, label %bb10, label %bb3
+
+bb10: ; preds = %bb9, %bb
+ ret void
+}
+
+; Don't stackify a call past a __stack_pointer store.
+
+; CHECK-LABEL: stackpointer_dependency:
+; CHECK: call {{.+}}, stackpointer_callee@FUNCTION,
+; CHECK: i32.const $push[[L0:.+]]=, 0
+; CHECK-NEXT: i32.store $drop=, __stack_pointer($pop[[L0]]),
+declare i32 @stackpointer_callee(i8* readnone, i8* readnone)
+declare i8* @llvm.frameaddress(i32)
+define i32 @stackpointer_dependency(i8* readnone) {
+ %2 = tail call i8* @llvm.frameaddress(i32 0)
+ %3 = tail call i32 @stackpointer_callee(i8* %0, i8* %2)
+ ret i32 %3
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 3.9.0 (trunk 266005) (llvm/trunk 266105)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !3)
+!2 = !DIFile(filename: "test.c", directory: "/")
+!3 = !{}
+!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !6, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: true, unit: !1, variables: !3)
+!6 = !DISubroutineType(types: !3)
+!7 = !DILocalVariable(name: "nzcnt", scope: !5, file: !2, line: 15, type: !8)
+!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !DIExpression()
+!10 = !DILocation(line: 15, column: 6, scope: !5)
diff --git a/test/CodeGen/WebAssembly/return-int32.ll b/test/CodeGen/WebAssembly/return-int32.ll
index a93a0f6c438b..9e663b969e14 100644
--- a/test/CodeGen/WebAssembly/return-int32.ll
+++ b/test/CodeGen/WebAssembly/return-int32.ll
@@ -1,10 +1,34 @@
; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: return_i32:
-; CHECK: return $0{{$}}
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: copy_local $push0=, $0
+; CHECK-NEXT: .endfunc{{$}}
define i32 @return_i32(i32 %p) {
ret i32 %p
}
+
+; CHECK-LABEL: return_i32_twice:
+; CHECK: store
+; CHECK-NEXT: i32.const $push[[L0:[^,]+]]=, 1{{$}}
+; CHECK-NEXT: return $pop[[L0]]{{$}}
+; CHECK: store
+; CHECK-NEXT: i32.const $push{{[^,]+}}=, 3{{$}}
+; CHECK-NEXT: .endfunc{{$}}
+define i32 @return_i32_twice(i32 %a) {
+ %b = icmp ne i32 %a, 0
+ br i1 %b, label %true, label %false
+
+true:
+ store i32 0, i32* null
+ ret i32 1
+
+false:
+ store i32 2, i32* null
+ ret i32 3
+}
diff --git a/test/CodeGen/WebAssembly/return-void.ll b/test/CodeGen/WebAssembly/return-void.ll
index 65ff5f325719..c3a600f7838d 100644
--- a/test/CodeGen/WebAssembly/return-void.ll
+++ b/test/CodeGen/WebAssembly/return-void.ll
@@ -1,10 +1,29 @@
; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: return_void:
-; CHECK: return{{$}}
+; CHECK-NEXT: .endfunc{{$}}
define void @return_void() {
ret void
}
+
+; CHECK-LABEL: return_void_twice:
+; CHECK: store
+; CHECK-NEXT: return{{$}}
+; CHECK: store
+; CHECK-NEXT: .endfunc{{$}}
+define void @return_void_twice(i32 %a) {
+ %b = icmp ne i32 %a, 0
+ br i1 %b, label %true, label %false
+
+true:
+ store i32 0, i32* null
+ ret void
+
+false:
+ store i32 1, i32* null
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/returned.ll b/test/CodeGen/WebAssembly/returned.ll
index 9c892bb3ecea..a277928ae400 100644
--- a/test/CodeGen/WebAssembly/returned.ll
+++ b/test/CodeGen/WebAssembly/returned.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that the "returned" attribute is optimized effectively.
@@ -38,7 +38,7 @@ entry:
; CHECK-LABEL: test_constant_arg:
; CHECK-NEXT: i32.const $push0=, global{{$}}
-; CHECK-NEXT: {{^}} i32.call $discard=, returns_arg@FUNCTION, $pop0{{$}}
+; CHECK-NEXT: {{^}} i32.call $drop=, returns_arg@FUNCTION, $pop0{{$}}
; CHECK-NEXT: return{{$}}
@global = external global i32
@addr = global i32* @global
diff --git a/test/CodeGen/WebAssembly/select.ll b/test/CodeGen/WebAssembly/select.ll
index 416f58cac0d3..06837e4c2368 100644
--- a/test/CodeGen/WebAssembly/select.ll
+++ b/test/CodeGen/WebAssembly/select.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -fast-isel | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -fast-isel -fast-isel-abort=1 | FileCheck %s
; Test that wasm select instruction is selected from LLVM select instruction.
@@ -9,7 +9,7 @@ target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: select_i32_bool:
; CHECK-NEXT: .param i32, i32, i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.select $push0=, $0, $1, $2{{$}}
+; CHECK-NEXT: i32.select $push0=, $1, $2, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) {
%cond = select i1 %a, i32 %b, i32 %c
@@ -19,7 +19,7 @@ define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) {
; CHECK-LABEL: select_i32_eq:
; CHECK-NEXT: .param i32, i32, i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.select $push0=, $0, $2, $1{{$}}
+; CHECK-NEXT: i32.select $push0=, $2, $1, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define i32 @select_i32_eq(i32 %a, i32 %b, i32 %c) {
%cmp = icmp eq i32 %a, 0
@@ -30,7 +30,7 @@ define i32 @select_i32_eq(i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: select_i32_ne:
; CHECK-NEXT: .param i32, i32, i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.select $push0=, $0, $1, $2{{$}}
+; CHECK-NEXT: i32.select $push0=, $1, $2, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define i32 @select_i32_ne(i32 %a, i32 %b, i32 %c) {
%cmp = icmp ne i32 %a, 0
@@ -41,7 +41,7 @@ define i32 @select_i32_ne(i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: select_i64_bool:
; CHECK-NEXT: .param i32, i64, i64{{$}}
; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.select $push0=, $0, $1, $2{{$}}
+; CHECK-NEXT: i64.select $push0=, $1, $2, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) {
%cond = select i1 %a, i64 %b, i64 %c
@@ -51,7 +51,7 @@ define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) {
; CHECK-LABEL: select_i64_eq:
; CHECK-NEXT: .param i32, i64, i64{{$}}
; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.select $push0=, $0, $2, $1{{$}}
+; CHECK-NEXT: i64.select $push0=, $2, $1, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define i64 @select_i64_eq(i32 %a, i64 %b, i64 %c) {
%cmp = icmp eq i32 %a, 0
@@ -62,7 +62,7 @@ define i64 @select_i64_eq(i32 %a, i64 %b, i64 %c) {
; CHECK-LABEL: select_i64_ne:
; CHECK-NEXT: .param i32, i64, i64{{$}}
; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.select $push0=, $0, $1, $2{{$}}
+; CHECK-NEXT: i64.select $push0=, $1, $2, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define i64 @select_i64_ne(i32 %a, i64 %b, i64 %c) {
%cmp = icmp ne i32 %a, 0
@@ -73,7 +73,7 @@ define i64 @select_i64_ne(i32 %a, i64 %b, i64 %c) {
; CHECK-LABEL: select_f32_bool:
; CHECK-NEXT: .param i32, f32, f32{{$}}
; CHECK-NEXT: .result f32{{$}}
-; CHECK-NEXT: f32.select $push0=, $0, $1, $2{{$}}
+; CHECK-NEXT: f32.select $push0=, $1, $2, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define float @select_f32_bool(i1 zeroext %a, float %b, float %c) {
%cond = select i1 %a, float %b, float %c
@@ -83,7 +83,7 @@ define float @select_f32_bool(i1 zeroext %a, float %b, float %c) {
; CHECK-LABEL: select_f32_eq:
; CHECK-NEXT: .param i32, f32, f32{{$}}
; CHECK-NEXT: .result f32{{$}}
-; CHECK-NEXT: f32.select $push0=, $0, $2, $1{{$}}
+; CHECK-NEXT: f32.select $push0=, $2, $1, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define float @select_f32_eq(i32 %a, float %b, float %c) {
%cmp = icmp eq i32 %a, 0
@@ -94,7 +94,7 @@ define float @select_f32_eq(i32 %a, float %b, float %c) {
; CHECK-LABEL: select_f32_ne:
; CHECK-NEXT: .param i32, f32, f32{{$}}
; CHECK-NEXT: .result f32{{$}}
-; CHECK-NEXT: f32.select $push0=, $0, $1, $2{{$}}
+; CHECK-NEXT: f32.select $push0=, $1, $2, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define float @select_f32_ne(i32 %a, float %b, float %c) {
%cmp = icmp ne i32 %a, 0
@@ -105,7 +105,7 @@ define float @select_f32_ne(i32 %a, float %b, float %c) {
; CHECK-LABEL: select_f64_bool:
; CHECK-NEXT: .param i32, f64, f64{{$}}
; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: f64.select $push0=, $0, $1, $2{{$}}
+; CHECK-NEXT: f64.select $push0=, $1, $2, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define double @select_f64_bool(i1 zeroext %a, double %b, double %c) {
%cond = select i1 %a, double %b, double %c
@@ -115,7 +115,7 @@ define double @select_f64_bool(i1 zeroext %a, double %b, double %c) {
; CHECK-LABEL: select_f64_eq:
; CHECK-NEXT: .param i32, f64, f64{{$}}
; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: f64.select $push0=, $0, $2, $1{{$}}
+; CHECK-NEXT: f64.select $push0=, $2, $1, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define double @select_f64_eq(i32 %a, double %b, double %c) {
%cmp = icmp eq i32 %a, 0
@@ -126,7 +126,7 @@ define double @select_f64_eq(i32 %a, double %b, double %c) {
; CHECK-LABEL: select_f64_ne:
; CHECK-NEXT: .param i32, f64, f64{{$}}
; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: f64.select $push0=, $0, $1, $2{{$}}
+; CHECK-NEXT: f64.select $push0=, $1, $2, $0{{$}}
; CHECK-NEXT: return $pop0{{$}}
define double @select_f64_ne(i32 %a, double %b, double %c) {
%cmp = icmp ne i32 %a, 0
diff --git a/test/CodeGen/WebAssembly/signext-zeroext.ll b/test/CodeGen/WebAssembly/signext-zeroext.ll
index f6f56363c1af..f9561da5363d 100644
--- a/test/CodeGen/WebAssembly/signext-zeroext.ll
+++ b/test/CodeGen/WebAssembly/signext-zeroext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test zeroext and signext ABI keywords
@@ -8,10 +8,10 @@ target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: z2s_func:
; CHECK-NEXT: .param i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: .local i32{{$}}
-; CHECK-NEXT: i32.const $[[NUM0:[0-9]+]]=, 24{{$}}
-; CHECK-NEXT: i32.shl $push[[NUM2:[0-9]+]]=, $0, $[[NUM0]]{{$}}
-; CHECK-NEXT: i32.shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $[[NUM0]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM0:[0-9]+]]=, 24{{$}}
+; CHECK-NEXT: i32.shl $push[[NUM2:[0-9]+]]=, $0, $pop[[NUM0]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM1:[0-9]+]]=, 24{{$}}
+; CHECK-NEXT: i32.shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $pop[[NUM1]]{{$}}
; CHECK-NEXT: return $pop[[NUM3]]{{$}}
define signext i8 @z2s_func(i8 zeroext %t) {
ret i8 %t
@@ -44,13 +44,15 @@ define i32 @z2s_call(i32 %t) {
; CHECK-LABEL: s2z_call:
; CHECK-NEXT: .param i32{{$}}
; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: .local i32{{$}}
-; CHECK-NEXT: i32.const $[[NUM0:[0-9]+]]=, 24{{$}}
-; CHECK-NEXT: i32.shl $push[[NUM1:[0-9]+]]=, $0, $[[NUM0]]{{$}}
-; CHECK-NEXT: i32.shr_s $push[[NUM2:[0-9]+]]=, $pop[[NUM1]], $[[NUM0]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM0:[0-9]+]]=, 24{{$}}
+; CHECK-NEXT: i32.shl $push[[NUM1:[0-9]+]]=, $0, $pop[[NUM0]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM6:[0-9]+]]=, 24{{$}}
+; CHECK-NEXT: i32.shr_s $push[[NUM2:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM6]]{{$}}
; CHECK-NEXT: call $push[[NUM3:[0-9]]]=, s2z_func@FUNCTION, $pop[[NUM2]]{{$}}
-; CHECK-NEXT: i32.shl $push[[NUM4:[0-9]+]]=, $pop[[NUM3]], $[[NUM0]]{{$}}
-; CHECK-NEXT: i32.shr_s $push[[NUM5:[0-9]+]]=, $pop[[NUM4]], $[[NUM0]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM7:[0-9]+]]=, 24{{$}}
+; CHECK-NEXT: i32.shl $push[[NUM4:[0-9]+]]=, $pop[[NUM3]], $pop[[NUM7]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM8:[0-9]+]]=, 24{{$}}
+; CHECK-NEXT: i32.shr_s $push[[NUM5:[0-9]+]]=, $pop[[NUM4]], $pop[[NUM8]]{{$}}
; CHECK-NEXT: return $pop[[NUM5]]{{$}}
define i32 @s2z_call(i32 %t) {
%s = trunc i32 %t to i8
diff --git a/test/CodeGen/WebAssembly/store-results.ll b/test/CodeGen/WebAssembly/store-results.ll
index ae74133fe386..121ee910f853 100644
--- a/test/CodeGen/WebAssembly/store-results.ll
+++ b/test/CodeGen/WebAssembly/store-results.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Test that the wasm-store-results pass makes users of stored values use the
; result of store expressions to reduce get_local/set_local traffic.
@@ -7,7 +7,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: single_block:
-; CHECK-NOT: .local
+; CHECK-NOT: local
; CHECK: i32.const $push{{[0-9]+}}=, 0{{$}}
; CHECK: i32.store $push[[STORE:[0-9]+]]=, 0($0), $pop{{[0-9]+}}{{$}}
; CHECK: return $pop[[STORE]]{{$}}
@@ -26,7 +26,7 @@ entry:
@pos = global %class.Vec3 zeroinitializer, align 4
; CHECK-LABEL: foo:
-; CHECK: i32.store $discard=, pos($0), $0{{$}}
+; CHECK: i32.store $drop=, pos($pop{{[0-9]+}}), $pop{{[0-9]+}}{{$}}
define void @foo() {
for.body.i:
br label %for.body5.i
@@ -44,7 +44,7 @@ for.cond.cleanup4.i:
}
; CHECK-LABEL: bar:
-; CHECK: i32.store $discard=, pos($0), $0{{$}}
+; CHECK: i32.store $drop=, pos($pop{{[0-9]+}}), $pop{{[0-9]+}}{{$}}
define void @bar() {
for.body.i:
br label %for.body5.i
@@ -59,3 +59,14 @@ for.body5.i:
for.cond.cleanup4.i:
ret void
}
+
+; CHECK-LABEL: fi_ret:
+; CHECK: i32.store $push0=,
+; CHECK: return $pop0{{$}}
+define hidden i8* @fi_ret(i8** %addr) {
+entry:
+ %buf = alloca [27 x i8], align 16
+ %0 = getelementptr inbounds [27 x i8], [27 x i8]* %buf, i32 0, i32 0
+ store i8* %0, i8** %addr
+ ret i8* %0
+}
diff --git a/test/CodeGen/WebAssembly/store-trunc.ll b/test/CodeGen/WebAssembly/store-trunc.ll
index d069af1da7bc..75d87ef45b4d 100644
--- a/test/CodeGen/WebAssembly/store-trunc.ll
+++ b/test/CodeGen/WebAssembly/store-trunc.ll
@@ -6,7 +6,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: trunc_i8_i32:
-; CHECK: i32.store8 $discard=, 0($0), $1{{$}}
+; CHECK: i32.store8 $drop=, 0($0), $1{{$}}
define void @trunc_i8_i32(i8 *%p, i32 %v) {
%t = trunc i32 %v to i8
store i8 %t, i8* %p
@@ -14,7 +14,7 @@ define void @trunc_i8_i32(i8 *%p, i32 %v) {
}
; CHECK-LABEL: trunc_i16_i32:
-; CHECK: i32.store16 $discard=, 0($0), $1{{$}}
+; CHECK: i32.store16 $drop=, 0($0), $1{{$}}
define void @trunc_i16_i32(i16 *%p, i32 %v) {
%t = trunc i32 %v to i16
store i16 %t, i16* %p
@@ -22,7 +22,7 @@ define void @trunc_i16_i32(i16 *%p, i32 %v) {
}
; CHECK-LABEL: trunc_i8_i64:
-; CHECK: i64.store8 $discard=, 0($0), $1{{$}}
+; CHECK: i64.store8 $drop=, 0($0), $1{{$}}
define void @trunc_i8_i64(i8 *%p, i64 %v) {
%t = trunc i64 %v to i8
store i8 %t, i8* %p
@@ -30,7 +30,7 @@ define void @trunc_i8_i64(i8 *%p, i64 %v) {
}
; CHECK-LABEL: trunc_i16_i64:
-; CHECK: i64.store16 $discard=, 0($0), $1{{$}}
+; CHECK: i64.store16 $drop=, 0($0), $1{{$}}
define void @trunc_i16_i64(i16 *%p, i64 %v) {
%t = trunc i64 %v to i16
store i16 %t, i16* %p
@@ -38,7 +38,7 @@ define void @trunc_i16_i64(i16 *%p, i64 %v) {
}
; CHECK-LABEL: trunc_i32_i64:
-; CHECK: i64.store32 $discard=, 0($0), $1{{$}}
+; CHECK: i64.store32 $drop=, 0($0), $1{{$}}
define void @trunc_i32_i64(i32 *%p, i64 %v) {
%t = trunc i64 %v to i32
store i32 %t, i32* %p
diff --git a/test/CodeGen/WebAssembly/store.ll b/test/CodeGen/WebAssembly/store.ll
index dc93ebbbadb4..3ff84889a712 100644
--- a/test/CodeGen/WebAssembly/store.ll
+++ b/test/CodeGen/WebAssembly/store.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -fast-isel -fast-isel-abort=1 | FileCheck %s
; Test that basic stores are assembled properly.
@@ -7,7 +8,7 @@ target triple = "wasm32-unknown-unknown"
; CHECK-LABEL: sti32:
; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: i32.store $discard=, 0($0), $1{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0), $1{{$}}
; CHECK-NEXT: return{{$}}
define void @sti32(i32 *%p, i32 %v) {
store i32 %v, i32* %p
@@ -16,7 +17,7 @@ define void @sti32(i32 *%p, i32 %v) {
; CHECK-LABEL: sti64:
; CHECK-NEXT: .param i32, i64{{$}}
-; CHECK-NEXT: i64.store $discard=, 0($0), $1{{$}}
+; CHECK-NEXT: i64.store $drop=, 0($0), $1{{$}}
; CHECK-NEXT: return{{$}}
define void @sti64(i64 *%p, i64 %v) {
store i64 %v, i64* %p
@@ -25,7 +26,7 @@ define void @sti64(i64 *%p, i64 %v) {
; CHECK-LABEL: stf32:
; CHECK-NEXT: .param i32, f32{{$}}
-; CHECK-NEXT: f32.store $discard=, 0($0), $1{{$}}
+; CHECK-NEXT: f32.store $drop=, 0($0), $1{{$}}
; CHECK-NEXT: return{{$}}
define void @stf32(float *%p, float %v) {
store float %v, float* %p
@@ -34,7 +35,7 @@ define void @stf32(float *%p, float %v) {
; CHECK-LABEL: stf64:
; CHECK-NEXT: .param i32, f64{{$}}
-; CHECK-NEXT: f64.store $discard=, 0($0), $1{{$}}
+; CHECK-NEXT: f64.store $drop=, 0($0), $1{{$}}
; CHECK-NEXT: return{{$}}
define void @stf64(double *%p, double %v) {
store double %v, double* %p
diff --git a/test/CodeGen/WebAssembly/switch.ll b/test/CodeGen/WebAssembly/switch.ll
index 3df5e7f9cf6f..8355bc8562d6 100644
--- a/test/CodeGen/WebAssembly/switch.ll
+++ b/test/CodeGen/WebAssembly/switch.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-block-placement -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs | FileCheck %s
; Test switch instructions. Block placement is disabled because it reorders
; the blocks in a way that isn't interesting here.
@@ -21,7 +21,7 @@ declare void @foo5()
; CHECK: block{{$}}
; CHECK: block{{$}}
; CHECK: block{{$}}
-; CHECK: tableswitch {{[^,]*}}, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5{{$}}
+; CHECK: br_table {{[^,]+}}, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 0{{$}}
; CHECK: .LBB0_2:
; CHECK: call foo0@FUNCTION{{$}}
; CHECK: .LBB0_3:
@@ -101,7 +101,7 @@ sw.epilog: ; preds = %entry, %sw.bb.5, %s
; CHECK: block{{$}}
; CHECK: block{{$}}
; CHECK: block{{$}}
-; CHECK: tableswitch {{[^,]*}}, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5{{$}}
+; CHECK: br_table {{[^,]+}}, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 0{{$}}
; CHECK: .LBB1_2:
; CHECK: call foo0@FUNCTION{{$}}
; CHECK: .LBB1_3:
diff --git a/test/CodeGen/WebAssembly/unreachable.ll b/test/CodeGen/WebAssembly/unreachable.ll
index 7b23bf3cecfb..77fda44d5ff3 100644
--- a/test/CodeGen/WebAssembly/unreachable.ll
+++ b/test/CodeGen/WebAssembly/unreachable.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -asm-verbose=false -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -fast-isel -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 -verify-machineinstrs | FileCheck %s
; Test that LLVM unreachable instruction and trap intrinsic are lowered to
; wasm unreachable
diff --git a/test/CodeGen/WebAssembly/unused-argument.ll b/test/CodeGen/WebAssembly/unused-argument.ll
index 00dea769ee86..ff943b215438 100644
--- a/test/CodeGen/WebAssembly/unused-argument.ll
+++ b/test/CodeGen/WebAssembly/unused-argument.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
; Make sure that argument offsets are correct even if some arguments are unused.
@@ -22,7 +22,7 @@ define i32 @unused_second(i32 %x, i32 %y) {
}
; CHECK-LABEL: call_something:
-; CHECK-NEXT: {{^}} i32.call $discard=, return_something@FUNCTION{{$}}
+; CHECK-NEXT: {{^}} i32.call $drop=, return_something@FUNCTION{{$}}
; CHECK-NEXT: return{{$}}
declare i32 @return_something()
define void @call_something() {
diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll
index cc50192b66db..66ac2cce7079 100644
--- a/test/CodeGen/WebAssembly/userstack.ll
+++ b/test/CodeGen/WebAssembly/userstack.ll
@@ -1,102 +1,255 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -fast-isel | FileCheck %s
-
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
+declare void @ext_func(i64* %ptr)
+declare void @ext_func_i32(i32* %ptr)
+
; CHECK-LABEL: alloca32:
; Check that there is an extra local for the stack pointer.
-; CHECK: .local i32, i32, i32, i32{{$}}
-define void @alloca32() {
- ; CHECK: i32.const [[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load [[L1]]=, 0([[L1]])
- ; CHECK-NEXT: i32.const [[L2:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, [[L1]], [[L2]]
+; CHECK: .local i32{{$}}
+define void @alloca32() noredzone {
+ ; CHECK: i32.const $push[[L4:.+]]=, 0{{$}}
+ ; CHECK: i32.const $push[[L1:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.load $push[[L2:.+]]=, __stack_pointer($pop[[L1]])
+ ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L8:.+]]=, $pop[[L2]], $pop[[L3]]
+ ; CHECK-NEXT: i32.store $push[[L10:.+]]=, __stack_pointer($pop[[L4]]), $pop[[L8]]{{$}}
+ ; CHECK-NEXT: tee_local $push[[L9:.+]]=, $[[SP:.+]]=, $pop[[L10]]{{$}}
%retval = alloca i32
- ; CHECK: i32.const $push[[L3:.+]]=, 0
- ; CHECK: i32.store {{.*}}=, 12([[SP]]), $pop[[L3]]
+ ; CHECK: i32.const $push[[L0:.+]]=, 0
+ ; CHECK: i32.store {{.*}}=, 12($pop[[L9]]), $pop[[L0]]
store i32 0, i32* %retval
- ; CHECK: i32.const [[L4:.+]]=, 16
- ; CHECK-NEXT: i32.add [[SP]]=, [[SP]], [[L4]]
- ; CHECK-NEXT: i32.const [[L5:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.store [[SP]]=, 0([[L5]]), [[SP]]
+ ; CHECK: i32.const $push[[L6:.+]]=, 0
+ ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 16
+ ; CHECK-NEXT: i32.add $push[[L7:.+]]=, $[[SP]], $pop[[L5]]
+ ; CHECK-NEXT: i32.store $drop=, __stack_pointer($pop[[L6]]), $pop[[L7]]
ret void
}
; CHECK-LABEL: alloca3264:
-; CHECK: .local i32, i32, i32, i32{{$}}
+; CHECK: .local i32{{$}}
define void @alloca3264() {
- ; CHECK: i32.const [[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load [[L1]]=, 0([[L1]])
- ; CHECK-NEXT: i32.const [[L2:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, [[L1]], [[L2]]
+ ; CHECK: i32.const $push[[L2:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.load $push[[L3:.+]]=, __stack_pointer($pop[[L2]])
+ ; CHECK-NEXT: i32.const $push[[L4:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L6:.+]]=, $pop[[L3]], $pop[[L4]]
+ ; CHECK-NEXT: tee_local $push[[L5:.+]]=, $[[SP:.+]]=, $pop[[L6]]
%r1 = alloca i32
%r2 = alloca double
- ; CHECK: i32.const $push[[L3:.+]]=, 0
- ; CHECK: i32.store {{.*}}=, 12([[SP]]), $pop[[L3]]
+ ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0
+ ; CHECK-NEXT: i32.store $drop=, 12($pop[[L5]]), $pop[[L0]]
store i32 0, i32* %r1
- ; CHECK: i64.const $push[[L4:.+]]=, 0
- ; CHECK: i64.store {{.*}}=, 0([[SP]]), $pop[[L4]]
+ ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0
+ ; CHECK-NEXT: i64.store $drop=, 0($[[SP]]), $pop[[L1]]
store double 0.0, double* %r2
- ; CHECK: i32.const [[L4:.+]]=, 16
- ; CHECK-NEXT: i32.add [[SP]]=, [[SP]], [[L4]]
- ; CHECK-NEXT: i32.const [[L5:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.store [[SP]]=, 0([[L5]]), [[SP]]
+ ; CHECK-NEXT: return
ret void
}
; CHECK-LABEL: allocarray:
-; CHECK: .local i32, i32, i32, i32, i32, i32{{$}}
+; CHECK: .local i32{{$}}
define void @allocarray() {
- ; CHECK: i32.const [[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load [[L1]]=, 0([[L1]])
- ; CHECK-NEXT: i32.const [[L2:.+]]=, 32
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, [[L1]], [[L2]]
- %r = alloca [5 x i32]
- ; CHECK: i32.const $push[[L3:.+]]=, 1
- ; CHECK: i32.store {{.*}}=, 12([[SP]]), $pop[[L3]]
- %p = getelementptr [5 x i32], [5 x i32]* %r, i32 0, i32 0
+ ; CHECK: i32.const $push[[L7:.+]]=, 0{{$}}
+ ; CHECK: i32.const $push[[L4:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.load $push[[L5:.+]]=, __stack_pointer($pop[[L4]])
+ ; CHECK-NEXT: i32.const $push[[L6:.+]]=, 144{{$}}
+ ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, $pop[[L5]], $pop[[L6]]
+ ; CHECK-NEXT: i32.store ${{.+}}=, __stack_pointer($pop[[L7]]), $pop[[L11]]
+ %r = alloca [33 x i32]
+
+ ; CHECK: i32.const $push{{.+}}=, 24
+ ; CHECK-NEXT: i32.add $push[[L3:.+]]=, $[[SP]], $pop{{.+}}
+ ; CHECK-NEXT: i32.const $push[[L1:.+]]=, 1{{$}}
+ ; CHECK-NEXT: i32.store $push[[L0:.+]]=, 0($pop[[L3]]), $pop[[L1]]{{$}}
+ ; CHECK-NEXT: i32.store $drop=, 12(${{.+}}), $pop[[L0]]{{$}}
+ %p = getelementptr [33 x i32], [33 x i32]* %r, i32 0, i32 0
store i32 1, i32* %p
- ; CHECK: i32.const $push[[L4:.+]]=, 4
- ; CHECK: i32.const [[L5:.+]]=, 12
- ; CHECK: i32.add [[L5]]=, [[SP]], [[L5]]
- ; CHECK: i32.add $push[[L6:.+]]=, [[L5]], $pop[[L4]]
- ; CHECK: i32.store {{.*}}=, 0($pop[[L6]]), ${{.+}}
- %p2 = getelementptr [5 x i32], [5 x i32]* %r, i32 0, i32 1
+ %p2 = getelementptr [33 x i32], [33 x i32]* %r, i32 0, i32 3
store i32 1, i32* %p2
- ; CHECK: i32.const [[L7:.+]]=, 32
- ; CHECK-NEXT: i32.add [[SP]]=, [[SP]], [[L7]]
- ; CHECK-NEXT: i32.const [[L8:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.store [[SP]]=, 0([[L7]]), [[SP]]
+
+ ; CHECK: i32.const $push[[L10:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.const $push[[L8:.+]]=, 144
+ ; CHECK-NEXT: i32.add $push[[L19:.+]]=, $[[SP]], $pop[[L8]]
+ ; CHECK-NEXT: i32.store $drop=, __stack_pointer($pop[[L10]]), $pop[[L19]]
+ ret void
+}
+
+; CHECK-LABEL: non_mem_use
+define void @non_mem_use(i8** %addr) {
+ ; CHECK: i32.const $push[[L1:.+]]=, 48
+ ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.store $[[SP:.+]]=, {{.+}}, $pop[[L11]]
+ %buf = alloca [27 x i8], align 16
+ %r = alloca i64
+ %r2 = alloca i64
+ ; %r is at SP+8
+ ; CHECK: tee_local $push[[L12:.+]]=, $[[SP:.+]]=, $pop{{.+}}
+ ; CHECK: i32.const $push[[OFF:.+]]=, 8
+ ; CHECK-NEXT: i32.add $push[[ARG1:.+]]=, $pop[[L12]], $pop[[OFF]]
+ ; CHECK-NEXT: call ext_func@FUNCTION, $pop[[ARG1]]
+ call void @ext_func(i64* %r)
+ ; %r2 is at SP+0, no add needed
+ ; CHECK-NEXT: call ext_func@FUNCTION, $[[SP]]
+ call void @ext_func(i64* %r2)
+ ; Use as a value, but in a store
+ ; %buf is at SP+16
+ ; CHECK: i32.const $push[[OFF:.+]]=, 16
+ ; CHECK-NEXT: i32.add $push[[VAL:.+]]=, $[[SP]], $pop[[OFF]]
+ ; CHECK-NEXT: i32.store {{.*}}=, 0($0), $pop[[VAL]]
+ %gep = getelementptr inbounds [27 x i8], [27 x i8]* %buf, i32 0, i32 0
+ store i8* %gep, i8** %addr
ret void
}
+; CHECK-LABEL: allocarray_inbounds:
+; CHECK: .local i32{{$}}
define void @allocarray_inbounds() {
- ; CHECK: i32.const [[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load [[L1]]=, 0([[L1]])
- ; CHECK-NEXT: i32.const [[L2:.+]]=, 32
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, [[L1]], [[L2]]
+ ; CHECK: i32.const $push[[L6:.+]]=, 0{{$}}
+ ; CHECK: i32.const $push[[L3:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.load $push[[L4:.+]]=, __stack_pointer($pop[[L3]])
+ ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 32{{$}}
+ ; CHECK-NEXT: i32.sub $push[[L10:.+]]=, $pop[[L4]], $pop[[L5]]
+ ; CHECK-NEXT: i32.store ${{.+}}=, __stack_pointer($pop[[L6]]), $pop[[L10]]{{$}}
%r = alloca [5 x i32]
; CHECK: i32.const $push[[L3:.+]]=, 1
- ; CHECK: i32.store {{.*}}=, 12([[SP]]), $pop[[L3]]
+ ; CHECK-DAG: i32.store $push{{.*}}=, 24(${{.+}}), $pop[[L3]]
%p = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 0
store i32 1, i32* %p
; This store should have both the GEP and the FI folded into it.
- ; CHECK-NEXT: i32.store {{.*}}=, 16([[SP]]), $pop
- %p2 = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 1
+ ; CHECK-DAG: i32.store {{.*}}=, 12(${{.+}}), $pop
+ %p2 = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 3
store i32 1, i32* %p2
- ; CHECK: i32.const [[L7:.+]]=, 32
- ; CHECK-NEXT: i32.add [[SP]]=, [[SP]], [[L7]]
- ; CHECK-NEXT: i32.const [[L8:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.store [[SP]]=, 0([[L7]]), [[SP]]
+ call void @ext_func(i64* null);
+ ; CHECK: call ext_func
+ ; CHECK: i32.const $push[[L6:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 32{{$}}
+ ; CHECK-NEXT: i32.add $push[[L7:.+]]=, ${{.+}}, $pop[[L5]]
+ ; CHECK-NEXT: i32.store $drop=, __stack_pointer($pop[[L6]]), $pop[[L7]]
ret void
}
+; CHECK-LABEL: dynamic_alloca:
define void @dynamic_alloca(i32 %alloc) {
- ; TODO: Support frame pointers
- ;%r = alloca i32, i32 %alloc
- ;store i32 0, i32* %r
+ ; CHECK: i32.const $push[[L7:.+]]=, 0{{$}}
+ ; CHECK: i32.const $push[[L1:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.load $push[[L13:.+]]=, __stack_pointer($pop[[L1]])
+ ; CHECK-NEXT: tee_local $push[[L12:.+]]=, [[SP:.+]], $pop[[L13]]{{$}}
+ ; Target independent codegen bumps the stack pointer.
+ ; CHECK: i32.sub
+ ; Check that SP is written back to memory after decrement
+ ; CHECK: i32.store $drop=, __stack_pointer($pop{{.+}}),
+ %r = alloca i32, i32 %alloc
+ ; Target-independent codegen also calculates the store addr
+ ; CHECK: call ext_func_i32@FUNCTION
+ call void @ext_func_i32(i32* %r)
+ ; CHECK: i32.const $push[[L3:.+]]=, 0{{$}}
+ ; CHECK: i32.store $drop=, __stack_pointer($pop[[L3]]), $pop{{.+}}
+ ret void
+}
+
+; CHECK-LABEL: dynamic_alloca_redzone:
+define void @dynamic_alloca_redzone(i32 %alloc) {
+ ; CHECK: i32.const $push[[L8:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.load $push[[L13:.+]]=, __stack_pointer($pop[[L1]])
+ ; CHECK-NEXT: tee_local $push[[L12:.+]]=, [[SP:.+]], $pop[[L13]]{{$}}
+ ; CHECK-NEXT: copy_local [[FP:.+]]=, $pop[[L12]]{{$}}
+ ; Target independent codegen bumps the stack pointer
+ ; CHECK: i32.sub
+ %r = alloca i32, i32 %alloc
+ ; CHECK-NEXT: tee_local $push[[L8:.+]]=, $0=, $pop
+ ; CHECK-NEXT: copy_local $drop=, $pop[[L8]]{{$}}
+ ; CHECK-NEXT: i32.const $push[[L6:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.store $drop=, 0($0), $pop[[L6]]{{$}}
+ store i32 0, i32* %r
+ ; CHECK-NEXT: return
+ ret void
+}
+
+; CHECK-LABEL: dynamic_static_alloca:
+define void @dynamic_static_alloca(i32 %alloc) noredzone {
+ ; Decrement SP in the prolog by the static amount and write it back to memory.
+ ; CHECK: i32.const $push[[L7:.+]]=, 0{{$}}
+ ; CHECK: i32.const $push[[L8:.+]]=, 0{{$}}
+ ; CHECK: i32.const $push[[L9:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.load $push[[L10:.+]]=, __stack_pointer($pop[[L9]])
+ ; CHECK-NEXT: i32.const $push[[L11:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L20:.+]]=, $pop[[L10]], $pop[[L11]]
+ ; CHECK-NEXT: tee_local $push[[L19:.+]]=, $[[FP:.+]]=, $pop[[L20]]
+ ; CHECK: i32.store $push[[L0:.+]]=, __stack_pointer($pop{{.+}}), $pop{{.+}}
+ ; Decrement SP in the body by the dynamic amount.
+ ; CHECK: i32.sub
+ ; Write it back to memory.
+ ; CHECK: i32.store $drop=, __stack_pointer($pop{{.+}}), $pop{{.+}}
+ %r1 = alloca i32
+ %r = alloca i32, i32 %alloc
+ store i32 0, i32* %r
+ ; CHECK: i32.store $drop=, 0($pop{{.+}}), $pop{{.+}}
ret void
}
-; TODO: test aligned alloc
+
+; The use of the alloca in a phi causes a CopyToReg DAG node to be generated,
+; which has to have special handling because CopyToReg can't have a FI operand
+; CHECK-LABEL: copytoreg_fi:
+define void @copytoreg_fi(i1 %cond, i32* %b) {
+entry:
+ ; CHECK: i32.const $push[[L1:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L3:.+]]=, {{.+}}, $pop[[L1]]
+ %addr = alloca i32
+ ; CHECK: i32.const $push[[OFF:.+]]=, 12
+ ; CHECK-NEXT: i32.add $push[[ADDR:.+]]=, $pop[[L3]], $pop[[OFF]]
+ ; CHECK-NEXT: copy_local [[COPY:.+]]=, $pop[[ADDR]]
+ br label %body
+body:
+ %a = phi i32* [%addr, %entry], [%b, %body]
+ store i32 1, i32* %a
+ ; CHECK: i32.store {{.*}}, 0([[COPY]]),
+ br i1 %cond, label %body, label %exit
+exit:
+ ret void
+}
+
+declare void @use_i8_star(i8*)
+declare i8* @llvm.frameaddress(i32)
+
+; Test __builtin_frame_address(0).
+; CHECK-LABEL: frameaddress_0:
+; CHECK: i32.const $push[[L0:.+]]=, 0{{$}}
+; CHECK-NEXT: i32.load $push[[L3:.+]]=, __stack_pointer($pop[[L0]])
+; CHECK-NEXT: copy_local $push[[L4:.+]]=, $pop[[L3]]{{$}}
+; CHECK-NEXT: tee_local $push[[L2:.+]]=, $[[FP:.+]]=, $pop[[L4]]{{$}}
+; CHECK-NEXT: call use_i8_star@FUNCTION, $pop[[L2]]
+; CHECK-NEXT: i32.const $push[[L1:.+]]=, 0{{$}}
+; CHECK-NEXT: i32.store $drop=, __stack_pointer($pop[[L1]]), $[[FP]]
+define void @frameaddress_0() {
+ %t = call i8* @llvm.frameaddress(i32 0)
+ call void @use_i8_star(i8* %t)
+ ret void
+}
+
+; Test __builtin_frame_address(1).
+
+; CHECK-LABEL: frameaddress_1:
+; CHECK-NEXT: i32.const $push0=, 0{{$}}
+; CHECK-NEXT: call use_i8_star@FUNCTION, $pop0{{$}}
+; CHECK-NEXT: return{{$}}
+define void @frameaddress_1() {
+ %t = call i8* @llvm.frameaddress(i32 1)
+ call void @use_i8_star(i8* %t)
+ ret void
+}
+
+; Test a stack address passed to an inline asm.
+; CHECK-LABEL: inline_asm:
+; CHECK: __stack_pointer
+; CHECK: #APP
+; CHECK-NEXT: # %{{[0-9]+}}{{$}}
+; CHECK-NEXT: #NO_APP
+define void @inline_asm() {
+ %tmp = alloca i8
+ call void asm sideeffect "# %0", "r"(i8* %tmp)
+ ret void
+}
+
+; TODO: test over-aligned alloca
diff --git a/test/CodeGen/WebAssembly/varargs.ll b/test/CodeGen/WebAssembly/varargs.ll
index c12264625c37..483d452624a8 100644
--- a/test/CodeGen/WebAssembly/varargs.ll
+++ b/test/CodeGen/WebAssembly/varargs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
; Test varargs constructs.
@@ -8,13 +8,17 @@ target triple = "wasm32-unknown-unknown"
; Test va_start.
; TODO: Test va_start.
-
-;define void @start(i8** %ap, ...) {
-;entry:
-; %0 = bitcast i8** %ap to i8*
-; call void @llvm.va_start(i8* %0)
-; ret void
-;}
+; CHECK-LABEL: start:
+; CHECK-NEXT: .param i32, i32
+; CHECK-NOT: __stack_pointer
+define void @start(i8** %ap, ...) {
+entry:
+ %0 = bitcast i8** %ap to i8*
+; Store the second argument (the hidden vararg buffer pointer) into ap
+; CHECK: i32.store $drop=, 0($0), $1
+ call void @llvm.va_start(i8* %0)
+ ret void
+}
; Test va_end.
@@ -33,7 +37,7 @@ entry:
; CHECK-LABEL: copy:
; CHECK-NEXT: .param i32, i32{{$}}
; CHECK-NEXT: i32.load $push0=, 0($1){{$}}
-; CHECK-NEXT: i32.store $discard=, 0($0), $pop0{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0), $pop0{{$}}
; CHECK-NEXT: return{{$}}
define void @copy(i8** %ap, i8** %bp) {
entry:
@@ -49,12 +53,13 @@ entry:
; CHECK-NEXT: .param i32{{$}}
; CHECK-NEXT: .result i32{{$}}
; CHECK-NEXT: .local i32{{$}}
-; CHECK-NEXT: i32.load $1=, 0($0){{$}}
-; CHECK-NEXT: i32.const $push0=, 4{{$}}
-; CHECK-NEXT: i32.add $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.store $discard=, 0($0), $pop1{{$}}
-; CHECK-NEXT: i32.load $push2=, 0($1){{$}}
-; CHECK-NEXT: return $pop2{{$}}
+; CHECK-NEXT: i32.load $push[[NUM0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: tee_local $push[[NUM1:[0-9]+]]=, $1=, $pop[[NUM0]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM2:[0-9]+]]=, 4{{$}}
+; CHECK-NEXT: i32.add $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0), $pop[[NUM3]]{{$}}
+; CHECK-NEXT: i32.load $push[[NUM4:[0-9]+]]=, 0($1){{$}}
+; CHECK-NEXT: return $pop[[NUM4]]{{$}}
define i8 @arg_i8(i8** %ap) {
entry:
%t = va_arg i8** %ap, i8
@@ -67,16 +72,17 @@ entry:
; CHECK-NEXT: .param i32{{$}}
; CHECK-NEXT: .result i32{{$}}
; CHECK-NEXT: .local i32{{$}}
-; CHECK-NEXT: i32.load $push0=, 0($0){{$}}
-; CHECK-NEXT: i32.const $push1=, 3{{$}}
-; CHECK-NEXT: i32.add $push2=, $pop0, $pop1{{$}}
-; CHECK-NEXT: i32.const $push3=, -4{{$}}
-; CHECK-NEXT: i32.and $1=, $pop2, $pop3{{$}}
-; CHECK-NEXT: i32.const $push4=, 4{{$}}
-; CHECK-NEXT: i32.add $push5=, $1, $pop4{{$}}
-; CHECK-NEXT: i32.store $discard=, 0($0), $pop5{{$}}
-; CHECK-NEXT: i32.load $push6=, 0($1){{$}}
-; CHECK-NEXT: return $pop6{{$}}
+; CHECK-NEXT: i32.load $push[[NUM0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i32.const $push[[NUM1:[0-9]+]]=, 3{{$}}
+; CHECK-NEXT: i32.add $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM3:[0-9]+]]=, -4{{$}}
+; CHECK-NEXT: i32.and $push[[NUM4:[0-9]+]]=, $pop[[NUM2]], $pop[[NUM3]]{{$}}
+; CHECK-NEXT: tee_local $push[[NUM5:[0-9]+]]=, $1=, $pop[[NUM4]]{{$}}
+; CHECK-NEXT: i32.const $push[[NUM6:[0-9]+]]=, 4{{$}}
+; CHECK-NEXT: i32.add $push[[NUM7:[0-9]+]]=, $pop[[NUM5]], $pop[[NUM6]]{{$}}
+; CHECK-NEXT: i32.store $drop=, 0($0), $pop[[NUM7]]{{$}}
+; CHECK-NEXT: i32.load $push[[NUM8:[0-9]+]]=, 0($1){{$}}
+; CHECK-NEXT: return $pop[[NUM8]]{{$}}
define i32 @arg_i32(i8** %ap) {
entry:
%t = va_arg i8** %ap, i32
@@ -103,21 +109,44 @@ entry:
declare void @callee(...)
; CHECK-LABEL: caller_none:
-; CHECK-NEXT: call callee@FUNCTION{{$}}
+; CHECK-NEXT: i32.const $push0=, 0
+; CHECK-NEXT: call callee@FUNCTION, $pop0
; CHECK-NEXT: return{{$}}
define void @caller_none() {
call void (...) @callee()
ret void
}
+; Test a varargs call with some actual arguments.
+; Note that the store of 2.0 is converted to an i64 store; this optimization
+; is not needed on WebAssembly, but there isn't currently a convenient hook for
+; disabling it.
+
; CHECK-LABEL: caller_some
+; CHECK: i32.store
+; CHECK: i64.store
define void @caller_some() {
- ; TODO: Fix interaction between register coalescer and reg stackifier,
- ; or disable coalescer.
- ;call void (...) @callee(i32 0, double 2.0)
+ call void (...) @callee(i32 0, double 2.0)
ret void
}
+; Test a va_start call in a non-entry block
+; CHECK-LABEL: startbb:
+; CHECK: .param i32, i32, i32
+define void @startbb(i1 %cond, i8** %ap, ...) {
+entry:
+ br i1 %cond, label %bb0, label %bb1
+bb0:
+ ret void
+bb1:
+ %0 = bitcast i8** %ap to i8*
+; Store the second argument (the hidden vararg buffer pointer) into ap
+; CHECK: i32.store $drop=, 0($1), $2
+ call void @llvm.va_start(i8* %0)
+ ret void
+}
+
+
declare void @llvm.va_start(i8*)
declare void @llvm.va_end(i8*)
declare void @llvm.va_copy(i8*, i8*)
diff --git a/test/CodeGen/WinEH/wineh-asm.ll b/test/CodeGen/WinEH/wineh-asm.ll
new file mode 100644
index 000000000000..ed99411810e7
--- /dev/null
+++ b/test/CodeGen/WinEH/wineh-asm.ll
@@ -0,0 +1,26 @@
+; RUN: opt -winehprepare < %s
+
+target triple = "x86_64-pc-windows-msvc"
+
+define void @test1() personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ invoke void @f(i32 1)
+ to label %exit unwind label %cleanup
+
+cleanup:
+ %cp = cleanuppad within none []
+ call void asm sideeffect "", ""()
+ cleanupret from %cp unwind to caller
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: define void @test1(
+; CHECK: %[[cp:.*]] = cleanuppad within none []
+; CHECK-NEXT: call void asm sideeffect "", ""()
+; CHECK-NEXT: cleanupret from %[[cp]] unwind to caller
+
+declare void @f(i32)
+
+declare i32 @__CxxFrameHandler3(...)
diff --git a/test/CodeGen/WinEH/wineh-cloning.ll b/test/CodeGen/WinEH/wineh-cloning.ll
index 748c07df1730..86984c7b5db8 100644
--- a/test/CodeGen/WinEH/wineh-cloning.ll
+++ b/test/CodeGen/WinEH/wineh-cloning.ll
@@ -383,11 +383,10 @@ exit:
!llvm.dbg.cu = !{!1}
!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "compiler", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !3, subprograms: !4)
+!1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "compiler", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !3)
!2 = !DIFile(filename: "test.cpp", directory: ".")
!3 = !{}
-!4 = !{!5}
-!5 = distinct !DISubprogram(name: "test12", scope: !2, file: !2, type: !6, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, variables: !3)
+!5 = distinct !DISubprogram(name: "test12", scope: !2, file: !2, type: !6, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !1, variables: !3)
!6 = !DISubroutineType(types: !7)
!7 = !{null}
!8 = !DILocation(line: 1, scope: !5)
diff --git a/test/CodeGen/WinEH/wineh-nested-unwind.ll b/test/CodeGen/WinEH/wineh-nested-unwind.ll
new file mode 100644
index 000000000000..b568be521607
--- /dev/null
+++ b/test/CodeGen/WinEH/wineh-nested-unwind.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc18.0.0"
+
+; Function Attrs: uwtable
+define void @f() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+ invoke void @g()
+ to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %entry
+ %0 = catchswitch within none [label %catch] unwind label %ehcleanup
+
+catch: ; preds = %catch.dispatch
+ %1 = catchpad within %0 [i8* null, i32 64, i8* null]
+ invoke void @g() [ "funclet"(token %1) ]
+ to label %dtor.exit unwind label %catch.dispatch.i
+
+catch.dispatch.i: ; preds = %catch
+ %2 = catchswitch within %1 [label %catch.i] unwind to caller
+
+catch.i: ; preds = %catch.dispatch.i
+ %3 = catchpad within %2 [i8* null, i32 64, i8* null]
+ catchret from %3 to label %dtor.exit
+
+dtor.exit:
+ catchret from %1 to label %try.cont
+
+try.cont:
+ ret void
+
+ehcleanup: ; preds = %catch.dispatch
+ %4 = cleanuppad within none []
+ call void @dtor() #1 [ "funclet"(token %4) ]
+ cleanupret from %4 unwind to caller
+}
+
+declare void @g()
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind
+declare void @dtor() #1
+
+attributes #0 = { uwtable }
+attributes #1 = { nounwind }
+
+; CHECK-LABEL: $ip2state$f:
+; CHECK: -1
+; CHECK: 1
+; CHECK: -1
+; CHECK: 4
+; CHECK: 2
+; CHECK: 3
+; CHECK: 2
diff --git a/test/CodeGen/WinEH/wineh-setjmp.ll b/test/CodeGen/WinEH/wineh-setjmp.ll
new file mode 100644
index 000000000000..bf459d98bdea
--- /dev/null
+++ b/test/CodeGen/WinEH/wineh-setjmp.ll
@@ -0,0 +1,75 @@
+; RUN: opt -mtriple=i686-pc-windows-msvc -S -x86-winehstate < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+@jb = external global i8
+
+define i32 @test1() personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+; CHECK-LABEL: define i32 @test1(
+; CHECK: %[[eh_reg:.*]] = alloca
+; CHECK: %[[gep:.*]] = getelementptr inbounds {{.*}}, {{.*}} %[[eh_reg]], i32 0, i32 2
+; CHECK: store i32 -1, i32* %[[gep]]
+; CHECK: %[[gep:.*]] = getelementptr inbounds {{.*}}, {{.*}} %[[eh_reg]], i32 0, i32 2
+; CHECK: store i32 0, i32* %[[gep]]
+; CHECK: %[[lsda:.*]] = call i8* @llvm.x86.seh.lsda(i8* bitcast (i32 ()* @test1 to i8*))
+; CHECK: invoke i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 3, void (i8*)* @__CxxLongjmpUnwind, i32 0, i8* %[[lsda]])
+ %inv = invoke i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 0) #2
+ to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:
+; CHECK: %[[gep:.*]] = getelementptr inbounds {{.*}}, {{.*}} %[[eh_reg]], i32 0, i32 2
+; CHECK: store i32 -1, i32* %[[gep]]
+; CHECK: %[[lsda:.*]] = call i8* @llvm.x86.seh.lsda(i8* bitcast (i32 ()* @test1 to i8*))
+; CHECK: call i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 3, void (i8*)* @__CxxLongjmpUnwind, i32 -1, i8* %[[lsda]])
+ call i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 0)
+ call void @cleanup()
+ ret i32 %inv
+
+ehcleanup:
+ %cp = cleanuppad within none []
+; CHECK: %[[gep:.*]] = getelementptr inbounds {{.*}}, {{.*}} %[[eh_reg]], i32 0, i32 2
+; CHECK: %[[load:.*]] = load i32, i32* %[[gep]]
+; CHECK: %[[lsda:.*]] = call i8* @llvm.x86.seh.lsda(i8* bitcast (i32 ()* @test1 to i8*))
+; CHECK: call i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 3, void (i8*)* @__CxxLongjmpUnwind, i32 %[[load]], i8* %[[lsda]]) [ "funclet"(token %cp) ]
+ %cal = call i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 0) [ "funclet"(token %cp) ]
+ call void @cleanup() [ "funclet"(token %cp) ]
+ cleanupret from %cp unwind to caller
+}
+
+define i32 @test2() personality i32 (...)* @_except_handler3 {
+entry:
+; CHECK-LABEL: define i32 @test2(
+; CHECK: %[[eh_reg:.*]] = alloca
+; CHECK: %[[gep:.*]] = getelementptr inbounds {{.*}}, {{.*}} %[[eh_reg]], i32 0, i32 4
+; CHECK: store i32 -1, i32* %[[gep]]
+; CHECK: %[[gep:.*]] = getelementptr inbounds {{.*}}, {{.*}} %[[eh_reg]], i32 0, i32 4
+; CHECK: store i32 0, i32* %[[gep]]
+; CHECK: invoke i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 2, void (i8*)* @_seh_longjmp_unwind, i32 0)
+ %inv = invoke i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 0) #2
+ to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:
+; CHECK: %[[gep:.*]] = getelementptr inbounds {{.*}}, {{.*}} %[[eh_reg]], i32 0, i32 4
+; CHECK: store i32 -1, i32* %[[gep]]
+; CHECK: call i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 2, void (i8*)* @_seh_longjmp_unwind, i32 -1)
+ call i32 (i8*, i32, ...) @_setjmp3(i8* @jb, i32 0)
+ call void @cleanup()
+ ret i32 %inv
+
+ehcleanup:
+ %cp = cleanuppad within none []
+ call void @cleanup() [ "funclet"(token %cp) ]
+ cleanupret from %cp unwind to caller
+}
+
+; Function Attrs: returns_twice
+declare i32 @_setjmp3(i8*, i32, ...) #2
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare i32 @_except_handler3(...)
+
+declare void @cleanup()
+
+attributes #2 = { returns_twice }
diff --git a/test/CodeGen/WinEH/wineh-statenumbering-cleanups.ll b/test/CodeGen/WinEH/wineh-statenumbering-cleanups.ll
index f5889f03965b..a18e412255e3 100644
--- a/test/CodeGen/WinEH/wineh-statenumbering-cleanups.ll
+++ b/test/CodeGen/WinEH/wineh-statenumbering-cleanups.ll
@@ -44,7 +44,6 @@ entry:
to label %exit unwind label %cleanup.pad
cleanup.pad:
; CHECK: cleanup.pad:
- ; CHECK: store i32 1
; CHECK: invoke void @f(i32 0)
%cleanup = cleanuppad within none []
invoke void @f(i32 0)
diff --git a/test/CodeGen/WinEH/wineh-statenumbering.ll b/test/CodeGen/WinEH/wineh-statenumbering.ll
index 4e7c36943a01..d5c330bdf4ee 100644
--- a/test/CodeGen/WinEH/wineh-statenumbering.ll
+++ b/test/CodeGen/WinEH/wineh-statenumbering.ll
@@ -28,7 +28,11 @@ entry:
; CHECK: entry:
; CHECK: store i32 -1
; CHECK: call void @g(i32 3)
+ ; CHECK-NEXT: call void @g(i32 4)
+ ; CHECK-NEXT: call void @g(i32 5)
call void @g(i32 3)
+ call void @g(i32 4)
+ call void @g(i32 5)
store i32 0, i32* %tmp, align 4
%0 = bitcast i32* %tmp to i8*
; CHECK: store i32 0
@@ -54,14 +58,22 @@ catch.3: ; preds = %catch.dispatch.1
; CHECK: catch.3:
; CHECK: store i32 3
; CHECK: call void @g(i32 1)
+ ; CHECK-NEXT: call void @g(i32 2)
+ ; CHECK-NEXT: call void @g(i32 3)
call void @g(i32 1)
+ call void @g(i32 2)
+ call void @g(i32 3)
catchret from %2 to label %try.cont
try.cont: ; preds = %catch.3
; CHECK: try.cont:
; CHECK: store i32 1
; CHECK: call void @g(i32 2)
+ ; CHECK-NEXT: call void @g(i32 3)
+ ; CHECK-NEXT: call void @g(i32 4)
call void @g(i32 2)
+ call void @g(i32 3)
+ call void @g(i32 4)
unreachable
unreachable: ; preds = %catch
@@ -111,6 +123,10 @@ try.cont: ; preds = %catch2
; CHECK: try.cont:
; CHECK: store i32 1
; CHECK: call void @dtor()
+ ; CHECK-NEXT: call void @dtor()
+ ; CHECK-NEXT: call void @dtor()
+ call void @dtor() #3 [ "funclet"(token %1) ]
+ call void @dtor() #3 [ "funclet"(token %1) ]
call void @dtor() #3 [ "funclet"(token %1) ]
catchret from %1 to label %try.cont4
@@ -120,7 +136,6 @@ try.cont4: ; preds = %try.cont
ehcleanup: ; preds = %catch.dispatch1
%4 = cleanuppad within %1 []
; CHECK: ehcleanup:
- ; CHECK: store i32 -1
; CHECK: call void @dtor()
call void @dtor() #3 [ "funclet"(token %4) ]
cleanupret from %4 unwind to caller
@@ -132,6 +147,52 @@ unreachable1: ; preds = %catch
unreachable
}
+; CHECK-LABEL: define void @required_state_store(
+define void @required_state_store(i1 zeroext %cond) personality i32 (...)* @_except_handler3 {
+entry:
+ %__exception_code = alloca i32, align 4
+ call void (...) @llvm.localescape(i32* nonnull %__exception_code)
+; CHECK: store i32 -1
+; CHECK: call void @g(i32 0)
+ call void @g(i32 0)
+ br i1 %cond, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+; CHECK: store i32 0
+; CHECK-NEXT: invoke void @g(i32 1)
+ invoke void @g(i32 1)
+ to label %if.end unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %if.then
+ %0 = catchswitch within none [label %__except.ret] unwind to caller
+
+__except.ret: ; preds = %catch.dispatch
+ %1 = catchpad within %0 [i8* bitcast (i32 ()* @"\01?filt$0@0@required_state_store@@" to i8*)]
+ catchret from %1 to label %if.end
+
+if.end: ; preds = %if.then, %__except.ret, %entry
+; CHECK: store i32 -1
+; CHECK-NEXT: call void @dtor()
+ call void @dtor()
+ ret void
+}
+
+define internal i32 @"\01?filt$0@0@required_state_store@@"() {
+entry:
+ %0 = tail call i8* @llvm.frameaddress(i32 1)
+ %1 = tail call i8* @llvm.x86.seh.recoverfp(i8* bitcast (void (i1)* @required_state_store to i8*), i8* %0)
+ %2 = tail call i8* @llvm.localrecover(i8* bitcast (void (i1)* @required_state_store to i8*), i8* %1, i32 0)
+ %__exception_code = bitcast i8* %2 to i32*
+ %3 = getelementptr inbounds i8, i8* %0, i32 -20
+ %4 = bitcast i8* %3 to { i32*, i8* }**
+ %5 = load { i32*, i8* }*, { i32*, i8* }** %4, align 4
+ %6 = getelementptr inbounds { i32*, i8* }, { i32*, i8* }* %5, i32 0, i32 0
+ %7 = load i32*, i32** %6, align 4
+ %8 = load i32, i32* %7, align 4
+ store i32 %8, i32* %__exception_code, align 4
+ ret i32 1
+}
+
declare void @g(i32) #0
declare void @dtor()
@@ -140,6 +201,16 @@ declare x86_stdcallcc void @_CxxThrowException(i8*, %eh.ThrowInfo*)
declare i32 @__CxxFrameHandler3(...)
+declare i8* @llvm.frameaddress(i32)
+
+declare i8* @llvm.x86.seh.recoverfp(i8*, i8*)
+
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+
+declare void @llvm.localescape(...)
+
+declare i32 @_except_handler3(...)
+
attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noreturn }
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
index 46c5e88955f4..acd32e49e60d 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched1.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
-; RUN: grep asm-printer | grep 16
+; RUN: llc < %s -mtriple=i686-unknown-linux -relocation-model=static -stats 2>&1 | \
+; RUN: grep asm-printer | grep 14
;
; It's possible to schedule this in 14 instructions by avoiding
; callee-save registers, but the scheduler isn't currently that
diff --git a/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
index b6a8fc0bb2f8..9e1bf9edbbc4 100644
--- a/test/CodeGen/X86/2006-11-12-CSRetCC.ll
+++ b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -6,7 +6,14 @@ target triple = "i686-pc-linux-gnu"
define i32 @main() {
; CHECK-LABEL: main:
; CHECK-NOT: ret
-; CHECK: subl $4, %{{.*}}
+; CHECK: subl $12, %esp
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: calll cexp
+; CHECK: addl $28, %esp
; CHECK: ret
entry:
diff --git a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
index aa0ee5d07462..85a144083ece 100644
--- a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
+++ b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
@@ -2,9 +2,10 @@
@X = global i32 0 ; <i32*> [#uses=1]
-define signext i8 @_Z3fooi(i32 %x) {
+define i32 @_Z3fooi(i32 %x) {
entry:
store i32 %x, i32* @X, align 4
%retval67 = trunc i32 %x to i8 ; <i8> [#uses=1]
- ret i8 %retval67
+ %retval = sext i8 %retval67 to i32
+ ret i32 %retval
}
diff --git a/test/CodeGen/X86/2007-08-13-AppendingLinkage.ll b/test/CodeGen/X86/2007-08-13-AppendingLinkage.ll
deleted file mode 100644
index e08a5c493b5c..000000000000
--- a/test/CodeGen/X86/2007-08-13-AppendingLinkage.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86 | not grep drectve
-; PR1607
-
-%hlvm_programs_element = type { i8*, i32 (i32, i8**)* }
-@hlvm_programs = appending constant [1 x %hlvm_programs_element]
-zeroinitializer
-
-define %hlvm_programs_element* @hlvm_get_programs() {
-entry:
- ret %hlvm_programs_element* getelementptr([1 x %hlvm_programs_element], [1 x %hlvm_programs_element]*
- @hlvm_programs, i32 0, i32 0)
-}
diff --git a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
index c6eb6f0f0d7a..65e5ed762135 100644
--- a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
@@ -224,7 +224,7 @@ declare void @fancy_abort(i8*, i32, i8*)
declare i8* @pool_alloc(%struct.alloc_pool_def*)
-declare void @llvm.memset.i64(i8*, i8, i64, i32)
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
declare void @link_block(%struct.basic_block_def*, %struct.basic_block_def*)
diff --git a/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll b/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
index de95e7925f08..581fae269021 100644
--- a/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
+++ b/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -fixup-byte-word-insts=0 | FileCheck %s -check-prefix=CHECK -check-prefix=BWOFF
+; RUN: llc < %s -march=x86 -fixup-byte-word-insts=1 | FileCheck %s -check-prefix=CHECK -check-prefix=BWON
 ; These transforms are turned off for volatile loads and stores.
; Check that they weren't turned off for all loads and stores!
; CHECK-LABEL: f:
; CHECK-NOT: movsd
-; CHECK: movw
+; BWOFF: movw
+; BWON: movzwl
; CHECK: addw
@atomic = global double 0.000000e+00 ; <double*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-07-19-movups-spills.ll b/test/CodeGen/X86/2008-07-19-movups-spills.ll
index 45ea69943e87..052cf97fefe9 100644
--- a/test/CodeGen/X86/2008-07-19-movups-spills.ll
+++ b/test/CodeGen/X86/2008-07-19-movups-spills.ll
@@ -4,38 +4,38 @@
; Verify that movups is still generated with an aligned stack for the globals
; that must be accessed unaligned
-external global <4 x float>, align 1 ; <<4 x float>*>:0 [#uses=2]
-external global <4 x float>, align 1 ; <<4 x float>*>:1 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:2 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:3 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:4 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:5 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:6 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:7 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:8 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:9 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:10 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:11 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:12 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:13 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:14 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:15 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:16 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:17 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:18 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:19 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:20 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:21 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:22 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:23 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:24 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:25 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:26 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:27 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:28 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:29 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:30 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:31 [#uses=1]
+@0 = external global <4 x float>, align 1 ; <<4 x float>*>:0 [#uses=2]
+@1 = external global <4 x float>, align 1 ; <<4 x float>*>:1 [#uses=1]
+@2 = external global <4 x float>, align 1 ; <<4 x float>*>:2 [#uses=1]
+@3 = external global <4 x float>, align 1 ; <<4 x float>*>:3 [#uses=1]
+@4 = external global <4 x float>, align 1 ; <<4 x float>*>:4 [#uses=1]
+@5 = external global <4 x float>, align 1 ; <<4 x float>*>:5 [#uses=1]
+@6 = external global <4 x float>, align 1 ; <<4 x float>*>:6 [#uses=1]
+@7 = external global <4 x float>, align 1 ; <<4 x float>*>:7 [#uses=1]
+@8 = external global <4 x float>, align 1 ; <<4 x float>*>:8 [#uses=1]
+@9 = external global <4 x float>, align 1 ; <<4 x float>*>:9 [#uses=1]
+@10 = external global <4 x float>, align 1 ; <<4 x float>*>:10 [#uses=1]
+@11 = external global <4 x float>, align 1 ; <<4 x float>*>:11 [#uses=1]
+@12 = external global <4 x float>, align 1 ; <<4 x float>*>:12 [#uses=1]
+@13 = external global <4 x float>, align 1 ; <<4 x float>*>:13 [#uses=1]
+@14 = external global <4 x float>, align 1 ; <<4 x float>*>:14 [#uses=1]
+@15 = external global <4 x float>, align 1 ; <<4 x float>*>:15 [#uses=1]
+@16 = external global <4 x float>, align 1 ; <<4 x float>*>:16 [#uses=1]
+@17 = external global <4 x float>, align 1 ; <<4 x float>*>:17 [#uses=1]
+@18 = external global <4 x float>, align 1 ; <<4 x float>*>:18 [#uses=1]
+@19 = external global <4 x float>, align 1 ; <<4 x float>*>:19 [#uses=1]
+@20 = external global <4 x float>, align 1 ; <<4 x float>*>:20 [#uses=1]
+@21 = external global <4 x float>, align 1 ; <<4 x float>*>:21 [#uses=1]
+@22 = external global <4 x float>, align 1 ; <<4 x float>*>:22 [#uses=1]
+@23 = external global <4 x float>, align 1 ; <<4 x float>*>:23 [#uses=1]
+@24 = external global <4 x float>, align 1 ; <<4 x float>*>:24 [#uses=1]
+@25 = external global <4 x float>, align 1 ; <<4 x float>*>:25 [#uses=1]
+@26 = external global <4 x float>, align 1 ; <<4 x float>*>:26 [#uses=1]
+@27 = external global <4 x float>, align 1 ; <<4 x float>*>:27 [#uses=1]
+@28 = external global <4 x float>, align 1 ; <<4 x float>*>:28 [#uses=1]
+@29 = external global <4 x float>, align 1 ; <<4 x float>*>:29 [#uses=1]
+@30 = external global <4 x float>, align 1 ; <<4 x float>*>:30 [#uses=1]
+@31 = external global <4 x float>, align 1 ; <<4 x float>*>:31 [#uses=1]
declare void @abort()
diff --git a/test/CodeGen/X86/2008-07-22-CombinerCrash.ll b/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
index 35bb5f054282..719baf5cc945 100644
--- a/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
+++ b/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -march=x86 -mattr=+sse2
; PR2566
-external global i16 ; <i16*>:0 [#uses=1]
-external global <4 x i16> ; <<4 x i16>*>:1 [#uses=1]
+@0 = external global i16 ; <i16*>:0 [#uses=1]
+@1 = external global <4 x i16> ; <<4 x i16>*>:1 [#uses=1]
declare void @abort()
diff --git a/test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll b/test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll
index 32f6ca0ce086..907f4cc4ca3f 100644
--- a/test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll
+++ b/test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movzbl
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep xorl
define i32 @foo(<4 x float> %a, <4 x float> %b) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-08-19-SubAndFetch.ll b/test/CodeGen/X86/2008-08-19-SubAndFetch.ll
deleted file mode 100644
index 9324d5dfa3bb..000000000000
--- a/test/CodeGen/X86/2008-08-19-SubAndFetch.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-
-@var = external global i64 ; <i64*> [#uses=1]
-
-define i32 @main() nounwind {
-entry:
-; CHECK-LABEL: main:
-; CHECK: lock
-; CHECK: decq
- atomicrmw sub i64* @var, i64 1 monotonic
- unreachable
-}
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
index 757dff4230fc..a9875521fb18 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86
-; RUN: llc -pre-RA-sched=source < %s -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix=SOURCE-SCHED
+; RUN: llc -pre-RA-sched=source < %s -mtriple=i686-unknown-linux -mcpu=corei7 | FileCheck %s --check-prefix=SOURCE-SCHED
; PR2748
@g_73 = external global i32 ; <i32*> [#uses=1]
@@ -10,9 +10,9 @@ entry:
; SOURCE-SCHED: subl
; SOURCE-SCHED: movl
; SOURCE-SCHED: sarl
+; SOURCE-SCHED: xorl
; SOURCE-SCHED: cmpl
; SOURCE-SCHED: setg
-; SOURCE-SCHED: movzbl
; SOURCE-SCHED: movb
; SOURCE-SCHED: xorl
; SOURCE-SCHED: subl
diff --git a/test/CodeGen/X86/2008-09-29-ReMatBug.ll b/test/CodeGen/X86/2008-09-29-ReMatBug.ll
index 754fd8f0ab64..cc481a056c84 100644
--- a/test/CodeGen/X86/2008-09-29-ReMatBug.ll
+++ b/test/CodeGen/X86/2008-09-29-ReMatBug.ll
@@ -5,7 +5,7 @@
%struct.XCStringList = type { i32, %struct._XCStringListNode* }
%struct._XCStringListNode = type { [3 x i8], [0 x i8], i8 }
%struct.__builtin_CFString = type { i32*, i32, i8*, i32 }
-internal constant %struct.__builtin_CFString { i32* getelementptr ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr ([3 x i8], [3 x i8]* @"\01LC", i32 0, i32 0), i32 2 } ; <%struct.__builtin_CFString*>:0 [#uses=1]
+@0 = internal constant %struct.__builtin_CFString { i32* getelementptr ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr ([3 x i8], [3 x i8]* @"\01LC", i32 0, i32 0), i32 2 } ; <%struct.__builtin_CFString*>:0 [#uses=1]
@__CFConstantStringClassReference = external global [0 x i32] ; <[0 x i32]*> [#uses=1]
@"\01LC" = internal constant [3 x i8] c"NO\00" ; <[3 x i8]*> [#uses=1]
@"\01LC1" = internal constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 2abb5ba7cd52..8edaf3f1fa34 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s
-; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -verify-machineinstrs | FileCheck %s
; PR3538
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9"
@@ -12,7 +12,7 @@ define signext i8 @foo(i8* %s1) nounwind ssp {
; movq %rax, %rsp
; CHECK-LABEL: @foo
-; CHECK: movq -40(%rbp), %rsp
+; CHECK: movq -{{[0-9]+}}(%rbp), %rsp
entry:
%s1_addr = alloca i8* ; <i8**> [#uses=2]
@@ -76,9 +76,10 @@ declare i64 @strlen(i8*) nounwind readonly
declare void @llvm.stackrestore(i8*) nounwind
+!llvm.dbg.cu = !{!2}
!0 = !DILocalVariable(name: "s1", line: 2, arg: 1, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !2, type: !3)
-!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 0, file: !17, enums: !18, retainedTypes: !18)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, scope: !2, type: !3)
+!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !17, enums: !18, retainedTypes: !18)
!3 = !DISubroutineType(types: !4)
!4 = !{!5, !6}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
diff --git a/test/CodeGen/X86/2009-03-05-burr-list-crash.ll b/test/CodeGen/X86/2009-03-05-burr-list-crash.ll
index 853bb16aa327..e8b6a3142697 100644
--- a/test/CodeGen/X86/2009-03-05-burr-list-crash.ll
+++ b/test/CodeGen/X86/2009-03-05-burr-list-crash.ll
@@ -2,7 +2,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
-external global i32 ; <i32*>:0 [#uses=1]
+@0 = external global i32 ; <i32*>:0 [#uses=1]
declare i64 @strlen(i8* nocapture) nounwind readonly
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index 06a56ad90205..840a479de251 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -22,10 +22,11 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
declare i32 @foo(i32) ssp
+!llvm.dbg.cu = !{!3}
!0 = !DILocation(line: 5, column: 2, scope: !1)
!1 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !2)
-!2 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !3)
-!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: 0, file: !8, retainedTypes: !9)
+!2 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scope: !3)
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: FullDebug, file: !8, retainedTypes: !9)
!4 = !DILocalVariable(name: "count_", line: 5, scope: !5, file: !3, type: !6)
!5 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !1)
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index c15e7a79bfa1..8b11fd86ef17 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -32,9 +32,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
!llvm.module.flags = !{!21}
!0 = !DILocalVariable(name: "my_r0", line: 11, arg: 1, scope: !1, file: !2, type: !7)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 11, file: !19, scope: !2, type: !4)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 11, file: !19, scope: !2, type: !4)
!2 = !DIFile(filename: "b2.c", directory: "/tmp/")
-!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 0, file: !19, enums: !20, retainedTypes: !20, subprograms: !18)
+!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: FullDebug, file: !19, enums: !20, retainedTypes: !20)
!4 = !DISubroutineType(types: !5)
!5 = !{!6, !7}
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "double", size: 64, align: 64, encoding: DW_ATE_float)
@@ -49,7 +49,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
!15 = !DILocation(line: 11, scope: !1)
!16 = !DILocation(line: 12, scope: !17)
!17 = distinct !DILexicalBlock(line: 11, column: 0, file: !19, scope: !1)
-!18 = !{!1}
!19 = !DIFile(filename: "b2.c", directory: "/tmp/")
!20 = !{}
!21 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index eb077c074bc2..b4bb865f7f7e 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -18,7 +18,8 @@ entry:
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType*) nounwind readnone
-!0 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 0, file: !15, enums: !16, retainedTypes: !16)
+!llvm.dbg.cu = !{!0}
+!0 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !15, enums: !16, retainedTypes: !16)
!1 = !DIDerivedType(tag: DW_TAG_const_type, size: 192, align: 64, file: !15, scope: !0, baseType: !2)
!2 = !DICompositeType(tag: DW_TAG_structure_type, name: "C", line: 1, size: 192, align: 64, file: !15, scope: !0, elements: !3)
!3 = !{!4, !6, !7}
@@ -28,7 +29,7 @@ declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.
!7 = !DIDerivedType(tag: DW_TAG_member, name: "z", line: 1, size: 64, align: 64, offset: 128, file: !15, scope: !2, baseType: !5)
!8 = !DILocalVariable(name: "t", line: 5, scope: !9, file: !0, type: !2)
!9 = distinct !DILexicalBlock(line: 0, column: 0, file: null, scope: !10)
-!10 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !0, type: !11)
+!10 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !0, scope: !0, type: !11)
!11 = !DISubroutineType(types: !12)
!12 = !{!13}
!13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index f157d5011b02..3172f82b2860 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -200,9 +200,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.module.flags = !{!48}
!0 = !DILocalVariable(name: "a", line: 1921, arg: 1, scope: !1, file: !2, type: !9)
-!1 = distinct !DISubprogram(name: "__divsc3", linkageName: "__divsc3", line: 1922, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 1922, file: !45, scope: !2, type: !4, variables: !43)
+!1 = distinct !DISubprogram(name: "__divsc3", linkageName: "__divsc3", line: 1922, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !3, scopeLine: 1922, file: !45, scope: !2, type: !4, variables: !43)
!2 = !DIFile(filename: "libgcc2.c", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
-!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 1, file: !45, enums: !47, retainedTypes: !47, subprograms: !44, imports: null)
+!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !45, enums: !47, retainedTypes: !47, imports: null)
!4 = !DISubroutineType(types: !5)
!5 = !{!6, !9, !9, !9, !9}
!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "SCtype", line: 170, file: !46, scope: !7, baseType: !8)
@@ -243,7 +243,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!41 = !DILocation(line: 1965, scope: !15)
!42 = !DILocation(line: 1969, scope: !15)
!43 = !{!0, !11, !12, !13, !14, !16, !17, !18}
-!44 = !{!1}
!45 = !DIFile(filename: "libgcc2.c", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
!46 = !DIFile(filename: "libgcc2.h", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
!47 = !{}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index a34e7bd9fe43..30e5e346d294 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -26,14 +26,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!0 = !DIGlobalVariable(name: "ret", line: 7, isLocal: false, isDefinition: true, scope: !1, file: !1, type: !3)
!1 = !DIFile(filename: "foo.c", directory: "/tmp/")
-!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 1, file: !36, enums: !37, retainedTypes: !37, subprograms: !32, globals: !31, imports: !37)
+!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !36, enums: !37, retainedTypes: !37, globals: !31, imports: !37)
!3 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!4 = !DILocalVariable(name: "x", line: 12, arg: 1, scope: !5, file: !1, type: !3)
-!5 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 13, file: !36, scope: !1, type: !6, variables: !33)
+!5 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2, scopeLine: 13, file: !36, scope: !1, type: !6, variables: !33)
!6 = !DISubroutineType(types: !7)
!7 = !{null, !3}
!8 = !DILocalVariable(name: "myvar", line: 17, arg: 1, scope: !9, file: !1, type: !13)
-!9 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 17, file: !36, scope: !1, type: !10, variables: !34)
+!9 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2, scopeLine: 17, file: !36, scope: !1, type: !10, variables: !34)
!10 = !DISubroutineType(types: !11)
!11 = !{!12, !13}
!12 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !36, scope: !1, baseType: null)
@@ -43,7 +43,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!16 = !DIDerivedType(tag: DW_TAG_member, name: "c", line: 3, size: 32, align: 32, file: !36, scope: !14, baseType: !3)
!17 = !DIDerivedType(tag: DW_TAG_member, name: "d", line: 4, size: 64, align: 64, offset: 64, file: !36, scope: !14, baseType: !13)
!18 = !DILocalVariable(name: "argc", line: 22, arg: 1, scope: !19, file: !1, type: !3)
-!19 = distinct !DISubprogram(name: "main", linkageName: "main", line: 22, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 22, file: !36, scope: !1, type: !20, variables: !35)
+!19 = distinct !DISubprogram(name: "main", linkageName: "main", line: 22, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2, scopeLine: 22, file: !36, scope: !1, type: !20, variables: !35)
!20 = !DISubroutineType(types: !21)
!21 = !{!3, !3, !22}
!22 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !36, scope: !1, baseType: !23)
@@ -56,7 +56,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!29 = distinct !DILexicalBlock(line: 17, column: 0, file: !36, scope: !9)
!30 = !DILocation(line: 19, scope: !29)
!31 = !{!0}
-!32 = !{!5, !9, !19}
!33 = !{!4}
!34 = !{!8}
!35 = !{!18, !25, !26}
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index 7967d45c2ee8..38bbe4e367b1 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -26,14 +26,14 @@ entry:
!llvm.module.flags = !{!20}
!0 = !DILocalVariable(name: "y", line: 2, arg: 1, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 2, file: !18, scope: !2, type: !4, variables: !15)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !3, scopeLine: 2, file: !18, scope: !2, type: !4, variables: !15)
!2 = !DIFile(filename: "f.c", directory: "/tmp")
-!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 1, file: !18, enums: !19, retainedTypes: !19, subprograms: !17, imports: null)
+!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !18, enums: !19, retainedTypes: !19, imports: null)
!4 = !DISubroutineType(types: !5)
!5 = !{!6, !6}
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!7 = !DILocalVariable(name: "x", line: 6, arg: 1, scope: !8, file: !2, type: !6)
-!8 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 6, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 6, file: !18, scope: !2, type: !4, variables: !16)
+!8 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 6, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !3, scopeLine: 6, file: !18, scope: !2, type: !4, variables: !16)
!9 = !DILocation(line: 3, scope: !10)
!10 = distinct !DILexicalBlock(line: 2, column: 0, file: !18, scope: !1)
!11 = !{i32 1}
@@ -42,7 +42,6 @@ entry:
!14 = distinct !DILexicalBlock(line: 6, column: 0, file: !18, scope: !8)
!15 = !{!0}
!16 = !{!7}
-!17 = !{!1, !8}
!18 = !DIFile(filename: "f.c", directory: "/tmp")
!19 = !{}
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index 1be800cdfcf0..fa3932d26698 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -24,14 +24,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
!0 = !DILocalVariable(name: "this", line: 11, arg: 1, scope: !1, file: !3, type: !12)
-!1 = distinct !DISubprogram(name: "bar", linkageName: "_ZN3foo3barEi", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 11, file: !31, scope: !2, type: !9)
+!1 = distinct !DISubprogram(name: "bar", linkageName: "_ZN3foo3barEi", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !4, scopeLine: 11, file: !31, scope: !2, type: !9)
!2 = !DICompositeType(tag: DW_TAG_structure_type, name: "foo", line: 3, size: 32, align: 32, file: !31, scope: !3, elements: !5)
!3 = !DIFile(filename: "foo.cp", directory: "/tmp/")
-!4 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 LLVM build", isOptimized: true, emissionKind: 0, file: !31, enums: !32, retainedTypes: !32, subprograms: !33)
+!4 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 LLVM build", isOptimized: true, emissionKind: FullDebug, file: !31, enums: !32, retainedTypes: !32)
!5 = !{!6, !1, !8}
!6 = !DIDerivedType(tag: DW_TAG_member, name: "y", line: 8, size: 32, align: 32, file: !31, scope: !2, baseType: !7)
!7 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!8 = distinct !DISubprogram(name: "baz", linkageName: "_ZN3foo3bazEi", line: 15, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 15, file: !31, scope: !2, type: !9)
+!8 = distinct !DISubprogram(name: "baz", linkageName: "_ZN3foo3bazEi", line: 15, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !4, scopeLine: 15, file: !31, scope: !2, type: !9)
!9 = !DISubroutineType(types: !10)
!10 = !{!7, !11, !7}
!11 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial, file: !31, scope: !3, baseType: !2)
@@ -41,7 +41,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!15 = !DILocalVariable(name: "this", line: 15, arg: 1, scope: !8, file: !3, type: !12)
!16 = !DILocalVariable(name: "x", line: 15, arg: 2, scope: !8, file: !3, type: !7)
!17 = !DILocalVariable(name: "argc", line: 19, arg: 1, scope: !18, file: !3, type: !7)
-!18 = distinct !DISubprogram(name: "main", linkageName: "main", line: 19, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 19, file: !31, scope: !3, type: !19)
+!18 = distinct !DISubprogram(name: "main", linkageName: "main", line: 19, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !4, scopeLine: 19, file: !31, scope: !3, type: !19)
!19 = !DISubroutineType(types: !20)
!20 = !{!7, !7, !21}
!21 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !31, scope: !3, baseType: !22)
@@ -56,5 +56,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!30 = distinct !DILexicalBlock(line: 15, column: 0, file: !31, scope: !8)
!31 = !DIFile(filename: "foo.cp", directory: "/tmp/")
!32 = !{}
-!33 = !{!1, !8, !18}
!34 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index 5e565a1a667f..f86a7601e219 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -1,14 +1,16 @@
; RUN: llc -O0 -relocation-model pic < %s -o /dev/null
; REQUIRES: default_triple
; PR7545
+
@.str = private constant [4 x i8] c"one\00", align 1 ; <[4 x i8]*> [#uses=1]
@.str1 = private constant [4 x i8] c"two\00", align 1 ; <[5 x i8]*> [#uses=1]
@C.9.2167 = internal constant [2 x i8*] [i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str1, i64 0, i64 0)]
+!llvm.dbg.cu = !{!39}
!38 = !DIFile(filename: "pbmsrch.c", directory: "/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch")
-!39 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", isOptimized: true, emissionKind: 0, file: !109, enums: !108, retainedTypes: !108)
+!39 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", isOptimized: true, emissionKind: FullDebug, file: !109, enums: !108, retainedTypes: !108)
!46 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !109, baseType: !47)
!47 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!97 = distinct !DISubprogram(name: "main", linkageName: "main", line: 73, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !39, type: !98)
+!97 = distinct !DISubprogram(name: "main", linkageName: "main", line: 73, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !39, scope: !39, type: !98)
!98 = !DISubroutineType(types: !99)
!99 = !{!100}
!100 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -29,4 +31,3 @@ bb.nph:
}
declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index d305d678c596..e63a36d7fa36 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -81,7 +81,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!0 = !DISubprogram(name: "SVal", line: 11, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, scopeLine: 11, file: !47, scope: !1, type: !14)
!1 = !DICompositeType(tag: DW_TAG_structure_type, name: "SVal", line: 1, size: 128, align: 64, file: !47, scope: !2, elements: !4)
!2 = !DIFile(filename: "small.cc", directory: "/Users/manav/R8248330")
-!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 1, file: !47, enums: !48, retainedTypes: !48, subprograms: !46, imports: null)
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: FullDebug, file: !47, enums: !48, retainedTypes: !48, imports: null)
!4 = !{!5, !7, !0, !9}
!5 = !DIDerivedType(tag: DW_TAG_member, name: "Data", line: 7, size: 64, align: 64, file: !47, scope: !1, baseType: !6)
!6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !47, scope: !2, baseType: null)
@@ -94,11 +94,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!14 = !DISubroutineType(types: !15)
!15 = !{null, !12}
-!16 = distinct !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 11, file: !47, scope: !1, type: !14)
-!17 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 16, file: !47, scope: !2, type: !18)
+!16 = distinct !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 11, file: !47, scope: !1, type: !14)
+!17 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 16, file: !47, scope: !2, type: !18)
!18 = !DISubroutineType(types: !19)
!19 = !{!13, !13, !1}
-!20 = distinct !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 23, file: !47, scope: !2, type: !21)
+!20 = distinct !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 23, file: !47, scope: !2, type: !21)
!21 = !DISubroutineType(types: !22)
!22 = !{!13}
!23 = !DILocalVariable(name: "i", line: 16, arg: 1, scope: !17, file: !2, type: !13)
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index 4303ca991a86..0291ce0da468 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -15,20 +15,19 @@ entry:
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17}
-!0 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 53, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !14, scope: !1, type: !3)
+!0 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 53, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, file: !14, scope: !1, type: !3)
!1 = !DIFile(filename: "", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 114084)", isOptimized: false, emissionKind: 0, file: !15, enums: !16, retainedTypes: !16, subprograms: !13)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 114084)", isOptimized: false, emissionKind: FullDebug, file: !15, enums: !16, retainedTypes: !16)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!6 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !15, scope: !7, type: !3)
+!6 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, file: !15, scope: !7, type: !3)
!7 = !DIFile(filename: "bug.c", directory: "/private/tmp")
!8 = !DILocation(line: 53, column: 13, scope: !9)
!9 = distinct !DILexicalBlock(line: 53, column: 11, file: !14, scope: !0)
!10 = !DILocation(line: 4, column: 13, scope: !11)
!11 = distinct !DILexicalBlock(line: 4, column: 13, file: !15, scope: !12)
!12 = distinct !DILexicalBlock(line: 4, column: 11, file: !15, scope: !6)
-!13 = !{!0, !6}
!14 = !DIFile(filename: "", directory: "/private/tmp")
!15 = !DIFile(filename: "bug.c", directory: "/private/tmp")
!16 = !{}
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index b091003585c2..be2d040a0dcc 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -18,9 +18,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!19}
-!0 = distinct !DISubprogram(name: "foo", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3, file: !17, scope: !1, type: !3, variables: !16)
+!0 = distinct !DISubprogram(name: "foo", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 3, file: !17, scope: !1, type: !3, variables: !16)
!1 = !DIFile(filename: "one.c", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 117922)", isOptimized: true, emissionKind: 0, file: !17, enums: !18, retainedTypes: !18, subprograms: !15, imports: null)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 117922)", isOptimized: true, emissionKind: FullDebug, file: !17, enums: !18, retainedTypes: !18, imports: null)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -33,7 +33,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!12 = !DILocation(line: 3, column: 47, scope: !0)
!13 = !DILocation(line: 4, column: 2, scope: !14)
!14 = distinct !DILexicalBlock(line: 3, column: 50, file: !17, scope: !0)
-!15 = !{!0}
!16 = !{!6}
!17 = !DIFile(filename: "one.c", directory: "/private/tmp")
!18 = !{}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index 661ec94fee4e..d4f4e9057105 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -78,13 +78,13 @@ declare i32 @puts(i8* nocapture) nounwind
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!33}
-!0 = distinct !DISubprogram(name: "gcd", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !31, scope: !1, type: !3, variables: !29)
+!0 = distinct !DISubprogram(name: "gcd", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, file: !31, scope: !1, type: !3, variables: !29)
!1 = !DIFile(filename: "rem_small.c", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 124117)", isOptimized: true, emissionKind: 1, file: !31, enums: !32, retainedTypes: !32, subprograms: !28, imports: null)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 124117)", isOptimized: true, emissionKind: FullDebug, file: !31, enums: !32, retainedTypes: !32, imports: null)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "long int", size: 64, align: 64, encoding: DW_ATE_signed)
-!6 = distinct !DISubprogram(name: "main", line: 25, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, file: !31, scope: !1, type: !7, variables: !30)
+!6 = distinct !DISubprogram(name: "main", line: 25, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2, file: !31, scope: !1, type: !7, variables: !30)
!7 = !DISubroutineType(types: !8)
!8 = !{!9}
!9 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -106,7 +106,6 @@ declare i32 @puts(i8* nocapture) nounwind
!25 = !DILocation(line: 27, column: 38, scope: !15)
!26 = !DILocation(line: 28, column: 9, scope: !15)
!27 = !DILocation(line: 30, column: 1, scope: !15)
-!28 = !{!0, !6}
!29 = !{!10, !11, !12}
!30 = !{!14, !17}
!31 = !DIFile(filename: "rem_small.c", directory: "/private/tmp")
diff --git a/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll b/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
index 114b985f71d4..c9b3df83613d 100644
--- a/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
+++ b/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
@@ -6,7 +6,7 @@
define i8 @f(i8 %v1, i8 %v2) nounwind {
entry:
; CHECK: callq
-; CHECK: movb %{{.*}}, %al
+; CHECK: movl %{{.*}}, %eax
; CHECK: mulb
; CHECK: mulb
%rval = tail call i8 @bar() nounwind
diff --git a/test/CodeGen/X86/2011-09-14-valcoalesce.ll b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
index b8e5100c53bb..812628bf0e70 100644
--- a/test/CodeGen/X86/2011-09-14-valcoalesce.ll
+++ b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
@@ -19,7 +19,7 @@
; reusing the pre-addition register later, or the post-addition one. Currently,
; it does the latter, so we check:
-; CHECK: # %while.body85.i
+; CHECK: # %while.body85.i{{$}}
; CHECK-NOT: # %
; CHECK-NOT: add
; CHECK: movl %[[POSTR:e[abcdxi]+]], %[[PRER:e[abcdxi]+]]
diff --git a/test/CodeGen/X86/2011-10-21-widen-cmp.ll b/test/CodeGen/X86/2011-10-21-widen-cmp.ll
index cb4648c382f7..420e843b52a0 100644
--- a/test/CodeGen/X86/2011-10-21-widen-cmp.ll
+++ b/test/CodeGen/X86/2011-10-21-widen-cmp.ll
@@ -42,7 +42,7 @@ entry:
define void @mp_11193(<8 x float> * nocapture %aFOO, <8 x float>* nocapture %RET) nounwind {
; CHECK-LABEL: mp_11193:
; CHECK: # BB#0: # %allocas
-; CHECK-NEXT: movl $-1082130432, (%rsi) # imm = 0xFFFFFFFFBF800000
+; CHECK-NEXT: movl $-1082130432, (%rsi) # imm = 0xBF800000
; CHECK-NEXT: retq
allocas:
%bincmp = fcmp olt <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 9.000000e+00, float 1.000000e+00, float 9.000000e+00, float 1.000000e+00> , <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
diff --git a/test/CodeGen/X86/2012-01-11-split-cv.ll b/test/CodeGen/X86/2012-01-11-split-cv.ll
index cb39ed911976..212acedafb94 100644
--- a/test/CodeGen/X86/2012-01-11-split-cv.ll
+++ b/test/CodeGen/X86/2012-01-11-split-cv.ll
@@ -1,12 +1,21 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mattr=+avx -mtriple=i686-unknown-unknown | FileCheck %s
-;CHECK-LABEL: add18i16:
define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind {
-;CHECK: vmovaps
+; CHECK-LABEL: add18i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: vmovups (%ecx), %ymm0
+; CHECK-NEXT: movl 32(%ecx), %ecx
+; CHECK-NEXT: movl %ecx, 32(%eax)
+; CHECK-NEXT: vmovups %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl $4
+;
%b = load <18 x i16>, <18 x i16>* %bp, align 16
%x = add <18 x i16> zeroinitializer, %b
store <18 x i16> %x, <18 x i16>* %ret, align 16
-;CHECK: ret
ret void
}
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 6950641a08ae..9bc4b5f55b64 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -3,12 +3,13 @@
define void @endless_loop() {
; CHECK-LABEL: endless_loop:
; CHECK-NEXT: # BB#0:
-; CHECK-NEXT: vbroadcastss (%eax), %ymm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; CHECK-NEXT: vmovaps (%eax), %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; CHECK-NEXT: vmovaps %ymm0, (%eax)
; CHECK-NEXT: vmovaps %ymm1, (%eax)
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 4e3f1f4a6e4d..2a76e1a66b2b 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -1,19 +1,31 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+
; rdar://11314175: SD Scheduler, BuildSchedUnits assert:
; N->getNodeId() == -1 && "Node already inserted!
-; It's hard to test for the ISEL condition because CodeGen optimizes
-; away the bugpointed code. Just ensure the basics are still there.
-;CHECK-LABEL: func:
-;CHECK: vxorps
-;CHECK: vpshufd
-;CHECK: vpbroadcastd
-;CHECK: vinserti128
-;CHECK: vmulps
-;CHECK: vmulps
-;CHECK: ret
-
define void @func() nounwind ssp {
+; CHECK-LABEL: func:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups 0, %xmm0
+; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; CHECK-NEXT: vbroadcastss 32, %xmm3
+; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; CHECK-NEXT: vmulps %ymm0, %ymm2, %ymm2
+; CHECK-NEXT: vmulps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vsubps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vhaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%tmp = load <4 x float>, <4 x float>* null, align 1
%tmp14 = getelementptr <4 x float>, <4 x float>* null, i32 2
%tmp15 = load <4 x float>, <4 x float>* %tmp14, align 1
diff --git a/test/CodeGen/X86/2012-1-10-buildvector.ll b/test/CodeGen/X86/2012-1-10-buildvector.ll
index eb237847e1bc..2d1b5960d98c 100644
--- a/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ b/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -4,7 +4,7 @@
define void @bad_cast() {
; CHECK-LABEL: bad_cast:
; CHECK: # BB#0:
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vmovaps %xmm0, (%eax)
; CHECK-NEXT: movl $0, (%eax)
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 50b486c6f925..495ff0304b1b 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -38,7 +38,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!12}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: 0, file: !11, enums: !2, retainedTypes: !2, subprograms: !13, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2)
!2 = !{}
!4 = !DILocalVariable(name: "hg", line: 725, arg: 4, scope: !14, file: !5, type: !6)
!5 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh")
@@ -46,7 +46,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!7 = !DICompositeType(tag: DW_TAG_structure_type, line: 487, size: 512, align: 64, file: !11)
!11 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh")
!12 = !{i32 1, !"Debug Info Version", i32 3}
-!13 = !{!14}
-!14 = distinct !DISubprogram(name: "subdivp", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !11, scope: !5, type: !15)
+!14 = distinct !DISubprogram(name: "subdivp", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !11, scope: !5, type: !15)
!15 = !DISubroutineType(types: !16)
!16 = !{null}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 7ed416e36c22..fbe6000d7ace 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -65,7 +65,7 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!35}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: 0, file: !19, enums: !2, retainedTypes: !2, subprograms: !20, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !19, enums: !2, retainedTypes: !2, globals: !2)
!1 = !{!2}
!2 = !{}
!4 = !DILocalVariable(name: "num1", line: 815, scope: !5, file: !14, type: !15)
@@ -85,8 +85,7 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!18 = !DISubrange(count: 20)
!19 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset")
-!20 = !{!21}
-!21 = distinct !DISubprogram(name: "AttachGalley", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !19, scope: !14, type: !22)
+!21 = distinct !DISubprogram(name: "AttachGalley", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !19, scope: !14, type: !22)
!22 = !DISubroutineType(types: !23)
!23 = !{null}
@@ -134,11 +133,10 @@ declare void @_Znwm()
!llvm.dbg.cu = !{!30}
-!30 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 169129) (llvm/trunk 169135)", isOptimized: true, emissionKind: 0, file: !34, enums: !2, retainedTypes: !2, subprograms: !36)
+!30 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 169129) (llvm/trunk 169135)", isOptimized: true, emissionKind: FullDebug, file: !34, enums: !2, retainedTypes: !2)
!31 = !DILocalVariable(name: "X", line: 29, scope: !37, type: !32)
!32 = !DIDerivedType(tag: DW_TAG_typedef, name: "HM", line: 28, file: !34, baseType: null)
!33 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++")
!34 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++")
!35 = !{i32 1, !"Debug Info Version", i32 3}
-!36 = !{!37}
-!37 = distinct !DISubprogram(name: "main", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !19, scope: !14, type: !22)
+!37 = distinct !DISubprogram(name: "main", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !30, scopeLine: 1, file: !19, scope: !14, type: !22)
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index 3f7a10ae035b..a717202d3574 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -36,11 +36,10 @@ invoke.cont44: ; preds = %if.end
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 168984) (llvm/trunk 168983)", isOptimized: true, emissionKind: 0, file: !6, subprograms: !1)
-!1 = !{!2}
-!2 = distinct !DISubprogram(name: "test", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !6, scope: !5, type: !7)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 168984) (llvm/trunk 168983)", isOptimized: true, emissionKind: FullDebug, file: !6)
+!2 = distinct !DISubprogram(name: "test", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !6, scope: !5, type: !7)
!3 = !DILocalVariable(name: "callback", line: 214, scope: !2, type: !4)
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "btCompoundLeafCallback", line: 90, size: 512, align: 64, file: !6)
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "btCompoundLeafCallback", line: 90, size: 64, align: 64, file: !6)
!5 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet")
!6 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet")
!7 = !DISubroutineType(types: !9)
diff --git a/test/CodeGen/X86/3addr-16bit.ll b/test/CodeGen/X86/3addr-16bit.ll
index 2d6a5e76657f..c80e91a4d8b0 100644
--- a/test/CodeGen/X86/3addr-16bit.ll
+++ b/test/CodeGen/X86/3addr-16bit.ll
@@ -12,7 +12,7 @@ entry:
; 64BIT-LABEL: t1:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal 1(%rsi), %eax
+; 64BIT: movl %esi, %eax
%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
%1 = add i16 %k, 1 ; <i16> [#uses=3]
br i1 %0, label %bb, label %bb1
@@ -34,7 +34,7 @@ entry:
; 64BIT-LABEL: t2:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal -1(%rsi), %eax
+; 64BIT: movl %esi, %eax
; 64BIT: movzwl %ax
%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
%1 = add i16 %k, -1 ; <i16> [#uses=3]
@@ -59,7 +59,7 @@ entry:
; 64BIT-LABEL: t3:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal 2(%rsi), %eax
+; 64BIT: movl %esi, %eax
%0 = add i16 %k, 2 ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
@@ -82,7 +82,7 @@ entry:
; 64BIT-LABEL: t4:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal (%rsi,%rdi), %eax
+; 64BIT: movl %esi, %eax
%0 = add i16 %k, %c ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
diff --git a/test/CodeGen/X86/AppendingLinkage.ll b/test/CodeGen/X86/AppendingLinkage.ll
new file mode 100644
index 000000000000..1a49287d1b38
--- /dev/null
+++ b/test/CodeGen/X86/AppendingLinkage.ll
@@ -0,0 +1,4 @@
+; RUN: not llc < %s -march=x86 2>&1 | FileCheck %s
+
+; CHECK: unknown special variable
+@foo = appending constant [1 x i32 ]zeroinitializer
diff --git a/test/CodeGen/X86/GC/dynamic-frame-size.ll b/test/CodeGen/X86/GC/dynamic-frame-size.ll
index 9ec9b8b08507..0f9a8f57cf2a 100644
--- a/test/CodeGen/X86/GC/dynamic-frame-size.ll
+++ b/test/CodeGen/X86/GC/dynamic-frame-size.ll
@@ -15,7 +15,7 @@ define void @test(i8* %ptr) gc "erlang" {
}
; CHECK: .note.gc
-; CHECK-NEXT: .align 8
+; CHECK-NEXT: .p2align 3
; safe point count
; CHECK: .short 1
; CHECK: .long .Ltmp0
diff --git a/test/CodeGen/X86/GC/erlang-gc.ll b/test/CodeGen/X86/GC/erlang-gc.ll
index c55b7f6dcf61..c2cb8c7d6575 100644
--- a/test/CodeGen/X86/GC/erlang-gc.ll
+++ b/test/CodeGen/X86/GC/erlang-gc.ll
@@ -6,7 +6,7 @@ define i32 @main(i32 %x) nounwind gc "erlang" {
ret i32 0
; CHECK64: .section .note.gc,"",@progbits
-; CHECK64-NEXT: .align 8
+; CHECK64-NEXT: .p2align 3
; CHECK64-NEXT: .short 1 # safe point count
; CHECK64-NEXT: .long .Ltmp0 # safe point address
; CHECK64-NEXT: .short 1 # stack frame size (in words)
@@ -14,7 +14,7 @@ define i32 @main(i32 %x) nounwind gc "erlang" {
; CHECK64-NEXT: .short 0 # live root count
; CHECK32: .section .note.gc,"",@progbits
-; CHECK32-NEXT: .align 4
+; CHECK32-NEXT: .p2align 2
; CHECK32-NEXT: .short 1 # safe point count
; CHECK32-NEXT: .long .Ltmp0 # safe point address
; CHECK32-NEXT: .short 3 # stack frame size (in words)
diff --git a/test/CodeGen/X86/GC/ocaml-gc.ll b/test/CodeGen/X86/GC/ocaml-gc.ll
index 37ddaf90bf67..4e4e2e952f73 100644
--- a/test/CodeGen/X86/GC/ocaml-gc.ll
+++ b/test/CodeGen/X86/GC/ocaml-gc.ll
@@ -22,12 +22,12 @@ define i32 @main(i32 %x) nounwind gc "ocaml" {
; CHECK-NEXT: .globl "caml<stdin>__frametable"
; CHECK-NEXT: "caml<stdin>__frametable":
; CHECK-NEXT: .short 1
-; CHECK-NEXT: .align 8
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: # live roots for main
; CHECK-NEXT: .quad .Ltmp0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .align 8
+; CHECK-NEXT: .p2align 3
}
declare i32 @foo(i32)
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 457d9beb37d5..a794c896eb27 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -28,17 +28,17 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", isOptimized: true, emissionKind: 1, file: !20, enums: !21, retainedTypes: !21, subprograms: !18, imports: null)
-!1 = distinct !DISubprogram(name: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !20, scope: !2, type: !3, variables: !19)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", isOptimized: true, emissionKind: FullDebug, file: !20, enums: !21, retainedTypes: !21, imports: null)
+!1 = distinct !DISubprogram(name: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, file: !20, scope: !2, type: !3, variables: !19)
!2 = !DIFile(filename: "a.c", directory: "/private/tmp")
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!6 = !DILocalVariable(name: "i", line: 2, arg: 1, scope: !1, file: !2, type: !5)
!7 = !DILocalVariable(name: "c", line: 2, arg: 2, scope: !1, file: !2, type: !8)
-!8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, scope: !0, baseType: !9)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, scope: !0, baseType: !5)
!9 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!10 = !DILocalVariable(name: "a", line: 3, scope: !11, file: !2, type: !9)
+!10 = !DILocalVariable(name: "a", line: 3, scope: !11, file: !2, type: !5)
!11 = distinct !DILexicalBlock(line: 2, column: 25, file: !20, scope: !1)
!12 = !DILocation(line: 2, column: 13, scope: !1)
!13 = !DILocation(line: 2, column: 22, scope: !1)
@@ -46,7 +46,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!15 = !DILocation(line: 4, column: 3, scope: !11)
!16 = !DILocation(line: 5, column: 5, scope: !11)
!17 = !DILocation(line: 7, column: 1, scope: !11)
-!18 = !{!1}
!19 = !{!6, !7, !10}
!20 = !DIFile(filename: "a.c", directory: "/private/tmp")
!21 = !{}
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index 70af4184e8a2..b50253bf2b03 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -1,5 +1,6 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
@@ -147,7 +148,8 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) n
; CHECK-LABEL: merge_loads_i16:
; load:
-; CHECK: movw
+; BWON: movzwl
+; BWOFF: movw
; store:
; CHECK: movw
; CHECK: ret
@@ -180,9 +182,11 @@ define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struc
; The loads and the stores are interleaved. Can't merge them.
; CHECK-LABEL: no_merge_loads:
+; BWON: movzbl
+; BWOFF: movb
; CHECK: movb
-; CHECK: movb
-; CHECK: movb
+; BWON: movzbl
+; BWOFF: movb
; CHECK: movb
; CHECK: ret
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
@@ -337,8 +341,9 @@ block4: ; preds = %4, %.lr.ph
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffset:
-; CHECK: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
-; CHECK: movw [[REG]], (%{{.*}})
+; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
+; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
+; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
br label %1
@@ -369,8 +374,9 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
-; CHECK: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
-; CHECK: movw [[REG]], (%{{.*}})
+; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
+; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
+; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
br label %1
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 91fe7f819383..15be7aa1029f 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -21,15 +21,16 @@ for.body:
br label %for.body
}
+
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!23}
-!0 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "clang", isOptimized: true, emissionKind: 0, file: !1, enums: !{}, retainedTypes: !{})
+!0 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "clang", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !{}, retainedTypes: !{})
!1 = !DIFile(filename: "t.c", directory: "")
!16 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!2 = distinct !DISubprogram()
+!2 = distinct !DISubprogram(unit: !0)
!22 = !DILocalVariable(name: "x", line: 16, scope: !2, file: !1, type: !16)
!23 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index 634f66ad52de..f974cdc30a21 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -1,4 +1,5 @@
; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR --check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false -stackcoloring-lifetime-start-on-first-use=false < %s | FileCheck %s --check-prefix=NOFIRSTUSE --check-prefix=CHECK
; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR --check-prefix=CHECK
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -87,7 +88,8 @@ bb3:
}
;CHECK-LABEL: myCall_w4:
-;YESCOLOR: subq $200, %rsp
+;YESCOLOR: subq $120, %rsp
+;NOFIRSTUSE: subq $200, %rsp
;NOCOLOR: subq $408, %rsp
define i32 @myCall_w4(i32 %in) {
@@ -217,7 +219,7 @@ bb3:
;CHECK-LABEL: myCall2_nostart:
-;YESCOLOR: subq $144, %rsp
+;YESCOLOR: subq $272, %rsp
;NOCOLOR: subq $272, %rsp
define i32 @myCall2_nostart(i32 %in, i1 %d) {
entry:
@@ -243,8 +245,8 @@ bb3:
; Adopt the test from Transforms/Inline/array_merge.ll'
;CHECK-LABEL: array_merge:
-;YESCOLOR: subq $816, %rsp
-;NOCOLOR: subq $1616, %rsp
+;YESCOLOR: subq $808, %rsp
+;NOCOLOR: subq $1608, %rsp
define void @array_merge() nounwind ssp {
entry:
%A.i1 = alloca [100 x i32], align 4
@@ -306,6 +308,9 @@ bb3:
;CHECK-LABEL: multi_region_bb:
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
define void @multi_region_bb() nounwind ssp {
entry:
%A.i1 = alloca [100 x i32], align 4
@@ -330,8 +335,6 @@ entry:
call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
ret void
}
-;YESCOLOR: subq $272, %rsp
-;NOCOLOR: subq $272, %rsp
define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
entry:
@@ -360,7 +363,7 @@ bb3:
; Regression test for PR15707. %buf1 and %buf2 should not be merged
; in this test case.
;CHECK-LABEL: myCall_pr15707:
-;YESCOLOR: subq $200008, %rsp
+;NOFIRSTUSE: subq $200008, %rsp
;NOCOLOR: subq $200008, %rsp
define void @myCall_pr15707() {
%buf1 = alloca i8, i32 100000, align 16
@@ -425,6 +428,164 @@ define i32 @shady_range(i32 %argc, i8** nocapture %argv) uwtable {
ret i32 9
}
+; In this case 'itar1' and 'itar2' can't be overlapped if we treat
+; lifetime.start as the beginning of the lifetime, but we can
+; overlap if we consider first use of the slot as lifetime
+; start. See llvm bug 25776.
+
+;CHECK-LABEL: ifthen_twoslots:
+;YESCOLOR: subq $1544, %rsp
+;NOFIRSTUSE: subq $2056, %rsp
+;NOCOLOR: subq $2568, %rsp
+
+define i32 @ifthen_twoslots(i32 %x) #0 {
+entry:
+ %b1 = alloca [128 x i32], align 16
+ %b2 = alloca [128 x i32], align 16
+ %b3 = alloca [128 x i32], align 16
+ %b4 = alloca [128 x i32], align 16
+ %b5 = alloca [128 x i32], align 16
+ %tmp = bitcast [128 x i32]* %b1 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp)
+ %tmp1 = bitcast [128 x i32]* %b2 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp1)
+ %and = and i32 %x, 1
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %tmp2 = bitcast [128 x i32]* %b3 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp2)
+ %a1 = getelementptr inbounds [128 x i32], [128 x i32]* %b1, i64 0, i64 0
+ %a2 = getelementptr inbounds [128 x i32], [128 x i32]* %b3, i64 0, i64 0
+ call void @initb(i32* %a1, i32* %a2, i32* null)
+ call void @llvm.lifetime.end(i64 512, i8* %tmp2)
+ br label %if.end
+
+if.else: ; preds = %entry
+ %tmp3 = bitcast [128 x i32]* %b4 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp3)
+ %tmp4 = bitcast [128 x i32]* %b5 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp4)
+ %a3 = getelementptr inbounds [128 x i32], [128 x i32]* %b2, i64 0, i64 0
+ %a4 = getelementptr inbounds [128 x i32], [128 x i32]* %b4, i64 0, i64 0
+ %a5 = getelementptr inbounds [128 x i32], [128 x i32]* %b5, i64 0, i64 0
+ call void @initb(i32* %a3, i32* %a4, i32* %a5) #3
+ call void @llvm.lifetime.end(i64 512, i8* %tmp4)
+ call void @llvm.lifetime.end(i64 512, i8* %tmp3)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ call void @llvm.lifetime.end(i64 512, i8* %tmp1)
+ call void @llvm.lifetime.end(i64 512, i8* %tmp)
+ ret i32 0
+
+}
+
+; This function is intended to test the case where you
+; have a reference to a stack slot that lies outside of
+; the START/END lifetime markers-- the flow analysis
+; should catch this and build the lifetime based on the
+; markers only.
+
+;CHECK-LABEL: while_loop:
+;YESCOLOR: subq $1032, %rsp
+;NOFIRSTUSE: subq $1544, %rsp
+;NOCOLOR: subq $1544, %rsp
+
+define i32 @while_loop(i32 %x) #0 {
+entry:
+ %b1 = alloca [128 x i32], align 16
+ %b2 = alloca [128 x i32], align 16
+ %b3 = alloca [128 x i32], align 16
+ %tmp = bitcast [128 x i32]* %b1 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp) #3
+ %tmp1 = bitcast [128 x i32]* %b2 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp1) #3
+ %and = and i32 %x, 1
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %arraydecay = getelementptr inbounds [128 x i32], [128 x i32]* %b2, i64 0, i64 0
+ call void @inita(i32* %arraydecay) #3
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arraydecay1 = getelementptr inbounds [128 x i32], [128 x i32]* %b1, i64 0, i64 0
+ call void @inita(i32* %arraydecay1) #3
+ %arraydecay3 = getelementptr inbounds [128 x i32], [128 x i32]* %b3, i64 0, i64 0
+ call void @inita(i32* %arraydecay3) #3
+ %tobool25 = icmp eq i32 %x, 0
+ br i1 %tobool25, label %if.end, label %while.body.lr.ph
+
+while.body.lr.ph: ; preds = %if.else
+ %tmp2 = bitcast [128 x i32]* %b3 to i8*
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %x.addr.06 = phi i32 [ %x, %while.body.lr.ph ], [ %dec, %while.body ]
+ %dec = add nsw i32 %x.addr.06, -1
+ call void @llvm.lifetime.start(i64 512, i8* %tmp2) #3
+ call void @inita(i32* %arraydecay3) #3
+ call void @llvm.lifetime.end(i64 512, i8* %tmp2) #3
+ %tobool2 = icmp eq i32 %dec, 0
+ br i1 %tobool2, label %if.end.loopexit, label %while.body
+
+if.end.loopexit: ; preds = %while.body
+ br label %if.end
+
+if.end: ; preds = %if.end.loopexit, %if.else, %if.then
+ call void @llvm.lifetime.end(i64 512, i8* %tmp1) #3
+ call void @llvm.lifetime.end(i64 512, i8* %tmp) #3
+ ret i32 0
+}
+
+; Test case motivated by PR27903. Same routine inlined multiple times
+; into a caller results in a multi-segment lifetime, but the second
+; lifetime has no explicit references to the stack slot. Such slots
+; have to be treated conservatively.
+
+;CHECK-LABEL: twobod_b27903:
+;YESCOLOR: subq $96, %rsp
+;NOFIRSTUSE: subq $96, %rsp
+;NOCOLOR: subq $96, %rsp
+
+define i32 @twobod_b27903(i32 %y, i32 %x) {
+entry:
+ %buffer.i = alloca [12 x i32], align 16
+ %abc = alloca [12 x i32], align 16
+ %tmp = bitcast [12 x i32]* %buffer.i to i8*
+ call void @llvm.lifetime.start(i64 48, i8* %tmp)
+ %idxprom.i = sext i32 %y to i64
+ %arrayidx.i = getelementptr inbounds [12 x i32], [12 x i32]* %buffer.i, i64 0, i64 %idxprom.i
+ call void @inita(i32* %arrayidx.i)
+ %add.i = add nsw i32 %x, %y
+ call void @llvm.lifetime.end(i64 48, i8* %tmp)
+ %tobool = icmp eq i32 %y, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %tmp1 = bitcast [12 x i32]* %abc to i8*
+ call void @llvm.lifetime.start(i64 48, i8* %tmp1)
+ %arrayidx = getelementptr inbounds [12 x i32], [12 x i32]* %abc, i64 0, i64 %idxprom.i
+ call void @inita(i32* %arrayidx)
+ call void @llvm.lifetime.start(i64 48, i8* %tmp)
+ call void @inita(i32* %arrayidx.i)
+ %add.i9 = add nsw i32 %add.i, %y
+ call void @llvm.lifetime.end(i64 48, i8* %tmp)
+ call void @llvm.lifetime.end(i64 48, i8* %tmp1)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %x.addr.0 = phi i32 [ %add.i9, %if.then ], [ %add.i, %entry ]
+ ret i32 %x.addr.0
+}
+
+declare void @inita(i32*)
+
+declare void @initb(i32*,i32*,i32*)
+
declare void @bar([100 x i32]* , [100 x i32]*) nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/X86/WidenArith.ll b/test/CodeGen/X86/WidenArith.ll
index f87b3821dde8..cdd1a2818b2f 100644
--- a/test/CodeGen/X86/WidenArith.ll
+++ b/test/CodeGen/X86/WidenArith.ll
@@ -1,15 +1,17 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-;CHECK-LABEL: test:
-;CHECK: vaddps
-;CHECK: vmulps
-;CHECK: vsubps
-;CHECK: vcmpltps
-;CHECK: vcmpltps
-;CHECK: vandps
-;CHECK: vandps
-;CHECK: ret
define <8 x i32> @test(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm2
+; CHECK-NEXT: vmulps %ymm0, %ymm1, %ymm1
+; CHECK-NEXT: vsubps %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vcmpltps %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%c1 = fadd <8 x float> %a, %b
%b1 = fmul <8 x float> %b, %a
%d = fsub <8 x float> %b1, %c1
diff --git a/test/CodeGen/X86/abi-isel.ll b/test/CodeGen/X86/abi-isel.ll
index f363b64386f5..742041a974b3 100644
--- a/test/CodeGen/X86/abi-isel.ll
+++ b/test/CodeGen/X86/abi-isel.ll
@@ -5,8 +5,8 @@
; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC
; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin9 -march=x86 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin9 -march=x86 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC
; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC
; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
@@ -8425,25 +8425,25 @@ entry:
; DARWIN-32-DYNAMIC: _lcallee:
; DARWIN-32-DYNAMIC: subl $12, %esp
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
; DARWIN-32-DYNAMIC-NEXT: addl $12, %esp
; DARWIN-32-DYNAMIC-NEXT: ret
; DARWIN-32-PIC: _lcallee:
; DARWIN-32-PIC: subl $12, %esp
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
; DARWIN-32-PIC-NEXT: addl $12, %esp
; DARWIN-32-PIC-NEXT: ret
@@ -8557,25 +8557,25 @@ entry:
; DARWIN-32-DYNAMIC: _dcallee:
; DARWIN-32-DYNAMIC: subl $12, %esp
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
; DARWIN-32-DYNAMIC-NEXT: addl $12, %esp
; DARWIN-32-DYNAMIC-NEXT: ret
; DARWIN-32-PIC: _dcallee:
; DARWIN-32-PIC: subl $12, %esp
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
; DARWIN-32-PIC-NEXT: addl $12, %esp
; DARWIN-32-PIC-NEXT: ret
@@ -8802,15 +8802,15 @@ entry:
; DARWIN-32-DYNAMIC: _caller:
; DARWIN-32-DYNAMIC: subl $12, %esp
-; DARWIN-32-DYNAMIC-NEXT: calll L_callee$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_callee$stub
+; DARWIN-32-DYNAMIC-NEXT: calll _callee
+; DARWIN-32-DYNAMIC-NEXT: calll _callee
; DARWIN-32-DYNAMIC-NEXT: addl $12, %esp
; DARWIN-32-DYNAMIC-NEXT: ret
; DARWIN-32-PIC: _caller:
; DARWIN-32-PIC: subl $12, %esp
-; DARWIN-32-PIC-NEXT: calll L_callee$stub
-; DARWIN-32-PIC-NEXT: calll L_callee$stub
+; DARWIN-32-PIC-NEXT: calll _callee
+; DARWIN-32-PIC-NEXT: calll _callee
; DARWIN-32-PIC-NEXT: addl $12, %esp
; DARWIN-32-PIC-NEXT: ret
@@ -9021,13 +9021,13 @@ entry:
; DARWIN-32-DYNAMIC: _tailcaller:
; DARWIN-32-DYNAMIC: subl $12, %esp
-; DARWIN-32-DYNAMIC-NEXT: calll L_callee$stub
+; DARWIN-32-DYNAMIC-NEXT: calll _callee
; DARWIN-32-DYNAMIC-NEXT: addl $12, %esp
; DARWIN-32-DYNAMIC-NEXT: ret
; DARWIN-32-PIC: _tailcaller:
; DARWIN-32-PIC: subl $12, %esp
-; DARWIN-32-PIC-NEXT: calll L_callee$stub
+; DARWIN-32-PIC-NEXT: calll _callee
; DARWIN-32-PIC-NEXT: addl $12, %esp
; DARWIN-32-PIC-NEXT: ret
diff --git a/test/CodeGen/X86/add-nsw-sext.ll b/test/CodeGen/X86/add-nsw-sext.ll
index 0a6f6c315c13..658c58b3d61b 100644
--- a/test/CodeGen/X86/add-nsw-sext.ll
+++ b/test/CodeGen/X86/add-nsw-sext.ll
@@ -25,7 +25,7 @@ define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext_add:
; CHECK: # BB#0:
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
+; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
@@ -72,7 +72,7 @@ define i8* @gep8(i32 %i, i8* %x) {
; CHECK-LABEL: gep8:
; CHECK: # BB#0:
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
+; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
@@ -127,7 +127,7 @@ define i128* @gep128(i32 %i, i128* %x) {
; CHECK: # BB#0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: shlq $4, %rax
-; CHECK-NEXT: leaq 80(%rax,%rsi), %rax
+; CHECK-NEXT: leaq 80(%rsi,%rax), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index 62a62a460bd7..df1bc9b6ee7e 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -148,3 +148,39 @@ entry:
; X64: incl
; X64-NEXT: seto
}
+
+define void @test11(i32* inreg %a) nounwind {
+ %aa = load i32, i32* %a
+ %b = add i32 %aa, 128
+ store i32 %b, i32* %a
+ ret void
+; X32-LABEL: test11:
+; X32: subl $-128, (%
+; X64-LABEL: test11:
+; X64: subl $-128, (%
+}
+
+define void @test12(i64* inreg %a) nounwind {
+ %aa = load i64, i64* %a
+ %b = add i64 %aa, 2147483648
+ store i64 %b, i64* %a
+ ret void
+; X32-LABEL: test12:
+; X32: addl (%
+; X32-NEXT: adcl $0,
+; X64-LABEL: test12:
+; X64: subq $-2147483648, (%
+}
+
+define void @test13(i64* inreg %a) nounwind {
+ %aa = load i64, i64* %a
+ %b = add i64 %aa, 128
+ store i64 %b, i64* %a
+ ret void
+
+; X32-LABEL: test13:
+; X32: addl (%
+; X32-NEXT: adcl $0,
+; X64-LABEL: test13:
+; X64: subq $-128, (%
+}
diff --git a/test/CodeGen/X86/alias-gep.ll b/test/CodeGen/X86/alias-gep.ll
new file mode 100644
index 000000000000..5ecf20ba78ed
--- /dev/null
+++ b/test/CodeGen/X86/alias-gep.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=MACHO %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck --check-prefix=ELF %s
+
+;MACHO: .globl _offsetSym0
+;MACHO-NOT: .alt_entry
+;MACHO: _offsetSym0 = _s
+;MACHO: .globl _offsetSym1
+;MACHO: .alt_entry _offsetSym1
+;MACHO: _offsetSym1 = _s+8
+
+;ELF: .globl offsetSym0
+;ELF-NOT: .alt_entry
+;ELF: offsetSym0 = s
+;ELF: .globl offsetSym1
+;ELF-NOT: .alt_entry
+;ELF: offsetSym1 = s+8
+
+%struct.S1 = type { i32, i32, i32 }
+
+@s = global %struct.S1 { i32 31, i32 32, i32 33 }, align 4
+@offsetSym0 = alias i32, i32* getelementptr inbounds (%struct.S1, %struct.S1* @s, i64 0, i32 0)
+@offsetSym1 = alias i32, i32* getelementptr inbounds (%struct.S1, %struct.S1* @s, i64 0, i32 2)
diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll
index 294159220626..1ea57296a707 100644
--- a/test/CodeGen/X86/aligned-variadic.ll
+++ b/test/CodeGen/X86/aligned-variadic.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X32
%struct.Baz = type { [17 x i8] }
%struct.__va_list_tag = type { i32, i32, i8*, i8* }
diff --git a/test/CodeGen/X86/alignment.ll b/test/CodeGen/X86/alignment.ll
index 5908c0cde61e..acf11fdec494 100644
--- a/test/CodeGen/X86/alignment.ll
+++ b/test/CodeGen/X86/alignment.ll
@@ -6,7 +6,7 @@
; CHECK: .bss
; CHECK: .globl GlobalA
-; CHECK: .align 8
+; CHECK: .p2align 3
; CHECK: GlobalA:
; CHECK: .zero 384
@@ -29,7 +29,7 @@
@GlobalAS = global { [384 x i8] } zeroinitializer, align 8, section "foo"
; CHECK: .globl GlobalAS
-; CHECK: .align 8
+; CHECK: .p2align 3
; CHECK: GlobalAS:
; CHECK: .zero 384
diff --git a/test/CodeGen/X86/all-ones-vector.ll b/test/CodeGen/X86/all-ones-vector.ll
index 10fecadaa023..9707eb57ae47 100644
--- a/test/CodeGen/X86/all-ones-vector.ll
+++ b/test/CodeGen/X86/all-ones-vector.ll
@@ -1,14 +1,143 @@
-; RUN: llc < %s -march=x86 -mattr=sse2 | grep pcmpeqd | count 4
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX
+
+define <16 x i8> @coo() nounwind {
+; X32-SSE-LABEL: coo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: coo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: coo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: coo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+}
+
+define <8 x i16> @soo() nounwind {
+; X32-SSE-LABEL: soo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: soo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: soo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: soo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+}
define <4 x i32> @ioo() nounwind {
- ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+; X32-SSE-LABEL: ioo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: ioo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: ioo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: ioo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
}
+
define <2 x i64> @loo() nounwind {
- ret <2 x i64> <i64 -1, i64 -1>
+; X32-SSE-LABEL: loo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: loo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: loo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: loo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <2 x i64> <i64 -1, i64 -1>
}
+
define <2 x double> @doo() nounwind {
- ret <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>
+; X32-SSE-LABEL: doo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: doo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: doo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: doo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>
}
+
define <4 x float> @foo() nounwind {
- ret <4 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
+; X32-SSE-LABEL: foo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: foo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: foo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: foo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <4 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
}
diff --git a/test/CodeGen/X86/and-encoding.ll b/test/CodeGen/X86/and-encoding.ll
index f7bbac2a4bd9..1a90bd0d6eb7 100644
--- a/test/CodeGen/X86/and-encoding.ll
+++ b/test/CodeGen/X86/and-encoding.ll
@@ -15,27 +15,18 @@ define void @f1() {
ret void
}
-define void @f2(i1 *%x, i16 *%y) {
+define void @f2(i16 %x, i1 *%y) {
; CHECK-LABEL: f2:
-; CHECK: andl $1, %eax # encoding: [0x83,0xe0,0x01]
- %a = load i1, i1* %x
- %b = zext i1 %a to i16
- store i16 %b, i16* %y
+; CHECK: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+ %c = trunc i16 %x to i1
+ store i1 %c, i1* %y
ret void
}
-define i32 @f3(i1 *%x) {
+define void @f3(i32 %x, i1 *%y) {
; CHECK-LABEL: f3:
-; CHECK: andl $1, %eax # encoding: [0x83,0xe0,0x01]
- %a = load i1, i1* %x
- %b = zext i1 %a to i32
- ret i32 %b
-}
-
-define i64 @f4(i1 *%x) {
-; CHECK-LABEL: f4:
-; CHECK: andl $1, %eax # encoding: [0x83,0xe0,0x01]
- %a = load i1, i1* %x
- %b = zext i1 %a to i64
- ret i64 %b
+; CHECK: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+ %c = trunc i32 %x to i1
+ store i1 %c, i1* %y
+ ret void
}
diff --git a/test/CodeGen/X86/anyext.ll b/test/CodeGen/X86/anyext.ll
index 106fe83661b4..4f4218bdd63d 100644
--- a/test/CodeGen/X86/anyext.ll
+++ b/test/CodeGen/X86/anyext.ll
@@ -1,15 +1,52 @@
-; RUN: llc < %s -march=x86-64 | grep movzbl | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
; Use movzbl to avoid partial-register updates.
define i32 @foo(i32 %p, i8 zeroext %x) nounwind {
+; X32-LABEL: foo:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: divb {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: divb %sil
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
%q = trunc i32 %p to i8
%r = udiv i8 %q, %x
%s = zext i8 %r to i32
%t = and i32 %s, 1
ret i32 %t
}
+
define i32 @bar(i32 %p, i16 zeroext %x) nounwind {
+; X32-LABEL: bar:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: divw {{[0-9]+}}(%esp)
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<def>
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: bar:
+; X64: # BB#0:
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: divw %si
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<def>
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
%q = trunc i32 %p to i16
%r = udiv i16 %q, %x
%s = zext i16 %r to i32
diff --git a/test/CodeGen/X86/atom-lea-sp.ll b/test/CodeGen/X86/atom-lea-sp.ll
index 1ee3b00ee87e..25da6b30adfe 100644
--- a/test/CodeGen/X86/atom-lea-sp.ll
+++ b/test/CodeGen/X86/atom-lea-sp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck -check-prefix=ATOM %s
-; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck %s
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux -no-x86-call-frame-opt | FileCheck %s
declare void @use_arr(i8*)
declare void @many_params(i32, i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/X86/atomic-eflags-reuse.ll b/test/CodeGen/X86/atomic-eflags-reuse.ll
new file mode 100644
index 000000000000..dc1814b55cd3
--- /dev/null
+++ b/test/CodeGen/X86/atomic-eflags-reuse.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+define i32 @test_add_1_cmov_slt(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_cmov_slt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock incq (%rdi)
+; CHECK-NEXT: cmovgl %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp slt i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+define i32 @test_add_1_cmov_sge(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_cmov_sge:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock incq (%rdi)
+; CHECK-NEXT: cmovlel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sge i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+define i32 @test_sub_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_sub_1_cmov_sle:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock decq (%rdi)
+; CHECK-NEXT: cmovgel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sle i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+define i32 @test_sub_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_sub_1_cmov_sgt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock decq (%rdi)
+; CHECK-NEXT: cmovll %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+; FIXME: (setcc slt x, 0) gets combined into shr early.
+define i8 @test_add_1_setcc_slt(i64* %p) #0 {
+; CHECK-LABEL: test_add_1_setcc_slt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: shrq $63, %rax
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %RAX<kill>
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp slt i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_sub_1_setcc_sgt(i64* %p) #0 {
+; CHECK-LABEL: test_sub_1_setcc_sgt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock decq (%rdi)
+; CHECK-NEXT: setge %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i32 @test_add_1_brcond_sge(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_brcond_sge:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock incq (%rdi)
+; CHECK-NEXT: jle .LBB6_2
+; CHECK-NEXT: # BB#1: # %t
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB6_2: # %f
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sge i64 %tmp0, 0
+ br i1 %tmp1, label %t, label %f
+t:
+ ret i32 %a0
+f:
+ ret i32 %a1
+}
+
+; Also make sure we don't muck with condition codes that we should ignore.
+; No need to test unsigned comparisons, as they should all be simplified.
+
+define i32 @test_add_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_cmov_sle:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovgl %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sle i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+define i32 @test_add_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_cmov_sgt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovlel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+; Test a result being used by more than just the comparison.
+
+define i8 @test_add_1_setcc_sgt_reuse(i64* %p, i64* %p2) #0 {
+; CHECK-LABEL: test_add_1_setcc_sgt_reuse:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: lock xaddq %rcx, (%rdi)
+; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: movq %rcx, (%rsi)
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ store i64 %tmp0, i64* %p2
+ ret i8 %tmp2
+}
+
+define i8 @test_sub_2_setcc_sgt(i64* %p) #0 {
+; CHECK-LABEL: test_sub_2_setcc_sgt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq $-2, %rax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 2 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/atomic-non-integer.ll b/test/CodeGen/X86/atomic-non-integer.ll
index 98fcd96d3e4c..17b73ecf4e1c 100644
--- a/test/CodeGen/X86/atomic-non-integer.ll
+++ b/test/CodeGen/X86/atomic-non-integer.ll
@@ -43,7 +43,7 @@ define half @load_half(half* %fptr) {
; CHECK-LABEL: @load_half
; CHECK: movw (%rdi), %ax
; CHECK: movzwl %ax, %edi
-; CHECK: jmp __gnu_h2f_ieee
+; CHECK: callq __gnu_h2f_ieee
%v = load atomic half, half* %fptr unordered, align 2
ret half %v
}
diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll
index c41269b0b606..1bf7bfbfa260 100644
--- a/test/CodeGen/X86/atomic128.ll
+++ b/test/CodeGen/X86/atomic128.ll
@@ -4,9 +4,14 @@
define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LABEL: val_compare_and_swap:
+; Due to the scheduling right after isel for cmpxchg and given the
+; machine scheduler and copy coalescer do not mess up with physical
+; register live-ranges, we end up with a useless copy.
+;
+; CHECK: movq %rcx, [[TMP:%r[0-9a-z]+]]
; CHECK: movq %rsi, %rax
-; CHECK: movq %rcx, %rbx
; CHECK: movq %r8, %rcx
+; CHECK: movq [[TMP]], %rbx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
@@ -216,8 +221,8 @@ define i128 @atomic_load_seq_cst(i128* %p) {
; CHECK-LABEL: atomic_load_seq_cst:
; CHECK: xorl %eax, %eax
; CHECK: xorl %edx, %edx
-; CHECK: xorl %ebx, %ebx
; CHECK: xorl %ecx, %ecx
+; CHECK: xorl %ebx, %ebx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
@@ -229,8 +234,8 @@ define i128 @atomic_load_relaxed(i128* %p) {
; CHECK: atomic_load_relaxed:
; CHECK: xorl %eax, %eax
; CHECK: xorl %edx, %edx
-; CHECK: xorl %ebx, %ebx
; CHECK: xorl %ecx, %ecx
+; CHECK: xorl %ebx, %ebx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
index f6892de43d89..90716cc3984f 100644
--- a/test/CodeGen/X86/atomic16.ll
+++ b/test/CodeGen/X86/atomic16.ll
@@ -154,17 +154,19 @@ define void @atomic_fetch_nand16(i16 %x) nounwind {
}
define void @atomic_fetch_max16(i16 %x) nounwind {
+; X64-LABEL: atomic_fetch_max16
+; X32-LABEL: atomic_fetch_max16
%t1 = atomicrmw max i16* @sc16, i16 %x acquire
-; X64: movswl
-; X64: movswl
-; X64: subl
+; X64: movw
+; X64: movw
+; X64: subw
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: movswl
-; X32: movswl
-; X32: subl
+; X32: movw
+; X32: movw
+; X32: subw
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -174,17 +176,19 @@ define void @atomic_fetch_max16(i16 %x) nounwind {
}
define void @atomic_fetch_min16(i16 %x) nounwind {
+; X64-LABEL: atomic_fetch_min16
+; X32-LABEL: atomic_fetch_min16
%t1 = atomicrmw min i16* @sc16, i16 %x acquire
-; X64: movswl
-; X64: movswl
-; X64: subl
+; X64: movw
+; X64: movw
+; X64: subw
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: movswl
-; X32: movswl
-; X32: subl
+; X32: movw
+; X32: movw
+; X32: subw
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -194,17 +198,19 @@ define void @atomic_fetch_min16(i16 %x) nounwind {
}
define void @atomic_fetch_umax16(i16 %x) nounwind {
+; X64-LABEL: atomic_fetch_umax16
+; X32-LABEL: atomic_fetch_umax16
%t1 = atomicrmw umax i16* @sc16, i16 %x acquire
-; X64: movzwl
-; X64: movzwl
-; X64: subl
+; X64: movw
+; X64: movw
+; X64: subw
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: movzwl
-; X32: movzwl
-; X32: subl
+; X32: movw
+; X32: movw
+; X32: subw
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -214,17 +220,19 @@ define void @atomic_fetch_umax16(i16 %x) nounwind {
}
define void @atomic_fetch_umin16(i16 %x) nounwind {
+; X64-LABEL: atomic_fetch_umin16
+; X32-LABEL: atomic_fetch_umin16
%t1 = atomicrmw umin i16* @sc16, i16 %x acquire
-; X64: movzwl
-; X64: movzwl
-; X64: subl
+; X64: movw
+; X64: movw
+; X64: subw
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: movzwl
-; X32: movzwl
-; X32: subl
+; X32: movw
+; X32: movw
+; X32: subw
; X32: cmov
; X32: lock
; X32: cmpxchgw
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
index 5eef9b295e80..01123ae9b073 100644
--- a/test/CodeGen/X86/atomic8.ll
+++ b/test/CodeGen/X86/atomic8.ll
@@ -157,15 +157,15 @@ define void @atomic_fetch_max8(i8 %x) nounwind {
; X64-LABEL: atomic_fetch_max8:
; X32-LABEL: atomic_fetch_max8:
%t1 = atomicrmw max i8* @sc8, i8 %x acquire
-; X64: movsbl
-; X64: movsbl
-; X64: subl
+; X64: movb
+; X64: movb
+; X64: subb
; X64: lock
; X64: cmpxchgb
-; X32: movsbl
-; X32: movsbl
-; X32: subl
+; X32: movb
+; X32: movb
+; X32: subb
; X32: lock
; X32: cmpxchgb
ret void
@@ -177,15 +177,15 @@ define void @atomic_fetch_min8(i8 %x) nounwind {
; X64-LABEL: atomic_fetch_min8:
; X32-LABEL: atomic_fetch_min8:
%t1 = atomicrmw min i8* @sc8, i8 %x acquire
-; X64: movsbl
-; X64: movsbl
-; X64: subl
+; X64: movb
+; X64: movb
+; X64: subb
; X64: lock
; X64: cmpxchgb
-; X32: movsbl
-; X32: movsbl
-; X32: subl
+; X32: movb
+; X32: movb
+; X32: subb
; X32: lock
; X32: cmpxchgb
ret void
@@ -197,15 +197,15 @@ define void @atomic_fetch_umax8(i8 %x) nounwind {
; X64-LABEL: atomic_fetch_umax8:
; X32-LABEL: atomic_fetch_umax8:
%t1 = atomicrmw umax i8* @sc8, i8 %x acquire
-; X64: movzbl
-; X64: movzbl
-; X64: subl
+; X64: movb
+; X64: movb
+; X64: subb
; X64: lock
; X64: cmpxchgb
-; X32: movzbl
-; X32: movzbl
-; X32: subl
+; X32: movb
+; X32: movb
+; X32: subb
; X32: lock
; X32: cmpxchgb
ret void
@@ -217,15 +217,15 @@ define void @atomic_fetch_umin8(i8 %x) nounwind {
; X64-LABEL: atomic_fetch_umin8:
; X32-LABEL: atomic_fetch_umin8:
%t1 = atomicrmw umin i8* @sc8, i8 %x acquire
-; X64: movzbl
-; X64: movzbl
-; X64: subl
+; X64: movb
+; X64: movb
+; X64: subb
; X64: lock
; X64: cmpxchgb
-; X32: movzbl
-; X32: movzbl
-; X32: subl
+; X32: movb
+; X32: movb
+; X32: subb
; X32: lock
; X32: cmpxchgb
ret void
diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll
index 356d9dcff6fa..e9f1b59ac589 100644
--- a/test/CodeGen/X86/atomic_mi.ll
+++ b/test/CodeGen/X86/atomic_mi.ll
@@ -979,3 +979,20 @@ define void @fadd_64stack() {
store atomic i64 %bc1, i64* %ptr release, align 8
ret void
}
+
+define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) {
+; X64-LABEL: fadd_array:
+; X64-NOT: lock
+; X64: addsd ([[ADDR:%r..,%r..,8]]), %[[XMM:xmm[0-9]+]]
+; X64-NEXT: movsd %[[XMM]], ([[ADDR]])
+; X32-LABEL: fadd_array:
+; Don't check x86-32 (see comment above).
+bb:
+ %tmp4 = getelementptr inbounds i64, i64* %arg, i64 %arg2
+ %tmp6 = load atomic i64, i64* %tmp4 monotonic, align 8
+ %tmp7 = bitcast i64 %tmp6 to double
+ %tmp8 = fadd double %tmp7, %arg1
+ %tmp9 = bitcast double %tmp8 to i64
+ store atomic i64 %tmp9, i64* %tmp4 monotonic, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/avoid-loop-align.ll b/test/CodeGen/X86/avoid-loop-align.ll
index d82cf9418e64..9895b30800ec 100644
--- a/test/CodeGen/X86/avoid-loop-align.ll
+++ b/test/CodeGen/X86/avoid-loop-align.ll
@@ -4,7 +4,7 @@
; header in this case.
; CHECK: jmp LBB0_2
-; CHECK: .align
+; CHECK: .p2align
; CHECK: LBB0_1:
@A = common global [100 x i32] zeroinitializer, align 32 ; <[100 x i32]*> [#uses=1]
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 6857bb8bd112..b05dc71c175a 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -1,30 +1,44 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s
@x = common global <8 x float> zeroinitializer, align 32
@y = common global <4 x double> zeroinitializer, align 32
@z = common global <4 x float> zeroinitializer, align 16
define void @zero128() nounwind ssp {
-entry:
- ; CHECK: vxorps
- ; CHECK: vmovaps
+; CHECK-LABEL: zero128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: movq _z@{{.*}}(%rip), %rax
+; CHECK-NEXT: vmovaps %xmm0, (%rax)
+; CHECK-NEXT: retq
store <4 x float> zeroinitializer, <4 x float>* @z, align 16
ret void
}
define void @zero256() nounwind ssp {
-entry:
- ; CHECK: vxorps
- ; CHECK: vmovaps
- ; CHECK: vmovaps
+; CHECK-LABEL: zero256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movq _x@{{.*}}(%rip), %rax
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+; CHECK-NEXT: movq _y@{{.*}}(%rip), %rax
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* @x, align 32
store <4 x double> zeroinitializer, <4 x double>* @y, align 32
ret void
}
-; CHECK: vpcmpeqd
-; CHECK: vinsertf128 $1
define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
+; CHECK-LABEL: ones:
+; CHECK: ## BB#0: ## %allocas
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
allocas:
%ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
@@ -34,9 +48,14 @@ float>* %ptr2vec615, align 32
ret void
}
-; CHECK: vpcmpeqd
-; CHECK: vinsertf128 $1
define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind {
+; CHECK-LABEL: ones2:
+; CHECK: ## BB#0: ## %allocas
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
allocas:
%ptr2vec615 = bitcast [0 x i32]* %RET to <8 x i32>*
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %ptr2vec615, align 32
@@ -44,18 +63,22 @@ allocas:
}
;;; Just make sure this doesn't crash
-; CHECK: _ISelCrash
define <4 x i64> @ISelCrash(<4 x i64> %a) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: ISelCrash:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
ret <4 x i64> %shuffle
}
;;; Don't crash on movd
-; CHECK: _VMOVZQI2PQI
-; CHECK: vmovd (%
define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
-allocas:
+; CHECK-LABEL: VMOVZQI2PQI:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: retq
%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
%val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
%ptroffset.i22.i992 = getelementptr [0 x float], [0 x float]* %aFOO, i64 0, i64 1
@@ -67,35 +90,45 @@ allocas:
;;;; Don't crash on fneg
; rdar://10566486
-; CHECK: fneg
-; CHECK: vxorps
define <16 x float> @fneg(<16 x float> %a) nounwind {
+; CHECK-LABEL: fneg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: retq
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
;;; Don't crash on build vector
-; CHECK: @build_vec_16x16
-; CHECK: vmovd
define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
+; CHECK-LABEL: build_vec_16x16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: retq
%res = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a, i32 0
ret <16 x i16> %res
}
;;; Check that VMOVPQIto64rr generates the assembly string "vmovq". Previously
;;; an incorrect mnemonic of "movd" was printed for this instruction.
-; CHECK: VMOVPQIto64rr
-; CHECK: vmovq
define i64 @VMOVPQIto64rr(<2 x i64> %a) {
-entry:
+; CHECK-LABEL: VMOVPQIto64rr:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: retq
%vecext.i = extractelement <2 x i64> %a, i32 0
ret i64 %vecext.i
}
; PR22685
-; CHECK: mov00
-; CHECK: vmovss
define <8 x float> @mov00_8f32(float* %ptr) {
+; CHECK-LABEL: mov00_8f32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq
%val = load float, float* %ptr
%vec = insertelement <8 x float> zeroinitializer, float %val, i32 0
ret <8 x float> %vec
diff --git a/test/CodeGen/X86/avx-cast.ll b/test/CodeGen/X86/avx-cast.ll
index 34c5dfaa0162..103715c3628e 100644
--- a/test/CodeGen/X86/avx-cast.ll
+++ b/test/CodeGen/X86/avx-cast.ll
@@ -9,6 +9,7 @@
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castA:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: retq
@@ -19,6 +20,7 @@ define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castB:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: retq
@@ -31,12 +33,14 @@ define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castC:
; AVX1: ## BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: castC:
; AVX2: ## BB#0:
+; AVX2-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
@@ -50,6 +54,7 @@ define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castD:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -59,6 +64,7 @@ define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castE:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
@@ -68,6 +74,7 @@ define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castF:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 3923ca850d1a..be4920d1122d 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -15,9 +15,10 @@ declare i32 @func_int(i32, i32)
; WIN64: ret
; X32-LABEL: testf16_inp
-; X32: movl %eax, (%esp)
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
+; Push is not deemed profitable if we're realigning the stack.
+; X32: {{pushl|movl}} %eax
; X32: call
; X32: ret
@@ -114,8 +115,8 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; test functions with integer parameters
; pass parameters on stack for 32-bit platform
; X32-LABEL: test_int
-; X32: movl {{.*}}, 4(%esp)
-; X32: movl {{.*}}, (%esp)
+; X32: pushl {{.*}}
+; X32: pushl {{.*}}
; X32: call
; X32: addl {{.*}}, %eax
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..c7cf857e1d44
--- /dev/null
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -0,0 +1,3778 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
+
+define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_add_pd:
+; X32: # BB#0:
+; X32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_pd:
+; X64: # BB#0:
+; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fadd <4 x double> %a0, %a1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_add_ps:
+; X32: # BB#0:
+; X32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_ps:
+; X64: # BB#0:
+; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fadd <8 x float> %a0, %a1
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_addsub_pd:
+; X32: # BB#0:
+; X32-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_addsub_pd:
+; X64: # BB#0:
+; X64-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_addsub_ps:
+; X32: # BB#0:
+; X32-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_addsub_ps:
+; X64: # BB#0:
+; X64-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_and_pd:
+; X32: # BB#0:
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_and_pd:
+; X64: # BB#0:
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %res = and <4 x i64> %1, %2
+ %bc = bitcast <4 x i64> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_and_ps:
+; X32: # BB#0:
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_and_ps:
+; X64: # BB#0:
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <8 x float> %a0 to <8 x i32>
+ %2 = bitcast <8 x float> %a1 to <8 x i32>
+ %res = and <8 x i32> %1, %2
+ %bc = bitcast <8 x i32> %res to <8 x float>
+ ret <8 x float> %bc
+}
+
+define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_andnot_pd:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; X32-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_andnot_pd:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; X64-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %res = and <4 x i64> %3, %2
+ %bc = bitcast <4 x i64> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_andnot_ps:
+; X32: # BB#0:
+; X32-NEXT: vandnps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_andnot_ps:
+; X64: # BB#0:
+; X64-NEXT: vandnps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <8 x float> %a0 to <8 x i32>
+ %2 = bitcast <8 x float> %a1 to <8 x i32>
+ %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = and <8 x i32> %3, %2
+ %bc = bitcast <8 x i32> %res to <8 x float>
+ ret <8 x float> %bc
+}
+
+define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_blend_pd:
+; X32: # BB#0:
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blend_pd:
+; X64: # BB#0:
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_blend_ps:
+; X32: # BB#0:
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blend_ps:
+; X64: # BB#0:
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
+; X32-LABEL: test_mm256_blendv_pd:
+; X32: # BB#0:
+; X32-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blendv_pd:
+; X64: # BB#0:
+; X64-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
+; X32-LABEL: test_mm256_blendv_ps:
+; X32: # BB#0:
+; X32-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blendv_ps:
+; X64: # BB#0:
+; X64-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind {
+; X32-LABEL: test_mm256_broadcast_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcast_pd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double>* %a0 to i8*
+ %res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %arg0)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly
+
+define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind {
+; X32-LABEL: test_mm256_broadcast_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcast_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float>* %a0 to i8*
+ %res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %arg0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
+
+define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind {
+; X32-LABEL: test_mm256_broadcast_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcast_sd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
+ %ld = load double, double* %a0
+ %ins0 = insertelement <4 x double> undef, double %ld, i32 0
+ %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
+ %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
+ %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
+ ret <4 x double> %ins3
+}
+
+define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind {
+; X32-LABEL: test_mm_broadcast_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcast_ss:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-NEXT: retq
+ %ld = load float, float* %a0
+ %ins0 = insertelement <4 x float> undef, float %ld, i32 0
+ %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
+ %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
+ %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
+ ret <4 x float> %ins3
+}
+
+define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind {
+; X32-LABEL: test_mm256_broadcast_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcast_ss:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss (%rdi), %ymm0
+; X64-NEXT: retq
+ %ld = load float, float* %a0
+ %ins0 = insertelement <8 x float> undef, float %ld, i32 0
+ %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
+ %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
+ %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
+ %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
+ %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
+ %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
+ %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
+ ret <8 x float> %ins7
+}
+
+define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_castpd_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castpd_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x double> %a0 to <8 x float>
+ ret <8 x float> %res
+}
+
+define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_castpd_si256:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castpd_si256:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x double> %a0 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_castpd128_pd256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castpd128_pd256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ ret <4 x double> %res
+}
+
+define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_castpd256_pd128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castpd256_pd128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_castps_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castps_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <8 x float> %a0 to <4 x double>
+ ret <4 x double> %res
+}
+
+define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_castps_si256:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castps_si256:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <8 x float> %a0 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_castps128_ps256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castps128_ps256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x float> %res
+}
+
+define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_castps256_ps128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castps256_ps128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_castsi128_si256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castsi128_si256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_castsi256_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castsi256_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x i64> %a0 to <4 x double>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_castsi256_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castsi256_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x i64> %a0 to <8 x float>
+ ret <8 x float> %res
+}
+
+define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_castsi256_si128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castsi256_si128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
+ ret <2 x i64> %res
+}
+
+define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_ceil_pd:
+; X32: # BB#0:
+; X32-NEXT: vroundpd $2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_ceil_pd:
+; X64: # BB#0:
+; X64-NEXT: vroundpd $2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_ceil_ps:
+; X32: # BB#0:
+; X32-NEXT: vroundps $2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_ceil_ps:
+; X64: # BB#0:
+; X64-NEXT: vroundps $2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmp_pd:
+; X32: # BB#0:
+; X32-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmp_pd:
+; X64: # BB#0:
+; X64-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_cmp_pd:
+; X32: # BB#0:
+; X32-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmp_pd:
+; X64: # BB#0:
+; X64-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmp_ps:
+; X32: # BB#0:
+; X32-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmp_ps:
+; X64: # BB#0:
+; X64-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_cmp_ps:
+; X32: # BB#0:
+; X32-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmp_ps:
+; X64: # BB#0:
+; X64-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmp_sd:
+; X32: # BB#0:
+; X32-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmp_sd:
+; X64: # BB#0:
+; X64-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmp_ss:
+; X32: # BB#0:
+; X32-NEXT: vcmpgess %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmp_ss:
+; X64: # BB#0:
+; X64-NEXT: vcmpgess %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtepi32_pd:
+; X32: # BB#0:
+; X32-NEXT: vcvtdq2pd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi32_pd:
+; X64: # BB#0:
+; X64-NEXT: vcvtdq2pd %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = sitofp <4 x i32> %arg0 to <4 x double>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtepi32_ps:
+; X32: # BB#0:
+; X32-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi32_ps:
+; X64: # BB#0:
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %arg0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtpd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vcvtpd2dqy %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtpd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vcvtpd2dqy %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
+ %res = bitcast <4 x i32> %cvt to <2 x i64>
+ ret <2 x i64> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
+
+define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtpd_ps:
+; X32: # BB#0:
+; X32-NEXT: vcvtpd2psy %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtpd_ps:
+; X64: # BB#0:
+; X64-NEXT: vcvtpd2psy %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtps_epi32:
+; X32: # BB#0:
+; X32-NEXT: vcvtps2dq %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtps_epi32:
+; X64: # BB#0:
+; X64-NEXT: vcvtps2dq %ymm0, %ymm0
+; X64-NEXT: retq
+ %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
+ %res = bitcast <8 x i32> %cvt to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtps_pd:
+; X32: # BB#0:
+; X32-NEXT: vcvtps2pd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtps_pd:
+; X64: # BB#0:
+; X64-NEXT: vcvtps2pd %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = fpext <4 x float> %a0 to <4 x double>
+ ret <4 x double> %res
+}
+
+define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_cvttpd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvttpd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %cvt = fptosi <4 x double> %a0 to <4 x i32>
+ %res = bitcast <4 x i32> %cvt to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_cvttps_epi32:
+; X32: # BB#0:
+; X32-NEXT: vcvttps2dq %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvttps_epi32:
+; X64: # BB#0:
+; X64-NEXT: vcvttps2dq %ymm0, %ymm0
+; X64-NEXT: retq
+ %cvt = fptosi <8 x float> %a0 to <8 x i32>
+ %res = bitcast <8 x i32> %cvt to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_div_pd:
+; X32: # BB#0:
+; X32-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_div_pd:
+; X64: # BB#0:
+; X64-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fdiv <4 x double> %a0, %a1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_div_ps:
+; X32: # BB#0:
+; X32-NEXT: vdivps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_div_ps:
+; X64: # BB#0:
+; X64-NEXT: vdivps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fdiv <8 x float> %a0, %a1
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_dp_ps:
+; X32: # BB#0:
+; X32-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_dp_ps:
+; X64: # BB#0:
+; X64-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extract_epi8:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vpextrb $15, %xmm0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extract_epi8:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpextrb $15, %xmm0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %ext = extractelement <32 x i8> %arg0, i32 31
+ %res = zext i8 %ext to i32
+ ret i32 %res
+}
+
+define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extract_epi16:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vpextrw $3, %xmm0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extract_epi16:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpextrw $3, %xmm0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %ext = extractelement <16 x i16> %arg0, i32 11
+ %res = zext i16 %ext to i32
+ ret i32 %res
+}
+
+define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extract_epi32:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vpextrd $1, %xmm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extract_epi32:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpextrd $1, %xmm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = extractelement <8 x i32> %arg0, i32 5
+ ret i32 %res
+}
+
+define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extract_epi64:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vpextrd $2, %xmm0, %eax
+; X32-NEXT: vpextrd $3, %xmm0, %edx
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extract_epi64:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpextrq $1, %xmm0, %rax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = extractelement <4 x i64> %a0, i32 3
+ ret i64 %res
+}
+
+define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_extractf128_pd:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extractf128_pd:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_extractf128_ps:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extractf128_ps:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extractf128_si256:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extractf128_si256:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
+ ret <2 x i64> %res
+}
+
+define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_floor_pd:
+; X32: # BB#0:
+; X32-NEXT: vroundpd $1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_floor_pd:
+; X64: # BB#0:
+; X64-NEXT: vroundpd $1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_floor_ps:
+; X32: # BB#0:
+; X32-NEXT: vroundps $1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_floor_ps:
+; X64: # BB#0:
+; X64-NEXT: vroundps $1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_hadd_pd:
+; X32: # BB#0:
+; X32-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadd_pd:
+; X64: # BB#0:
+; X64-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_hadd_ps:
+; X32: # BB#0:
+; X32-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadd_ps:
+; X64: # BB#0:
+; X64-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_hsub_pd:
+; X32: # BB#0:
+; X32-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsub_pd:
+; X64: # BB#0:
+; X64-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_hsub_ps:
+; X32: # BB#0:
+; X32-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsub_ps:
+; X64: # BB#0:
+; X64-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
+; X32-LABEL: test_mm256_insert_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insert_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
+; X32-LABEL: test_mm256_insert_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insert_epi16:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vpinsrw $6, %edi, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
+; X32-LABEL: test_mm256_insert_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insert_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
+; X32-LABEL: test_mm256_insert_epi64:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm2
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insert_epi64:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_insertf128_pd:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insertf128_pd:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X64-NEXT: retq
+ %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_insertf128_ps:
+; X32: # BB#0:
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insertf128_ps:
+; X64: # BB#0:
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+
+define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_insertf128_si256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insertf128_si256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X64-NEXT: retq
+ %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm256_lddqu_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vlddqu (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_lddqu_si256:
+; X64: # BB#0:
+; X64-NEXT: vlddqu (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64>* %a0 to i8*
+ %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readnone
+
+define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm256_load_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_load_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovaps (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ %res = load <4 x double>, <4 x double>* %arg0, align 32
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm256_load_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_load_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovaps (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ %res = load <8 x float>, <8 x float>* %arg0, align 32
+ ret <8 x float> %res
+}
+
+define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm256_load_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_load_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovaps (%rdi), %ymm0
+; X64-NEXT: retq
+ %res = load <4 x i64>, <4 x i64>* %a0, align 32
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm256_loadu_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ %res = load <4 x double>, <4 x double>* %arg0, align 1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm256_loadu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ %res = load <8 x float>, <8 x float>* %arg0, align 1
+ ret <8 x float> %res
+}
+
+define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm256_loadu_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: retq
+ %res = load <4 x i64>, <4 x i64>* %a0, align 1
+ ret <4 x i64> %res
+}
+
+define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
+; X32-LABEL: test_mm256_loadu2_m128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu2_m128:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rsi), %xmm0
+; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %hi4 = load <4 x float>, <4 x float>* %arg0, align 1
+ %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %arg1 = bitcast float* %a1 to <4 x float>*
+ %lo4 = load <4 x float>, <4 x float>* %arg1, align 1
+ %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind {
+; X32-LABEL: test_mm256_loadu2_m128d:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu2_m128d:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rsi), %xmm0
+; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %hi2 = load <2 x double>, <2 x double>* %arg0, align 1
+ %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %arg1 = bitcast double* %a1 to <2 x double>*
+ %lo2 = load <2 x double>, <2 x double>* %arg1, align 1
+ %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %res
+}
+
+define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
+; X32-LABEL: test_mm256_loadu2_m128i:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu2_m128i:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rsi), %xmm0
+; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to <2 x i64>*
+ %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1
+ %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %arg1 = bitcast i64* %a1 to <2 x i64>*
+ %lo2 = load <2 x i64>, <2 x i64>* %arg1, align 1
+ %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i64> %res
+}
+
+define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_maskload_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskload_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to i8*
+ %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readnone
+
+define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_maskload_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskload_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to i8*
+ %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readnone
+
+define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_maskload_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskload_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readnone
+
+define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_maskload_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskload_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readnone
+
+define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
+; X32-LABEL: test_mm_maskstore_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskstore_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to i8*
+ call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind readnone
+
+define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
+; X32-LABEL: test_mm256_maskstore_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskstore_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to i8*
+ call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind readnone
+
+define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
+; X32-LABEL: test_mm_maskstore_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskstore_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind readnone
+
+define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
+; X32-LABEL: test_mm256_maskstore_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskstore_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_max_pd:
+; X32: # BB#0:
+; X32-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_max_ps:
+; X32: # BB#0:
+; X32-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_min_pd:
+; X32: # BB#0:
+; X32-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_pd:
+; X64: # BB#0:
+; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_min_ps:
+; X32: # BB#0:
+; X32-NEXT: vminps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_ps:
+; X64: # BB#0:
+; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_movedup_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movedup_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x float> %res
+}
+
+define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_movemask_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovmskpd %ymm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movemask_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovmskpd %ymm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
+
+define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_movemask_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovmskps %ymm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movemask_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovmskps %ymm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_mul_pd:
+; X32: # BB#0:
+; X32-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mul_pd:
+; X64: # BB#0:
+; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fmul <4 x double> %a0, %a1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_mul_ps:
+; X32: # BB#0:
+; X32-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mul_ps:
+; X64: # BB#0:
+; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fmul <8 x float> %a0, %a1
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_or_pd:
+; X32: # BB#0:
+; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_or_pd:
+; X64: # BB#0:
+; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %res = or <4 x i64> %1, %2
+ %bc = bitcast <4 x i64> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_or_ps:
+; X32: # BB#0:
+; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_or_ps:
+; X64: # BB#0:
+; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <8 x float> %a0 to <8 x i32>
+ %2 = bitcast <8 x float> %a1 to <8 x i32>
+ %res = or <8 x i32> %1, %2
+ %bc = bitcast <8 x i32> %res to <8 x float>
+ ret <8 x float> %bc
+}
+
+define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x double> %res
+}
+
+define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
+; X32-LABEL: test2_mm_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test2_mm_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_permute2f128_pd:
+; X32: # BB#0:
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2f128_pd:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 44)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+; PR26667
+define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_permute2f128_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovaps %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2f128_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %ymm1, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 50)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_permute2f128_si256:
+; X32: # BB#0:
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2f128_si256:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; X64-NEXT: retq
+ %1 = bitcast <4 x i64> %a0 to <8 x i32>
+ %2 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %1, <8 x i32> %2, i8 35)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
+
+define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_permutevar_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permutevar_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
+
+define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_permutevar_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutevar_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
+
+define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_permutevar_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permutevar_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
+
+define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_permutevar_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutevar_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
+
+define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_rcp_ps:
+; X32: # BB#0:
+; X32-NEXT: vrcpps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_rcp_ps:
+; X64: # BB#0:
+; X64-NEXT: vrcpps %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_round_pd:
+; X32: # BB#0:
+; X32-NEXT: vroundpd $4, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_round_pd:
+; X64: # BB#0:
+; X64-NEXT: vroundpd $4, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_round_ps:
+; X32: # BB#0:
+; X32-NEXT: vroundps $4, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_round_ps:
+; X64: # BB#0:
+; X64-NEXT: vroundps $4, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_rsqrt_ps:
+; X32: # BB#0:
+; X32-NEXT: vrsqrtps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_rsqrt_ps:
+; X64: # BB#0:
+; X64-NEXT: vrsqrtps %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
+; X32-LABEL: test_mm256_set_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm1
+; X32-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: vmovd %ecx, %xmm1
+; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <32 x i8> undef, i8 %a31, i32 0
+ %res1 = insertelement <32 x i8> %res0, i8 %a30, i32 1
+ %res2 = insertelement <32 x i8> %res1, i8 %a29, i32 2
+ %res3 = insertelement <32 x i8> %res2, i8 %a28, i32 3
+ %res4 = insertelement <32 x i8> %res3, i8 %a27, i32 4
+ %res5 = insertelement <32 x i8> %res4, i8 %a26, i32 5
+ %res6 = insertelement <32 x i8> %res5, i8 %a25, i32 6
+ %res7 = insertelement <32 x i8> %res6, i8 %a24, i32 7
+ %res8 = insertelement <32 x i8> %res7, i8 %a23, i32 8
+ %res9 = insertelement <32 x i8> %res8, i8 %a22, i32 9
+ %res10 = insertelement <32 x i8> %res9, i8 %a21, i32 10
+ %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
+ %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
+ %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
+ %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
+ %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
+ %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
+ %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
+ %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
+ %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
+ %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
+ %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
+ %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
+ %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
+ %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
+ %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
+ %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
+ %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
+ %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
+ %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
+ %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
+ %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
+ %res = bitcast <32 x i8> %res31 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
+; X32-LABEL: test_mm256_set_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_epi16:
+; X64: # BB#0:
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vmovd %eax, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i16> undef, i16 %a15, i32 0
+ %res1 = insertelement <16 x i16> %res0, i16 %a14, i32 1
+ %res2 = insertelement <16 x i16> %res1, i16 %a13, i32 2
+ %res3 = insertelement <16 x i16> %res2, i16 %a12, i32 3
+ %res4 = insertelement <16 x i16> %res3, i16 %a11, i32 4
+ %res5 = insertelement <16 x i16> %res4, i16 %a10, i32 5
+ %res6 = insertelement <16 x i16> %res5, i16 %a9 , i32 6
+ %res7 = insertelement <16 x i16> %res6, i16 %a8 , i32 7
+ %res8 = insertelement <16 x i16> %res7, i16 %a7 , i32 8
+ %res9 = insertelement <16 x i16> %res8, i16 %a6 , i32 9
+ %res10 = insertelement <16 x i16> %res9, i16 %a5 , i32 10
+ %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
+ %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
+ %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
+ %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
+ %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
+ %res = bitcast <16 x i16> %res15 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
+; X32-LABEL: test_mm256_set_epi32:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_epi32:
+; X64: # BB#0:
+; X64-NEXT: vmovd %ecx, %xmm0
+; X64-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
+; X64-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
+; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1
+; X64-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
+ %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
+ %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
+ %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
+ %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
+ %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
+ %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
+ %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
+ %res = bitcast <8 x i32> %res7 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
+; X32-LABEL: test_mm256_set_epi64x:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_epi64x:
+; X64: # BB#0:
+; X64-NEXT: vmovq %rdi, %xmm0
+; X64-NEXT: vmovq %rsi, %xmm1
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: vmovq %rdx, %xmm1
+; X64-NEXT: vmovq %rcx, %xmm2
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
+ %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
+ ret <4 x i64> %res3
+}
+
+define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_set_m128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_m128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_set_m128d:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_m128d:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x float>
+ %arg1 = bitcast <2 x double> %a1 to <4 x float>
+ %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x float> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_set_m128i:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_m128i:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x float>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x float>
+ %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x float> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
+; X32-LABEL: test_mm256_set_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x double> undef, double %a3, i32 0
+ %res1 = insertelement <4 x double> %res0, double %a2, i32 1
+ %res2 = insertelement <4 x double> %res1, double %a1, i32 2
+ %res3 = insertelement <4 x double> %res2, double %a0, i32 3
+ ret <4 x double> %res3
+}
+
+define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
+; X32-LABEL: test_mm256_set_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_ps:
+; X64: # BB#0:
+; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x float> undef, float %a7, i32 0
+ %res1 = insertelement <8 x float> %res0, float %a6, i32 1
+ %res2 = insertelement <8 x float> %res1, float %a5, i32 2
+ %res3 = insertelement <8 x float> %res2, float %a4, i32 3
+ %res4 = insertelement <8 x float> %res3, float %a3, i32 4
+ %res5 = insertelement <8 x float> %res4, float %a2, i32 5
+ %res6 = insertelement <8 x float> %res5, float %a1, i32 6
+ %res7 = insertelement <8 x float> %res6, float %a0, i32 7
+ ret <8 x float> %res7
+}
+
+define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
+; X32-LABEL: test_mm256_set1_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <32 x i8> undef, i8 %a0, i32 0
+ %res1 = insertelement <32 x i8> %res0, i8 %a0, i32 1
+ %res2 = insertelement <32 x i8> %res1, i8 %a0, i32 2
+ %res3 = insertelement <32 x i8> %res2, i8 %a0, i32 3
+ %res4 = insertelement <32 x i8> %res3, i8 %a0, i32 4
+ %res5 = insertelement <32 x i8> %res4, i8 %a0, i32 5
+ %res6 = insertelement <32 x i8> %res5, i8 %a0, i32 6
+ %res7 = insertelement <32 x i8> %res6, i8 %a0, i32 7
+ %res8 = insertelement <32 x i8> %res7, i8 %a0, i32 8
+ %res9 = insertelement <32 x i8> %res8, i8 %a0, i32 9
+ %res10 = insertelement <32 x i8> %res9, i8 %a0, i32 10
+ %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
+ %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
+ %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
+ %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
+ %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
+ %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
+ %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
+ %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
+ %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
+ %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
+ %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
+ %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
+ %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
+ %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
+ %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
+ %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
+ %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
+ %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
+ %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
+ %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
+ %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
+ %res = bitcast <32 x i8> %res31 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
+; X32-LABEL: test_mm256_set1_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_epi16:
+; X64: # BB#0:
+; X64-NEXT: vmovd %edi, %xmm0
+; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
+ %res1 = insertelement <16 x i16> %res0, i16 %a0, i32 1
+ %res2 = insertelement <16 x i16> %res1, i16 %a0, i32 2
+ %res3 = insertelement <16 x i16> %res2, i16 %a0, i32 3
+ %res4 = insertelement <16 x i16> %res3, i16 %a0, i32 4
+ %res5 = insertelement <16 x i16> %res4, i16 %a0, i32 5
+ %res6 = insertelement <16 x i16> %res5, i16 %a0, i32 6
+ %res7 = insertelement <16 x i16> %res6, i16 %a0, i32 7
+ %res8 = insertelement <16 x i16> %res7, i16 %a0, i32 8
+ %res9 = insertelement <16 x i16> %res8, i16 %a0, i32 9
+ %res10 = insertelement <16 x i16> %res9, i16 %a0, i32 10
+ %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
+ %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
+ %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
+ %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
+ %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
+ %res = bitcast <16 x i16> %res15 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
+; X32-LABEL: test_mm256_set1_epi32:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_epi32:
+; X64: # BB#0:
+; X64-NEXT: vmovd %edi, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
+ %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
+ %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
+ %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
+ %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
+ %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
+ %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
+ %res = bitcast <8 x i32> %res7 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
+; X32-LABEL: test_mm256_set1_epi64x:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_epi64x:
+; X64: # BB#0:
+; X64-NEXT: vmovq %rdi, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
+ %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
+ ret <4 x i64> %res3
+}
+
+define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
+; X32-LABEL: test_mm256_set1_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x double> undef, double %a0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %a0, i32 1
+ %res2 = insertelement <4 x double> %res1, double %a0, i32 2
+ %res3 = insertelement <4 x double> %res2, double %a0, i32 3
+ ret <4 x double> %res3
+}
+
+define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
+; X32-LABEL: test_mm256_set1_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x float> undef, float %a0, i32 0
+ %res1 = insertelement <8 x float> %res0, float %a0, i32 1
+ %res2 = insertelement <8 x float> %res1, float %a0, i32 2
+ %res3 = insertelement <8 x float> %res2, float %a0, i32 3
+ %res4 = insertelement <8 x float> %res3, float %a0, i32 4
+ %res5 = insertelement <8 x float> %res4, float %a0, i32 5
+ %res6 = insertelement <8 x float> %res5, float %a0, i32 6
+ %res7 = insertelement <8 x float> %res6, float %a0, i32 7
+ ret <8 x float> %res7
+}
+
+define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
+; X32-LABEL: test_mm256_setr_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm1
+; X32-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movzbl %dil, %esi
+; X64-NEXT: vmovd %esi, %xmm1
+; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <32 x i8> undef, i8 %a0 , i32 0
+ %res1 = insertelement <32 x i8> %res0, i8 %a1 , i32 1
+ %res2 = insertelement <32 x i8> %res1, i8 %a2 , i32 2
+ %res3 = insertelement <32 x i8> %res2, i8 %a3 , i32 3
+ %res4 = insertelement <32 x i8> %res3, i8 %a4 , i32 4
+ %res5 = insertelement <32 x i8> %res4, i8 %a5 , i32 5
+ %res6 = insertelement <32 x i8> %res5, i8 %a6 , i32 6
+ %res7 = insertelement <32 x i8> %res6, i8 %a7 , i32 7
+ %res8 = insertelement <32 x i8> %res7, i8 %a8 , i32 8
+ %res9 = insertelement <32 x i8> %res8, i8 %a9 , i32 9
+ %res10 = insertelement <32 x i8> %res9, i8 %a10, i32 10
+ %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
+ %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
+ %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
+ %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
+ %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
+ %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
+ %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
+ %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
+ %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
+ %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
+ %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
+ %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
+ %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
+ %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
+ %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
+ %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
+ %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
+ %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
+ %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
+ %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
+ %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
+ %res = bitcast <32 x i8> %res31 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
+; X32-LABEL: test_mm256_setr_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_epi16:
+; X64: # BB#0:
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; X64-NEXT: vmovd %edi, %xmm1
+; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
+; X64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; X64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; X64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1
+; X64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i16> undef, i16 %a0 , i32 0
+ %res1 = insertelement <16 x i16> %res0, i16 %a1 , i32 1
+ %res2 = insertelement <16 x i16> %res1, i16 %a2 , i32 2
+ %res3 = insertelement <16 x i16> %res2, i16 %a3 , i32 3
+ %res4 = insertelement <16 x i16> %res3, i16 %a4 , i32 4
+ %res5 = insertelement <16 x i16> %res4, i16 %a5 , i32 5
+ %res6 = insertelement <16 x i16> %res5, i16 %a6 , i32 6
+ %res7 = insertelement <16 x i16> %res6, i16 %a7 , i32 7
+ %res8 = insertelement <16 x i16> %res7, i16 %a8 , i32 8
+ %res9 = insertelement <16 x i16> %res8, i16 %a9 , i32 9
+ %res10 = insertelement <16 x i16> %res9, i16 %a10, i32 10
+ %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
+ %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
+ %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
+ %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
+ %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
+ %res = bitcast <16 x i16> %res15 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
+; X32-LABEL: test_mm256_setr_epi32:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_epi32:
+; X64: # BB#0:
+; X64-NEXT: vmovd %r8d, %xmm0
+; X64-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0
+; X64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-NEXT: vmovd %edi, %xmm1
+; X64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; X64-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
+ %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
+ %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
+ %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
+ %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
+ %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
+ %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
+ %res = bitcast <8 x i32> %res7 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
+; X32-LABEL: test_mm256_setr_epi64x:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_epi64x:
+; X64: # BB#0:
+; X64-NEXT: vmovq %rcx, %xmm0
+; X64-NEXT: vmovq %rdx, %xmm1
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: vmovq %rsi, %xmm1
+; X64-NEXT: vmovq %rdi, %xmm2
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
+ %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
+ ret <4 x i64> %res3
+}
+
+define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_setr_m128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_m128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_setr_m128d:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_m128d:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x float>
+ %arg1 = bitcast <2 x double> %a1 to <4 x float>
+ %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x float> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_setr_m128i:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_m128i:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x float>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x float>
+ %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x float> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
+; X32-LABEL: test_mm256_setr_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x double> undef, double %a0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %a1, i32 1
+ %res2 = insertelement <4 x double> %res1, double %a2, i32 2
+ %res3 = insertelement <4 x double> %res2, double %a3, i32 3
+ ret <4 x double> %res3
+}
+
+define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
+; X32-LABEL: test_mm256_setr_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_ps:
+; X64: # BB#0:
+; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x float> undef, float %a0, i32 0
+ %res1 = insertelement <8 x float> %res0, float %a1, i32 1
+ %res2 = insertelement <8 x float> %res1, float %a2, i32 2
+ %res3 = insertelement <8 x float> %res2, float %a3, i32 3
+ %res4 = insertelement <8 x float> %res3, float %a4, i32 4
+ %res5 = insertelement <8 x float> %res4, float %a5, i32 5
+ %res6 = insertelement <8 x float> %res5, float %a6, i32 6
+ %res7 = insertelement <8 x float> %res6, float %a7, i32 7
+ ret <8 x float> %res7
+}
+
+define <4 x double> @test_mm256_setzero_pd() nounwind {
+; X32-LABEL: test_mm256_setzero_pd:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setzero_pd:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ ret <4 x double> zeroinitializer
+}
+
+define <8 x float> @test_mm256_setzero_ps() nounwind {
+; X32-LABEL: test_mm256_setzero_ps:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setzero_ps:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ ret <8 x float> zeroinitializer
+}
+
+define <4 x i64> @test_mm256_setzero_si256() nounwind {
+; X32-LABEL: test_mm256_setzero_si256:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setzero_si256:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ ret <4 x i64> zeroinitializer
+}
+
+define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_sqrt_pd:
+; X32: # BB#0:
+; X32-NEXT: vsqrtpd %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sqrt_pd:
+; X64: # BB#0:
+; X64-NEXT: vsqrtpd %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_sqrt_ps:
+; X32: # BB#0:
+; X32-NEXT: vsqrtps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sqrt_ps:
+; X64: # BB#0:
+; X64-NEXT: vsqrtps %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_store_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_store_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ store <4 x double> %a1, <4 x double>* %arg0, align 32
+ ret void
+}
+
+define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_store_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_store_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ store <8 x float> %a1, <8 x float>* %arg0, align 32
+ ret void
+}
+
+define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_store_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_store_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ store <4 x i64> %a1, <4 x i64>* %a0, align 32
+ ret void
+}
+
+define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_storeu_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ store <4 x double> %a1, <4 x double>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_storeu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ store <8 x float> %a1, <8 x float>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_storeu_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ store <4 x i64> %a1, <4 x i64>* %a0, align 1
+ ret void
+}
+
+define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind {
+; X32-LABEL: test_mm256_storeu2_m128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups %xmm0, (%ecx)
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vmovups %xmm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu2_m128:
+; X64: # BB#0:
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vmovups %xmm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x float> %lo, <4 x float>* %arg0, align 1
+ %arg1 = bitcast float* %a1 to <4 x float>*
+ %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ store <4 x float> %hi, <4 x float>* %arg1, align 1
+ ret void
+}
+
+define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind {
+; X32-LABEL: test_mm256_storeu2_m128d:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups %xmm0, (%ecx)
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vmovups %xmm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu2_m128d:
+; X64: # BB#0:
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vmovups %xmm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
+ store <2 x double> %lo, <2 x double>* %arg0, align 1
+ %arg1 = bitcast double* %a1 to <2 x double>*
+ %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
+ store <2 x double> %hi, <2 x double>* %arg1, align 1
+ ret void
+}
+
+define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind {
+; X32-LABEL: test_mm256_storeu2_m128i:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups %xmm0, (%ecx)
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vmovups %xmm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu2_m128i:
+; X64: # BB#0:
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vmovups %xmm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64>* %a0 to <2 x i64>*
+ %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
+ store <2 x i64> %lo, <2 x i64>* %arg0, align 1
+ %arg1 = bitcast <2 x i64>* %a1 to <2 x i64>*
+ %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
+ store <2 x i64> %hi, <2 x i64>* %arg1, align 1
+ ret void
+}
+
+define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_stream_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovntps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_stream_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovntps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0
+ ret void
+}
+
+define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_stream_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovntps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_stream_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovntps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0
+ ret void
+}
+
+define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_stream_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovntps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_stream_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovntps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0
+ ret void
+}
+
+define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_pd:
+; X32: # BB#0:
+; X32-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_pd:
+; X64: # BB#0:
+; X64-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fsub <4 x double> %a0, %a1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_ps:
+; X32: # BB#0:
+; X32-NEXT: vsubps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_ps:
+; X64: # BB#0:
+; X64-NEXT: vsubps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fsub <8 x float> %a0, %a1
+ ret <8 x float> %res
+}
+
+define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_testc_pd:
+; X32: # BB#0:
+; X32-NEXT: vtestpd %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testc_pd:
+; X64: # BB#0:
+; X64-NEXT: vtestpd %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_testc_pd:
+; X32: # BB#0:
+; X32-NEXT: vtestpd %ymm1, %ymm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testc_pd:
+; X64: # BB#0:
+; X64-NEXT: vtestpd %ymm1, %ymm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_testc_ps:
+; X32: # BB#0:
+; X32-NEXT: vtestps %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testc_ps:
+; X64: # BB#0:
+; X64-NEXT: vtestps %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_testc_ps:
+; X32: # BB#0:
+; X32-NEXT: vtestps %ymm1, %ymm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testc_ps:
+; X64: # BB#0:
+; X64-NEXT: vtestps %ymm1, %ymm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_testc_si256:
+; X32: # BB#0:
+; X32-NEXT: vptest %ymm1, %ymm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testc_si256:
+; X64: # BB#0:
+; X64-NEXT: vptest %ymm1, %ymm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_testnzc_pd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestpd %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testnzc_pd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestpd %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_testnzc_pd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestpd %ymm1, %ymm0
+; X32-NEXT: seta %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testnzc_pd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestpd %ymm1, %ymm0
+; X64-NEXT: seta %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_testnzc_ps:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestps %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testnzc_ps:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestps %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_testnzc_ps:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestps %ymm1, %ymm0
+; X32-NEXT: seta %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testnzc_ps:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestps %ymm1, %ymm0
+; X64-NEXT: seta %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_testnzc_si256:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vptest %ymm1, %ymm0
+; X32-NEXT: seta %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testnzc_si256:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vptest %ymm1, %ymm0
+; X64-NEXT: seta %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_testz_pd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestpd %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testz_pd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestpd %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_testz_pd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestpd %ymm1, %ymm0
+; X32-NEXT: sete %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testz_pd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestpd %ymm1, %ymm0
+; X64-NEXT: sete %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_testz_ps:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestps %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testz_ps:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestps %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_testz_ps:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestps %ymm1, %ymm0
+; X32-NEXT: sete %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testz_ps:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestps %ymm1, %ymm0
+; X64-NEXT: sete %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_testz_si256:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vptest %ymm1, %ymm0
+; X32-NEXT: sete %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testz_si256:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vptest %ymm1, %ymm0
+; X64-NEXT: sete %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <2 x double> @test_mm_undefined_pd() nounwind {
+; X32-LABEL: test_mm_undefined_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <2 x double> undef
+}
+
+define <4 x double> @test_mm256_undefined_pd() nounwind {
+; X32-LABEL: test_mm256_undefined_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_undefined_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <4 x double> undef
+}
+
+define <8 x float> @test_mm256_undefined_ps() nounwind {
+; X32-LABEL: test_mm256_undefined_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_undefined_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <8 x float> undef
+}
+
+define <4 x i64> @test_mm256_undefined_si256() nounwind {
+; X32-LABEL: test_mm256_undefined_si256:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_undefined_si256:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <4 x i64> undef
+}
+
+define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_xor_pd:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_xor_pd:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %res = xor <4 x i64> %1, %2
+ %bc = bitcast <4 x i64> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_xor_ps:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_xor_ps:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <8 x float> %a0 to <8 x i32>
+ %2 = bitcast <8 x float> %a1 to <8 x i32>
+ %res = xor <8 x i32> %1, %2
+ %bc = bitcast <8 x i32> %res to <8 x float>
+ ret <8 x float> %bc
+}
+
+define void @test_mm256_zeroall() nounwind {
+; X32-LABEL: test_mm256_zeroall:
+; X32: # BB#0:
+; X32-NEXT: vzeroall
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_zeroall:
+; X64: # BB#0:
+; X64-NEXT: vzeroall
+; X64-NEXT: retq
+ call void @llvm.x86.avx.vzeroall()
+ ret void
+}
+declare void @llvm.x86.avx.vzeroall() nounwind readnone
+
+define void @test_mm256_zeroupper() nounwind {
+; X32-LABEL: test_mm256_zeroupper:
+; X32: # BB#0:
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_zeroupper:
+; X64: # BB#0:
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ call void @llvm.x86.avx.vzeroupper()
+ ret void
+}
+declare void @llvm.x86.avx.vzeroupper() nounwind readnone
+
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 4867869863e3..a7b4c6b285d8 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,26 +1,33 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx | FileCheck %s
-; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
-; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
-; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
-; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
ret <8 x i32> %res
}
@@ -29,34 +36,46 @@ define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1
; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
; not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
-; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; CHECK-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-; We don't check any vextractf128 variant with immediate 0 because that's just a move.
+; We don't check any vextractf128 variant with immediate 0 because that's just a move.
define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
-; CHECK: vextractf128 $1, %ymm0, %xmm0
+; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
-; CHECK: vextractf128 $1, %ymm0, %xmm0
+; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
-; CHECK: vextractf128 $1, %ymm0, %xmm0
+; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
ret <4 x i32> %res
}
@@ -66,16 +85,21 @@ declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind read
; of a vextractf128 $0 which should be optimized away, so just check that it's
; not a vextractf128 of any kind.
define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
-; CHECK-NOT: vextractf128
+; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
ret <2 x double> %res
}
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_blend_pd_256:
-; CHECK: vblendpd
+; CHECK-LABEL: test_x86_avx_blend_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; CHECK-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -83,8 +107,10 @@ declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32)
define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_blend_ps_256:
-; CHECK: vblendps
+; CHECK-LABEL: test_x86_avx_blend_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -92,8 +118,10 @@ declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) no
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_dp_ps_256:
-; CHECK: vdpps
+; CHECK-LABEL: test_x86_avx_dp_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -101,8 +129,10 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounw
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_sse2_psll_dq:
-; CHECK: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-LABEL: test_x86_sse2_psll_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -110,8 +140,10 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrl_dq:
-; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-LABEL: test_x86_sse2_psrl_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -119,8 +151,10 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_blendpd:
-; CHECK: vblendpd
+; CHECK-LABEL: test_x86_sse41_blendpd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -128,8 +162,10 @@ declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nou
define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_blendps:
-; CHECK: vblendps
+; CHECK-LABEL: test_x86_sse41_blendps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -137,8 +173,10 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwi
define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse41_pblendw:
-; CHECK: vpblendw
+; CHECK-LABEL: test_x86_sse41_pblendw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -147,7 +185,7 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind rea
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
@@ -158,7 +196,7 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
@@ -169,7 +207,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
@@ -180,7 +218,7 @@ declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
@@ -191,7 +229,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
@@ -202,10 +240,282 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: retl
+ %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_cvtdq2_pd_256(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx_cvtdq2_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvtps2pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_cvt_ps2_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
+
+
+define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
+
+
+define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
+ ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqu %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+
+define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
+ ; fadd operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovupd %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
+ call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
+
+
+define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
+; CHECK-LABEL: test_x86_sse_storeu_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vmovups %xmm0, (%eax)
+; CHECK-NEXT: retl
+ call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
+ ret void
+}
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+
+define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
+ ; FIXME: unfortunately the execution domain fix pass changes this to vmovups and it's hard to force with no 256-bit integer instructions
+ ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_dq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+
+define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
+ ; fadd operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovupd %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
+ call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
+
+
+define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
+; CHECK-LABEL: test_x86_avx_storeu_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vmovups %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
+ ret void
+}
+declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
+
+
+define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_vpermil_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_vpermil_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
+; CHECK-NEXT: retl
+ %res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind readnone
+
+
+define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_vpermil_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,0]
+; CHECK-NEXT: retl
+ %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_vpermil_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,0,7,5,4,4]
+; CHECK-NEXT: retl
+ %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1]
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 206be2396cba..35763297d816 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1,11 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mattr=avx,aes,pclmul | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512vl,aes,pclmul | FileCheck %s --check-prefix=AVX512VL
define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesdec:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesdec %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesdec:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesdec %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesdec:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesdec %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -13,10 +19,15 @@ declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesdeclast:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesdeclast:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesdeclast:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -24,10 +35,15 @@ declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind read
define <2 x i64> @test_x86_aesni_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesenc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesenc %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesenc:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesenc %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesenc:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesenc %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -35,10 +51,15 @@ declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesenclast:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesenclast %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesenclast:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesenclast %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesenclast:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesenclast %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -46,10 +67,15 @@ declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind read
define <2 x i64> @test_x86_aesni_aesimc(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_aesni_aesimc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesimc %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesimc:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesimc %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesimc:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesimc %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -57,10 +83,15 @@ declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aeskeygenassist(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_aesni_aeskeygenassist:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaeskeygenassist $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aeskeygenassist:
+; AVX: ## BB#0:
+; AVX-NEXT: vaeskeygenassist $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aeskeygenassist:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaeskeygenassist $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -68,10 +99,15 @@ declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readno
define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_add_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_add_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_add_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -79,10 +115,15 @@ declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cmp_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cmp_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cmp_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -90,10 +131,15 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cmp_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cmp_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cmp_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -101,12 +147,23 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comieq_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comieq_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcomisd %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comieq_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: andb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -114,12 +171,19 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comige_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comige_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomisd %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comige_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -127,12 +191,19 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comigt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comigt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomisd %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comigt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -140,12 +211,19 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comile_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comile_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomisd %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comile_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomisd %xmm0, %xmm1
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -153,12 +231,19 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comilt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comilt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomisd %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comilt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomisd %xmm0, %xmm1
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -166,34 +251,39 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comineq_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comineq_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcomisd %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comineq_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setp %al
+; AVX512VL-NEXT: setne %cl
+; AVX512VL-NEXT: orb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
-; CHECK-NEXT: retl
- %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
-
-
define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtdq2ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtdq2ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtdq2ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -201,10 +291,15 @@ declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtpd2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtpd2dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtpd2dq %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtpd2dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -212,10 +307,15 @@ declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtpd2ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtpd2ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtpd2ps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtpd2ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtpd2ps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -223,32 +323,31 @@ declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtps2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtps2dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtps2dq %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtps2dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtps2pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0
-; CHECK-NEXT: retl
- %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
-
-
define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtsd2si:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtsd2si:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtsd2si %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtsd2si:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtsd2si %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -256,33 +355,47 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cvtsd2ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtsd2ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtsd2ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtsi2sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: vcvtsi2sdl %eax, %xmm0, %xmm0
-; CHECK-NEXT: retl
- %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
+define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
+; AVX-LABEL: test_x86_sse2_cvtsi2sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtsi2sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse2_cvtss2sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtss2sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtss2sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -290,10 +403,15 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttpd2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvttpd2dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvttpd2dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -301,10 +419,15 @@ declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttps2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvttps2dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvttps2dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -312,10 +435,15 @@ declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttsd2si:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttsd2si %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvttsd2si:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttsd2si %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvttsd2si:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttsd2si %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -323,10 +451,15 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_div_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdivsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_div_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_div_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -335,10 +468,15 @@ declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_max_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_max_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_max_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -346,10 +484,15 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_max_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_max_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_max_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -357,10 +500,15 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_min_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_min_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_min_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -368,10 +516,15 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_min_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_min_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_min_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -379,10 +532,15 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_movmsk_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskpd %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_movmsk_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovmskpd %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_movmsk_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovmskpd %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -392,10 +550,15 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_mul_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_mul_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_mul_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -403,10 +566,15 @@ declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind
define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_packssdw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_packssdw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_packssdw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -414,10 +582,15 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind rea
define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_packsswb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_packsswb_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_packsswb_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -425,10 +598,15 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_packuswb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_packuswb_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_packuswb_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -436,10 +614,15 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_padds_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_padds_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_padds_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -447,10 +630,15 @@ declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_padds_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_padds_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_padds_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -458,10 +646,15 @@ declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_paddus_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_paddus_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_paddus_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -469,10 +662,15 @@ declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_paddus_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_paddus_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_paddus_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -480,10 +678,15 @@ declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_pavg_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pavg_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pavg_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -491,10 +694,15 @@ declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pavg_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pavg_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pavg_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -502,10 +710,15 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmadd_wd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmadd_wd:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmadd_wd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -513,10 +726,15 @@ declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_x86_sse2_pmaxs_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmaxs_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmaxs_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmaxs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -524,10 +742,15 @@ declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pmaxu_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmaxu_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmaxu_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmaxu_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -535,10 +758,15 @@ declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmins_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmins_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmins_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmins_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -546,10 +774,15 @@ declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pminu_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_pminu_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pminu_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pminu_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -557,10 +790,15 @@ declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse2_pmovmskb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovmskb %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmovmskb_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmovmskb %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmovmskb_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmovmskb %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -568,10 +806,15 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmulh_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmulh_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmulh_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -579,10 +822,15 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmulhu_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmulhu_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmulhu_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -590,10 +838,15 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmulu_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmulu_dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmulu_dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -601,10 +854,15 @@ declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_psad_bw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psad_bw:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psad_bw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -612,10 +870,15 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_psll_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psll_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psll_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -623,10 +886,15 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse2_psll_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psll_q:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psll_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -634,10 +902,15 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psll_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psll_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psll_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -645,10 +918,15 @@ declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_pslli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpslld $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pslli_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpslld $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pslli_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpslld $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -656,10 +934,15 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_sse2_pslli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllq $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pslli_q:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsllq $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pslli_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -667,10 +950,15 @@ declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse2_pslli_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pslli_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pslli_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -678,10 +966,15 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_psra_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psra_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psra_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -689,10 +982,15 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psra_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psra_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psra_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -700,10 +998,15 @@ declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrai_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrad $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrai_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrad $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrai_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrad $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -711,10 +1014,15 @@ declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrai_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsraw $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrai_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsraw $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrai_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsraw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -722,10 +1030,15 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_psrl_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrl_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrl_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -733,10 +1046,15 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse2_psrl_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrl_q:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrl_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -744,10 +1062,15 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psrl_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrl_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrl_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -755,10 +1078,15 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrld $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrli_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrld $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrli_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrld $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -766,10 +1094,15 @@ declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlq $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrli_q:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrli_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -777,10 +1110,15 @@ declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrli_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlw $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrli_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrli_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -788,10 +1126,15 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_psubs_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psubs_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psubs_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -799,10 +1142,15 @@ declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psubs_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psubs_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psubs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -810,10 +1158,15 @@ declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_psubus_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psubus_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psubus_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -821,10 +1174,15 @@ declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psubus_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psubus_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psubus_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -832,10 +1190,15 @@ declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_sqrt_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtpd %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_sqrt_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtpd %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_sqrt_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtpd %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -843,65 +1206,31 @@ declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_sqrt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_sqrt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_sqrt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_storel_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovlps %xmm0, (%eax)
-; CHECK-NEXT: retl
- call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
- ret void
-}
-declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
-
-
-define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
- ; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_sse2_storeu_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpaddb LCPI77_0, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, (%eax)
-; CHECK-NEXT: retl
- %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
- ret void
-}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
-
-
-define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
- ; fadd operation forces the execution domain.
-; CHECK-LABEL: test_x86_sse2_storeu_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovupd %xmm0, (%eax)
-; CHECK-NEXT: retl
- %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
- call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
- ret void
-}
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
-
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_sub_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsubsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_sub_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_sub_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -909,12 +1238,23 @@ declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomieq_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomieq_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vucomisd %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomieq_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: andb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -922,12 +1262,19 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomige_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomige_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomisd %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomige_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -935,12 +1282,19 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomigt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomigt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomisd %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomigt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -948,12 +1302,19 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomile_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomile_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomisd %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomile_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomisd %xmm0, %xmm1
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -961,12 +1322,19 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomilt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomilt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomisd %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomilt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomisd %xmm0, %xmm1
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -974,12 +1342,23 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomineq_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomineq_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vucomisd %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomineq_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setp %al
+; AVX512VL-NEXT: setne %cl
+; AVX512VL-NEXT: orb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -987,10 +1366,15 @@ declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind read
define <2 x double> @test_x86_sse3_addsub_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_addsub_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_addsub_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_addsub_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -998,10 +1382,15 @@ declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwi
define <4 x float> @test_x86_sse3_addsub_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_addsub_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_addsub_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_addsub_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1009,10 +1398,15 @@ declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind
define <2 x double> @test_x86_sse3_hadd_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_hadd_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_hadd_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_hadd_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1020,10 +1414,15 @@ declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_x86_sse3_hadd_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_hadd_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_hadd_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_hadd_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1031,10 +1430,15 @@ declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind re
define <2 x double> @test_x86_sse3_hsub_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_hsub_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_hsub_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_hsub_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1042,10 +1446,15 @@ declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_x86_sse3_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_hsub_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_hsub_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_hsub_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1053,11 +1462,17 @@ declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind re
define <16 x i8> @test_x86_sse3_ldu_dq(i8* %a0) {
-; CHECK-LABEL: test_x86_sse3_ldu_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vlddqu (%eax), %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_ldu_dq:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vlddqu (%eax), %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_ldu_dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vlddqu (%eax), %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1065,10 +1480,15 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-; CHECK-LABEL: test_x86_sse41_blendvpd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_blendvpd:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_blendvpd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1076,10 +1496,15 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-; CHECK-LABEL: test_x86_sse41_blendvps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_blendvps:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_blendvps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1087,10 +1512,15 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_dppd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_dppd:
+; AVX: ## BB#0:
+; AVX-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_dppd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1098,10 +1528,15 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_dpps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_dpps:
+; AVX: ## BB#0:
+; AVX-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_dpps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1109,11 +1544,16 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_insertps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[3]
-; CHECK-NEXT: retl
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
+; AVX-LABEL: test_x86_sse41_insertps:
+; AVX: ## BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_insertps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
+; AVX512VL-NEXT: retl
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 21) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
@@ -1121,10 +1561,15 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse41_mpsadbw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_mpsadbw:
+; AVX: ## BB#0:
+; AVX-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_mpsadbw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -1132,10 +1577,15 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_packusdw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_packusdw:
+; AVX: ## BB#0:
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_packusdw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -1143,10 +1593,15 @@ declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readno
define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse41_pblendvb:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pblendvb:
+; AVX: ## BB#0:
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pblendvb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1154,10 +1609,15 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse41_phminposuw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphminposuw %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_phminposuw:
+; AVX: ## BB#0:
+; AVX-NEXT: vphminposuw %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_phminposuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -1165,10 +1625,15 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmaxsb:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmaxsb:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmaxsb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1176,10 +1641,15 @@ declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmaxsd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmaxsd:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmaxsd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1187,10 +1657,15 @@ declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmaxud:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmaxud:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmaxud:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1198,10 +1673,15 @@ declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmaxuw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmaxuw:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmaxuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -1209,10 +1689,15 @@ declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse41_pminsb:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pminsb:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pminsb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1220,10 +1705,15 @@ declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pminsd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pminsd:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pminsd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1231,10 +1721,15 @@ declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pminud:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pminud:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pminud:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1242,87 +1737,31 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse41_pminuw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pminuw:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pminuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
-define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxbd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxbq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: retl
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxbw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: retl
- %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxdq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-NEXT: retl
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxwd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxwq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NEXT: retl
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
-
-
define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmuldq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmuldq:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmuldq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -1330,12 +1769,19 @@ declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_ptestc:
+; AVX: ## BB#0:
+; AVX-NEXT: vptest %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_ptestc:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vptest %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1343,12 +1789,19 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestnzc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_ptestnzc:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_ptestnzc:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vptest %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1356,12 +1809,19 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestz:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_ptestz:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %xmm1, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_ptestz:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vptest %xmm1, %xmm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1369,10 +1829,15 @@ declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse41_round_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundpd $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_round_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_round_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundpd $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1380,10 +1845,15 @@ declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readno
define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse41_round_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundps $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_round_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_round_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundps $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1391,10 +1861,15 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_round_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_round_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_round_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1402,10 +1877,15 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_round_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_round_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_round_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1413,13 +1893,21 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun
define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestri128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestri128:
+; AVX: ## BB#0:
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestri128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1427,16 +1915,27 @@ declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nou
define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestri128_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovdqa (%eax), %xmm0
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, (%ecx), %xmm0
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestri128_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmovdqa (%eax), %xmm0
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestri $7, (%ecx), %xmm0
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestri128_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmovdqa64 (%eax), %xmm0
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestri $7, (%ecx), %xmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a0
%2 = load <16 x i8>, <16 x i8>* %a2
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
@@ -1444,15 +1943,30 @@ define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
}
-define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestria128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; AVX-LABEL: test_x86_sse42_pcmpestria128:
+; AVX: ## BB#0:
+; AVX-NEXT: pushl %ebx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: xorl %ebx, %ebx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: seta %bl
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popl %ebx
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestria128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: pushl %ebx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: seta %bl
+; AVX512VL-NEXT: movl %ebx, %eax
+; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1460,59 +1974,113 @@ declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestric128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestric128:
+; AVX: ## BB#0:
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestric128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestrio128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: seto %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; AVX-LABEL: test_x86_sse42_pcmpestrio128:
+; AVX: ## BB#0:
+; AVX-NEXT: pushl %ebx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: xorl %ebx, %ebx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: seto %bl
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popl %ebx
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestrio128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: pushl %ebx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: seto %bl
+; AVX512VL-NEXT: movl %ebx, %eax
+; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestris128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: sets %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; AVX-LABEL: test_x86_sse42_pcmpestris128:
+; AVX: ## BB#0:
+; AVX-NEXT: pushl %ebx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: xorl %ebx, %ebx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: sets %bl
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popl %ebx
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestris128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: pushl %ebx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sets %bl
+; AVX512VL-NEXT: movl %ebx, %eax
+; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestriz128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; AVX-LABEL: test_x86_sse42_pcmpestriz128:
+; AVX: ## BB#0:
+; AVX-NEXT: pushl %ebx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: xorl %ebx, %ebx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: sete %bl
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popl %ebx
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestriz128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: pushl %ebx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sete %bl
+; AVX512VL-NEXT: movl %ebx, %eax
+; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1520,12 +2088,19 @@ declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestrm128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestrm $7, %xmm1, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestrm128:
+; AVX: ## BB#0:
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestrm $7, %xmm1, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestrm128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestrm $7, %xmm1, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1533,13 +2108,21 @@ declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i
define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestrm128_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestrm $7, (%ecx), %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestrm128_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestrm $7, (%ecx), %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestrm128_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestrm $7, (%ecx), %xmm0
+; AVX512VL-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a2
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -1547,11 +2130,17 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2
define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistri128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistri128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistri128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1559,14 +2148,23 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistri128_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: vmovdqa (%ecx), %xmm0
-; CHECK-NEXT: vpcmpistri $7, (%eax), %xmm0
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistri128_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: vmovdqa (%ecx), %xmm0
+; AVX-NEXT: vpcmpistri $7, (%eax), %xmm0
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistri128_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: vmovdqa64 (%ecx), %xmm0
+; AVX512VL-NEXT: vpcmpistri $7, (%eax), %xmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a0
%2 = load <16 x i8>, <16 x i8>* %a1
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
@@ -1575,12 +2173,19 @@ define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistria128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistria128:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistria128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1588,12 +2193,19 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistric128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistric128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistric128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1601,12 +2213,19 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrio128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: seto %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistrio128:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: seto %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistrio128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: seto %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1614,12 +2233,19 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistris128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: sets %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistris128:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: sets %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistris128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1627,12 +2253,19 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistriz128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistriz128:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistriz128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1640,10 +2273,15 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrm128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistrm $7, %xmm1, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistrm128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpcmpistrm $7, %xmm1, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistrm128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpcmpistrm $7, %xmm1, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1651,11 +2289,17 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwin
define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrm128_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpcmpistrm $7, (%eax), %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistrm128_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vpcmpistrm $7, (%eax), %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistrm128_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpcmpistrm $7, (%eax), %xmm0
+; AVX512VL-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a1
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -1663,10 +2307,15 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_add_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_add_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_add_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1674,10 +2323,15 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_cmp_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cmp_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cmp_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1685,10 +2339,15 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_cmp_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cmp_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cmp_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1696,12 +2355,23 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comieq_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comieq_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comieq_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: andb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1709,12 +2379,19 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comige_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comige_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comige_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1722,12 +2399,19 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comigt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comigt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comigt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1735,12 +2419,19 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comile_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comile_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comile_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomiss %xmm0, %xmm1
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1748,12 +2439,19 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comilt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comilt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comilt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomiss %xmm0, %xmm1
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1761,12 +2459,23 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comineq_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comineq_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comineq_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setp %al
+; AVX512VL-NEXT: setne %cl
+; AVX512VL-NEXT: orb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1774,11 +2483,17 @@ declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_cvtsi2ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cvtsi2ss:
+; AVX: ## BB#0:
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cvtsi2ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1786,10 +2501,15 @@ declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_cvtss2si:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cvtss2si:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cvtss2si:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtss2si %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1797,10 +2517,15 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_cvttss2si:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttss2si %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cvttss2si:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cvttss2si:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttss2si %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1808,10 +2533,15 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_div_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_div_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_div_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1819,11 +2549,17 @@ declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind read
define void @test_x86_sse_ldmxcsr(i8* %a0) {
-; CHECK-LABEL: test_x86_sse_ldmxcsr:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vldmxcsr (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ldmxcsr:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vldmxcsr (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ldmxcsr:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vldmxcsr (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.sse.ldmxcsr(i8* %a0)
ret void
}
@@ -1832,10 +2568,15 @@ declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_max_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_max_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_max_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1843,10 +2584,15 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_max_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_max_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_max_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1854,10 +2600,15 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_min_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_min_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_min_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1865,10 +2616,15 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_min_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_min_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_min_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1876,10 +2632,15 @@ declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_movmsk_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskps %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_movmsk_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_movmsk_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovmskps %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1888,10 +2649,15 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_mul_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_mul_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_mul_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1899,10 +2665,15 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rcp_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrcpps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_rcp_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vrcpps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_rcp_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrcp14ps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1910,10 +2681,15 @@ declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rcp_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrcpss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_rcp_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_rcp_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrcpss %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1921,10 +2697,15 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rsqrt_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrsqrtps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_rsqrt_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vrsqrtps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_rsqrt_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrsqrt14ps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1932,10 +2713,15 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rsqrt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_rsqrt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_rsqrt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1943,10 +2729,15 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_sqrt_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_sqrt_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_sqrt_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1954,10 +2745,15 @@ declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_sqrt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_sqrt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_sqrt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1965,34 +2761,33 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @test_x86_sse_stmxcsr(i8* %a0) {
-; CHECK-LABEL: test_x86_sse_stmxcsr:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vstmxcsr (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_stmxcsr:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vstmxcsr (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_stmxcsr:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vstmxcsr (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.sse.stmxcsr(i8* %a0)
ret void
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
-define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_storeu_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovups %xmm0, (%eax)
-; CHECK-NEXT: retl
- call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
- ret void
-}
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
-
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_sub_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_sub_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_sub_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -2000,12 +2795,23 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomieq_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomieq_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vucomiss %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomieq_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: andb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2013,12 +2819,19 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomige_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomige_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomiss %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomige_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2026,12 +2839,19 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomigt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomigt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomiss %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomigt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2039,12 +2859,19 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomile_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomile_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomile_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm0, %xmm1
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2052,12 +2879,19 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomilt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomilt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomilt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm0, %xmm1
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2065,12 +2899,23 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomineq_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomineq_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vucomiss %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomineq_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setp %al
+; AVX512VL-NEXT: setne %cl
+; AVX512VL-NEXT: orb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2078,10 +2923,15 @@ declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnon
define <16 x i8> @test_x86_ssse3_pabs_b_128(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_ssse3_pabs_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpabsb %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pabs_b_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpabsb %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pabs_b_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsb %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -2089,10 +2939,15 @@ declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
define <4 x i32> @test_x86_ssse3_pabs_d_128(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_ssse3_pabs_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpabsd %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pabs_d_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpabsd %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pabs_d_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsd %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -2100,10 +2955,15 @@ declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
define <8 x i16> @test_x86_ssse3_pabs_w_128(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_ssse3_pabs_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpabsw %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pabs_w_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpabsw %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pabs_w_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsw %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2111,10 +2971,15 @@ declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
define <4 x i32> @test_x86_ssse3_phadd_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phadd_d_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phadd_d_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -2122,10 +2987,15 @@ declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_phadd_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_sw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phadd_sw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phadd_sw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2133,10 +3003,15 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <8 x i16> @test_x86_ssse3_phadd_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phadd_w_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phadd_w_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2144,10 +3019,15 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <4 x i32> @test_x86_ssse3_phsub_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphsubd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phsub_d_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phsub_d_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -2155,10 +3035,15 @@ declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_phsub_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_sw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phsub_sw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phsub_sw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2166,10 +3051,15 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <8 x i16> @test_x86_ssse3_phsub_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphsubw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phsub_w_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phsub_w_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2177,10 +3067,15 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2188,10 +3083,15 @@ declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind
define <8 x i16> @test_x86_ssse3_pmul_hr_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_pmul_hr_sw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pmul_hr_sw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pmul_hr_sw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2199,10 +3099,15 @@ declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind
define <16 x i8> @test_x86_ssse3_pshuf_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_ssse3_pshuf_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pshuf_b_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pshuf_b_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -2210,10 +3115,15 @@ declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <16 x i8> @test_x86_ssse3_psign_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsignb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_psign_b_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsignb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_psign_b_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -2221,10 +3131,15 @@ declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <4 x i32> @test_x86_ssse3_psign_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsignd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_psign_d_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsignd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_psign_d_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -2232,10 +3147,15 @@ declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_psign_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsignw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_psign_w_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsignw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_psign_w_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2243,10 +3163,15 @@ declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <4 x double> @test_x86_avx_addsub_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_addsub_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_addsub_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_addsub_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2254,10 +3179,15 @@ declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nou
define <8 x float> @test_x86_avx_addsub_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_addsub_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_addsub_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_addsub_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2265,10 +3195,15 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
-; CHECK-LABEL: test_x86_avx_blendv_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_blendv_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_blendv_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2276,10 +3211,15 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
define <8 x float> @test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
-; CHECK-LABEL: test_x86_avx_blendv_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_blendv_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_blendv_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2287,10 +3227,15 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_avx_cmp_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_cmp_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cmp_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cmp_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2298,50 +3243,91 @@ declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) no
define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_cmp_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cmp_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cmp_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
define <8 x float> @test_x86_avx_cmp_ps_256_pseudo_op(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_cmp_ps_256_pseudo_op:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cmp_ps_256_pseudo_op:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpleps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpordps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cmp_ps_256_pseudo_op:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpleps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpordps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%a2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0) ; <<8 x float>> [#uses=1]
%a3 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a2, i8 1) ; <<8 x float>> [#uses=1]
%a4 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a3, i8 2) ; <<8 x float>> [#uses=1]
@@ -2380,11 +3366,16 @@ declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounw
define <4 x float> @test_x86_avx_cvt_pd2_ps_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvt_pd2_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtpd2psy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cvt_pd2_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtpd2psy %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvt_pd2_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtpd2psy %ymm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -2392,89 +3383,64 @@ declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
define <4 x i32> @test_x86_avx_cvt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvt_pd2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtpd2dqy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cvt_pd2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtpd2dqy %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvt_pd2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtpd2dqy %ymm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
-define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvt_ps2_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
-; CHECK-NEXT: retl
- %res = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
-
-
define <8 x i32> @test_x86_avx_cvt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvt_ps2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cvt_ps2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtps2dq %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvt_ps2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtps2dq %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
-define <4 x double> @test_x86_avx_cvtdq2_pd_256(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtdq2_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
-; CHECK-NEXT: retl
- %res = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
-
-
define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtdq2_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cvtdq2_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtdq2_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
-define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT: retl
- %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
-
-
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_dp_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_dp_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_dp_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2482,10 +3448,15 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwi
define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_hadd_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_hadd_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_hadd_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2493,10 +3464,15 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_hadd_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_hadd_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_hadd_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2504,10 +3480,15 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_hsub_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_hsub_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_hsub_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2515,10 +3496,15 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_hsub_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_hsub_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_hsub_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2526,11 +3512,17 @@ declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind
define <32 x i8> @test_x86_avx_ldu_dq_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_ldu_dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vlddqu (%eax), %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_ldu_dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vlddqu (%eax), %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_ldu_dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vlddqu (%eax), %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -2538,11 +3530,17 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
define <2 x double> @test_x86_avx_maskload_pd(i8* %a0, <2 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskload_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskload_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -2550,11 +3548,17 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly
define <4 x double> @test_x86_avx_maskload_pd_256(i8* %a0, <4 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskload_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskload_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2562,11 +3566,17 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind read
define <4 x float> @test_x86_avx_maskload_ps(i8* %a0, <4 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskload_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskload_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -2574,11 +3584,17 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly
define <8 x float> @test_x86_avx_maskload_ps_256(i8* %a0, <8 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskload_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskload_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2586,11 +3602,17 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind reado
define void @test_x86_avx_maskstore_pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskstore_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskstore_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2)
ret void
}
@@ -2598,12 +3620,18 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskstore_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskstore_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2)
ret void
}
@@ -2611,11 +3639,17 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi
define void @test_x86_avx_maskstore_ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskstore_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskstore_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2)
ret void
}
@@ -2623,12 +3657,18 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskstore_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskstore_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2)
ret void
}
@@ -2636,10 +3676,15 @@ declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwin
define <4 x double> @test_x86_avx_max_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_max_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_max_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_max_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2647,10 +3692,15 @@ declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwi
define <8 x float> @test_x86_avx_max_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_max_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_max_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_max_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2658,10 +3708,15 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_x86_avx_min_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_min_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_min_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_min_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2669,10 +3724,15 @@ declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwi
define <8 x float> @test_x86_avx_min_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_min_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_min_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_min_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2680,11 +3740,16 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_movmsk_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskpd %ymm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_movmsk_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovmskpd %ymm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_movmsk_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovmskpd %ymm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2692,11 +3757,16 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_movmsk_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskps %ymm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_movmsk_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovmskps %ymm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_movmsk_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovmskps %ymm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2709,13 +3779,20 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_ptestc_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %ymm1, %ymm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_ptestc_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vptest %ymm1, %ymm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_ptestc_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vptest %ymm1, %ymm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2723,13 +3800,20 @@ declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_ptestnzc_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %ymm1, %ymm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_ptestnzc_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %ymm1, %ymm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_ptestnzc_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vptest %ymm1, %ymm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2737,13 +3821,20 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_ptestz_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %ymm1, %ymm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_ptestz_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %ymm1, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_ptestz_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vptest %ymm1, %ymm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2751,10 +3842,15 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_rcp_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrcpps %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_rcp_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vrcpps %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_rcp_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2762,10 +3858,15 @@ declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_round_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_round_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundpd $7, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_round_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $7, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_round_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundpd $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2773,10 +3874,15 @@ declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind read
define <8 x float> @test_x86_avx_round_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_round_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundps $7, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_round_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $7, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_round_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundps $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2784,10 +3890,15 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_rsqrt_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrsqrtps %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_rsqrt_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vrsqrtps %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2795,10 +3906,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtpd %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2806,73 +3922,33 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtps %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
- ; FIXME: unfortunately the execution domain fix pass changes this to vmovups and it's hard to force with no 256-bit integer instructions
- ; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_avx_storeu_dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
- ret void
-}
-declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
-
-
-define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
- ; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_avx_storeu_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovupd %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
- call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
- ret void
-}
-declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
-
-
-define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_storeu_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovups %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
- ret void
-}
-declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
-
-
define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_vbroadcastf128_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vbroadcastf128 (%eax), %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vbroadcastf128_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vbroadcastf128_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2880,11 +3956,17 @@ declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly
define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_vbroadcastf128_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vbroadcastf128 (%eax), %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vbroadcastf128_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vbroadcastf128_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2892,10 +3974,15 @@ declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vperm2f128_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vperm2f128_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2903,10 +3990,15 @@ declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>,
define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vperm2f128_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vperm2f128_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2914,65 +4006,31 @@ declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8
define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vperm2f128_si_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vperm2f128_si_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermil_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: retl
- %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone
-
-
-define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermil_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
-; CHECK-NEXT: retl
- %res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermil_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,0]
-; CHECK-NEXT: retl
- %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermil_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,0,7,5,4,4]
-; CHECK-NEXT: retl
- %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
-
-
define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -2980,38 +4038,59 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi
define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_pd_256_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd {{.*}}, %ymm0, %ymm0 ## ymm0 = ymm0[1,0,2,3]
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_pd_256_2:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, <4 x i32>* %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_ps_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpermilps (%eax), %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_ps_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vpermilps (%eax), %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpermilps (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%a2 = load <4 x i32>, <4 x i32>* %a1
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
@@ -3020,10 +4099,15 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind
define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -3031,12 +4115,19 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) noun
define i32 @test_x86_avx_vtestc_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestc_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestc_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vtestpd %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestc_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vtestpd %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3044,13 +4135,20 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon
define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestc_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %ymm1, %ymm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestc_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vtestpd %ymm1, %ymm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestc_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vtestpd %ymm1, %ymm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3058,12 +4156,19 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea
define i32 @test_x86_avx_vtestc_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestc_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestc_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vtestps %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestc_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vtestps %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3071,13 +4176,20 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestc_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %ymm1, %ymm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestc_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vtestps %ymm1, %ymm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestc_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vtestps %ymm1, %ymm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3085,12 +4197,19 @@ declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readn
define i32 @test_x86_avx_vtestnzc_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestnzc_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestnzc_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestpd %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestpd %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3098,13 +4217,20 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestnzc_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %ymm1, %ymm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestnzc_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestpd %ymm1, %ymm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestpd %ymm1, %ymm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3112,12 +4238,19 @@ declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind r
define i32 @test_x86_avx_vtestnzc_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestnzc_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestnzc_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestps %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestps %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3125,13 +4258,20 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnon
define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestnzc_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %ymm1, %ymm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestnzc_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestps %ymm1, %ymm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestps %ymm1, %ymm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3139,12 +4279,19 @@ declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind rea
define i32 @test_x86_avx_vtestz_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestz_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestz_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestpd %xmm1, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestz_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestpd %xmm1, %xmm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3152,13 +4299,20 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnon
define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestz_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %ymm1, %ymm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestz_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestpd %ymm1, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestz_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestpd %ymm1, %ymm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3166,12 +4320,19 @@ declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind rea
define i32 @test_x86_avx_vtestz_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestz_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestz_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestps %xmm1, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestz_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestps %xmm1, %xmm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3179,13 +4340,20 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestz_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %ymm1, %ymm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestz_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestps %ymm1, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestz_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestps %ymm1, %ymm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3193,11 +4361,15 @@ declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readn
define void @test_x86_avx_vzeroall() {
-; CHECK-LABEL: test_x86_avx_vzeroall:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vzeroall
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vzeroall:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroall
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vzeroall:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vzeroall
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.vzeroall()
ret void
}
@@ -3205,11 +4377,15 @@ declare void @llvm.x86.avx.vzeroall() nounwind
define void @test_x86_avx_vzeroupper() {
-; CHECK-LABEL: test_x86_avx_vzeroupper:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vzeroupper:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vzeroupper:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.vzeroupper()
ret void
}
@@ -3218,113 +4394,175 @@ declare void @llvm.x86.avx.vzeroupper() nounwind
; Make sure instructions with no AVX equivalents, but are associated with SSEX feature flags still work
define void @monitor(i8* %P, i32 %E, i32 %H) nounwind {
-; CHECK-LABEL: monitor:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: leal (%eax), %eax
-; CHECK-NEXT: monitor
-; CHECK-NEXT: retl
+; AVX-LABEL: monitor:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: leal (%eax), %eax
+; AVX-NEXT: monitor
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: monitor:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: leal (%eax), %eax
+; AVX512VL-NEXT: monitor
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse3.monitor(i8* %P, i32 %E, i32 %H)
ret void
}
declare void @llvm.x86.sse3.monitor(i8*, i32, i32) nounwind
define void @mwait(i32 %E, i32 %H) nounwind {
-; CHECK-LABEL: mwait:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: mwait
-; CHECK-NEXT: retl
+; AVX-LABEL: mwait:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: mwait
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: mwait:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: mwait
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse3.mwait(i32 %E, i32 %H)
ret void
}
declare void @llvm.x86.sse3.mwait(i32, i32) nounwind
define void @sfence() nounwind {
-; CHECK-LABEL: sfence:
-; CHECK: ## BB#0:
-; CHECK-NEXT: sfence
-; CHECK-NEXT: retl
+; AVX-LABEL: sfence:
+; AVX: ## BB#0:
+; AVX-NEXT: sfence
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: sfence:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: sfence
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse.sfence()
ret void
}
declare void @llvm.x86.sse.sfence() nounwind
define void @lfence() nounwind {
-; CHECK-LABEL: lfence:
-; CHECK: ## BB#0:
-; CHECK-NEXT: lfence
-; CHECK-NEXT: retl
+; AVX-LABEL: lfence:
+; AVX: ## BB#0:
+; AVX-NEXT: lfence
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: lfence:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: lfence
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse2.lfence()
ret void
}
declare void @llvm.x86.sse2.lfence() nounwind
define void @mfence() nounwind {
-; CHECK-LABEL: mfence:
-; CHECK: ## BB#0:
-; CHECK-NEXT: mfence
-; CHECK-NEXT: retl
+; AVX-LABEL: mfence:
+; AVX: ## BB#0:
+; AVX-NEXT: mfence
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: mfence:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: mfence
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse2.mfence()
ret void
}
declare void @llvm.x86.sse2.mfence() nounwind
define void @clflush(i8* %p) nounwind {
-; CHECK-LABEL: clflush:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: clflush (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: clflush:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: clflush (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: clflush:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: clflush (%eax)
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse2.clflush(i8* %p)
ret void
}
declare void @llvm.x86.sse2.clflush(i8*) nounwind
define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
-; CHECK-LABEL: crc32_32_8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: crc32b {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: crc32_32_8:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: crc32b {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: crc32_32_8:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: crc32b {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: retl
%tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
ret i32 %tmp
}
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
-; CHECK-LABEL: crc32_32_16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: crc32w {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: crc32_32_16:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: crc32w {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: crc32_32_16:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: crc32w {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: retl
%tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
ret i32 %tmp
}
declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: crc32_32_32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: crc32l {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: crc32_32_32:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: crc32l {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: crc32_32_32:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: crc32l {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: retl
%tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
ret i32 %tmp
}
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
-; CHECK-LABEL: movnt_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpaddq LCPI277_0, %xmm0, %xmm0
-; CHECK-NEXT: vmovntdq %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: movnt_dq:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: movnt_dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
+; AVX512VL-NEXT: retl
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
%a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a3) nounwind
@@ -3333,12 +4571,18 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
define void @movnt_ps(i8* %p, <8 x float> %a) nounwind {
-; CHECK-LABEL: movnt_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovntps %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: movnt_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmovntps %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: movnt_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmovntps %ymm0, (%eax)
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
ret void
}
@@ -3346,14 +4590,22 @@ declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>) nounwind
define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind {
; add operation forces the execution domain.
-; CHECK-LABEL: movnt_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovntpd %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: movnt_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovntpd %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: movnt_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovntpd %ymm0, (%eax)
+; AVX512VL-NEXT: retl
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
ret void
@@ -3363,10 +4615,15 @@ declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
; Check for pclmulqdq
define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_pclmulqdq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_pclmulqdq:
+; AVX: ## BB#0:
+; AVX-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_pclmulqdq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86_64.ll b/test/CodeGen/X86/avx-intrinsics-x86_64.ll
index 5a466fc3250f..252574d84d8f 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx512vl | FileCheck %s
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
; CHECK: vcvtsd2si
diff --git a/test/CodeGen/X86/avx-isa-check.ll b/test/CodeGen/X86/avx-isa-check.ll
index 77bfbd4bb423..dffc8078e44f 100644
--- a/test/CodeGen/X86/avx-isa-check.ll
+++ b/test/CodeGen/X86/avx-isa-check.ll
@@ -1,5 +1,6 @@
; check AVX2 instructions that are disabled in case avx512VL/avx512BW are present
-
+
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=corei7-avx -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=core-avx2 -mattr=+avx2 -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -mattr=+avx512vl -o /dev/null
@@ -568,3 +569,114 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %a) {
%shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <8 x i16> %shift
}
+
+define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
+entry:
+ %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %C = zext <8 x i8> %B to <8 x i16>
+ ret <8 x i16> %C
+}
+
+define <32 x i8> @_broadcast32xi8(i8 %a) {
+ %b = insertelement <32 x i8> undef, i8 %a, i32 0
+ %c = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
+ ret <32 x i8> %c
+}
+
+define <16 x i8> @_broadcast16xi8(i8 %a) {
+ %b = insertelement <16 x i8> undef, i8 %a, i32 0
+ %c = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %c
+}
+
+define <16 x i16> @_broadcast16xi16(i16 %a) {
+ %b = insertelement <16 x i16> undef, i16 %a, i32 0
+ %c = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
+ ret <16 x i16> %c
+}
+
+define <8 x i16> @_broadcast8xi16(i16 %a) {
+ %b = insertelement <8 x i16> undef, i16 %a, i32 0
+ %c = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %c
+}
+
+define <8 x i32> @_broadcast8xi32(i32 %a) {
+ %b = insertelement <8 x i32> undef, i32 %a, i32 0
+ %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %c
+}
+
+define <4 x i32> @_broadcast4xi32(i32 %a) {
+ %b = insertelement <4 x i32> undef, i32 %a, i32 0
+ %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %c
+}
+
+define <4 x i64> @_broadcast4xi64(i64 %a) {
+ %b = insertelement <4 x i64> undef, i64 %a, i64 0
+ %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %c
+}
+
+define <2 x i64> @_broadcast2xi64(i64 %a) {
+ %b = insertelement <2 x i64> undef, i64 %a, i64 0
+ %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %c
+}
+
+define <8 x float> @_broadcast8xfloat(float %a) {
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %c
+}
+
+define <4 x float> @_broadcast4xfloat(float %a) {
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %c
+}
+
+define <4 x double> @_broadcast4xdouble(double %a) {
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %c
+}
+
+define <2 x double> @_broadcast2xdouble(double %a) {
+ %b = insertelement <2 x double> undef, double %a, i32 0
+ %c = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %c
+}
+
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fsub <4 x float> %x, %a2
+ ret <4 x float> %res
+}
+
+define <32 x i8> @test_cmpgtb(<32 x i8> %A) {
+; generate the following code
+; vpxor %ymm1, %ymm1, %ymm1
+; vpcmpgtb %ymm0, %ymm1, %ymm0
+ %B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %B
+}
+
+define <4 x float> @_inreg4xfloat(float %a) {
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %c
+}
+
+define <8 x float> @_inreg8xfloat(float %a) {
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %c
+}
+
+define <4 x double> @_inreg4xdouble(double %a) {
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %c
+}
diff --git a/test/CodeGen/X86/avx-select.ll b/test/CodeGen/X86/avx-select.ll
index 58a75ef0a25d..cdd3180d6245 100644
--- a/test/CodeGen/X86/avx-select.ll
+++ b/test/CodeGen/X86/avx-select.ll
@@ -1,19 +1,34 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-; CHECK: _select00
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind {
+; CHECK-LABEL: select00:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: cmpl $255, %edi
+; CHECK-NEXT: je LBB0_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: vmovaps %ymm0, %ymm1
+; CHECK-NEXT: LBB0_2:
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <8 x i32> zeroinitializer, <8 x i32> %b
%res = xor <8 x i32> %b, %selres
ret <8 x i32> %res
}
-; CHECK: _select01
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
define <4 x i64> @select01(i32 %a, <4 x i64> %b) nounwind {
+; CHECK-LABEL: select01:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: cmpl $255, %edi
+; CHECK-NEXT: je LBB1_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: vmovaps %ymm0, %ymm1
+; CHECK-NEXT: LBB1_2:
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <4 x i64> zeroinitializer, <4 x i64> %b
%res = xor <4 x i64> %b, %selres
diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll
index 033a95276608..b65412d99eb4 100644
--- a/test/CodeGen/X86/avx-shift.ll
+++ b/test/CodeGen/X86/avx-shift.ll
@@ -10,8 +10,7 @@ define <8 x i32> @vshift00(<8 x i32> %a) {
; CHECK-NEXT: vpslld $2, %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+ %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %s
}
@@ -48,8 +47,7 @@ define <8 x i32> @vshift03(<8 x i32> %a) {
; CHECK-NEXT: vpsrld $2, %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+ %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %s
}
@@ -86,8 +84,7 @@ define <8 x i32> @vshift06(<8 x i32> %a) {
; CHECK-NEXT: vpsrad $2, %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+ %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %s
}
diff --git a/test/CodeGen/X86/avx-shuffle-x86_32.ll b/test/CodeGen/X86/avx-shuffle-x86_32.ll
index fae5b41abfa6..d0634ab59f56 100755
--- a/test/CodeGen/X86/avx-shuffle-x86_32.ll
+++ b/test/CodeGen/X86/avx-shuffle-x86_32.ll
@@ -6,7 +6,7 @@ define <4 x i64> @test1(<4 x i64> %a) nounwind {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: retl
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
ret <4 x i64>%b
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index ebaaf0e8d00d..1914b5134bee 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
@@ -14,7 +15,8 @@ entry:
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -26,7 +28,7 @@ define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcC:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vmovq %rdi, %xmm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -122,9 +124,8 @@ entry:
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcH:
; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5]
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -134,8 +135,7 @@ entry:
define <2 x double> @splat_load_2f64_11(<2 x double>* %ptr) {
; CHECK-LABEL: splat_load_2f64_11:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; CHECK-NEXT: retq
%x = load <2 x double>, <2 x double>* %ptr
%x1 = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 1>
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index 27be9fd2fcd1..70c8ecb9d4ad 100755
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -1,22 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp{
-; CHECK-LABEL: trunc_64_32
-; CHECK: pshufd
-; CHECK: pshufd
-; CHECK: pblendw
+; CHECK-LABEL: trunc_64_32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%B = trunc <4 x i64> %A to <4 x i32>
ret <4 x i32>%B
}
+
define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
-; CHECK-LABEL: trunc_32_16
-; CHECK: pshufb
+; CHECK-LABEL: trunc_32_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%B = trunc <8 x i32> %A to <8 x i16>
ret <8 x i16>%B
}
+
define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
-; CHECK-LABEL: trunc_16_8
-; CHECK: pshufb
+; CHECK-LABEL: trunc_16_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%B = trunc <16 x i16> %A to <16 x i8>
ret <16 x i8> %B
}
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 0c92f4884fb7..b312be9aa6b2 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -173,14 +173,12 @@ define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtabl
; X32-LABEL: load_splat_8i32_4i32_33333333:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64: ## BB#0: ## %entry
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
@@ -277,16 +275,12 @@ define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable re
; X32-LABEL: load_splat_4i64_2i64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps (%eax), %xmm0
-; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64: ## BB#0: ## %entry
-; X64-NEXT: vmovaps (%rdi), %xmm0
-; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
@@ -315,14 +309,12 @@ define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwta
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps (%eax), %xmm0
-; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64: ## BB#0: ## %entry
-; X64-NEXT: vmovaps (%rdi), %xmm0
-; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
diff --git a/test/CodeGen/X86/avx-vbroadcastf128.ll b/test/CodeGen/X86/avx-vbroadcastf128.ll
new file mode 100644
index 000000000000..176246b093ec
--- /dev/null
+++ b/test/CodeGen/X86/avx-vbroadcastf128.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
+
+define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
+; X32-LABEL: test_broadcast_2f64_4f64:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2f64_4f64:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <2 x double>, <2 x double> *%p
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %2
+}
+
+define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
+; X32-LABEL: test_broadcast_2i64_4i64:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2i64_4i64:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64> *%p
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %2
+}
+
+define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
+; X32-LABEL: test_broadcast_4f32_8f32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4f32_8f32:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x float>, <4 x float> *%p
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %2
+}
+
+define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
+; X32-LABEL: test_broadcast_4i32_8i32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4i32_8i32:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32> *%p
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %2
+}
+
+define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
+; X32-LABEL: test_broadcast_8i16_16i16:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_8i16_16i16:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16> *%p
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i16> %2
+}
+
+define <32 x i8> @test_broadcast_16i8_32i7(<16 x i8> *%p) nounwind {
+; X32-LABEL: test_broadcast_16i8_32i7:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_16i8_32i7:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8> *%p
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <32 x i8> %2
+}
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index 297fb250c5ff..2feddddaf780 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -1,28 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-; CHECK-LABEL: A:
-; CHECK-NOT: vunpck
-; CHECK: vextractf128 $1
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: A:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8>
ret <8 x float> %shuffle
}
-; CHECK-LABEL: B:
-; CHECK-NOT: vunpck
-; CHECK: vextractf128 $1
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: B:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
ret <4 x double> %shuffle
}
-; CHECK-LABEL: t0:
-; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t0:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
%1 = bitcast float* %addr to <4 x float>*
@@ -30,13 +34,12 @@ entry:
ret void
}
-declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-
-; CHECK-LABEL: t2:
-; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t2:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
%1 = bitcast double* %addr to <2 x double>*
@@ -44,13 +47,12 @@ entry:
ret void
}
-declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-
-; CHECK-LABEL: t4:
-; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t4:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
@@ -59,11 +61,12 @@ entry:
ret void
}
-declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
-
-; CHECK-LABEL: t5:
-; CHECK: vmovaps %xmm0, (%rdi)
define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t5:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
%1 = bitcast float* %addr to <4 x float>*
@@ -71,9 +74,12 @@ entry:
ret void
}
-; CHECK-LABEL: t6:
-; CHECK: vmovaps %xmm0, (%rdi)
define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t6:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
%1 = bitcast double* %addr to <2 x double>*
@@ -81,9 +87,12 @@ entry:
ret void
}
-; CHECK-LABEL: t7:
-; CHECK: vmovaps %xmm0, (%rdi)
define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t7:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
@@ -92,9 +101,12 @@ entry:
ret void
}
-; CHECK-LABEL: t8:
-; CHECK: vmovups %xmm0, (%rdi)
define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t8:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
@@ -105,6 +117,12 @@ entry:
; PR15462
define void @t9(i64* %p) {
+; CHECK-LABEL: t9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
store i64 0, i64* %p
%q = getelementptr i64, i64* %p, i64 1
store i64 0, i64* %q
@@ -113,9 +131,8 @@ define void @t9(i64* %p) {
%s = getelementptr i64, i64* %p, i64 3
store i64 0, i64* %s
ret void
-
-; CHECK-LABEL: t9:
-; CHECK: vxorps %xmm
-; CHECK-NOT: vextractf
-; CHECK: vmovups
}
+
+declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
+declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 0958008d9a3e..740fd77d82e2 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: A:
+define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_45670123:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; ALL-NEXT: retq
@@ -12,28 +12,63 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: B:
+define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_45670123_mem:
; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
; ALL-NEXT: retq
entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+ %a = load <8 x float>, <8 x float>* %pa
+ %b = load <8 x float>, <8 x float>* %pb
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %shuffle
}
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: C:
+define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v8f32_01230123:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_01230123:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v8f32_01230123_mem:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_01230123_mem:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX2-NEXT: retq
+entry:
+ %a = load <8 x float>, <8 x float>* %pa
+ %b = load <8 x float>, <8 x float>* %pb
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %shuffle
}
-define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: D:
+define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_45674567:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
@@ -42,28 +77,30 @@ entry:
ret <8 x float> %shuffle
}
-define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: E:
+define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
; ALL-NEXT: retq
entry:
- %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
- ret <32 x i8> %shuffle
+ %a = load <8 x float>, <8 x float>* %pa
+ %b = load <8 x float>, <8 x float>* %pb
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %shuffle
}
-define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: E2:
+define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v32i8_2323:
; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
entry:
- %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
- ret <4 x i64> %shuffle
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %shuffle
}
-define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: Ei:
+define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
@@ -71,7 +108,7 @@ define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: Ei:
+; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
@@ -83,14 +120,24 @@ entry:
ret <32 x i8> %shuffle
}
-define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: E2i:
+define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v4i64_6701:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: E2i:
+; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -103,8 +150,8 @@ entry:
ret <4 x i64> %shuffle
}
-define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: E3i:
+define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
@@ -112,7 +159,7 @@ define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: E3i:
+; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
@@ -125,14 +172,14 @@ entry:
ret <8 x i32> %shuffle
}
-define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: E4i:
+define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: E4i:
+; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -144,8 +191,8 @@ entry:
ret <16 x i16> %shuffle
}
-define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: E5i:
+define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovaps (%rsi), %ymm1
@@ -153,7 +200,7 @@ define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: E5i:
+; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
@@ -170,8 +217,8 @@ entry:
;;;; Cases with undef indices mixed in the mask
-define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F:
+define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
@@ -180,8 +227,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F2(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F2:
+define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67uu67:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
@@ -190,8 +237,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F3(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F3:
+define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
@@ -200,8 +247,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F4(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F4:
+define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
@@ -210,8 +257,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F5(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F5:
+define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu674567:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
@@ -220,8 +267,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F6(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F6:
+define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
@@ -230,8 +277,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F7(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F7:
+define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_4567uu67:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
@@ -240,8 +287,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F8(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F8:
+define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
@@ -252,8 +299,8 @@ entry:
;;;; Cases where we must not select vperm2f128
-define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: G:
+define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
@@ -266,9 +313,18 @@ entry:
;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendpd over vperm2f128 because the xor+blend sequence has better performance.
+;; TODO: When building for optsize we should use vperm2f128.
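As an aside (editorial, not part of this patch), the zero-mask pattern these tests exercise is a shuffle whose low or high 128-bit lane comes from an all-zero operand. A minimal sketch, using a hypothetical function name and zeroinitializer in place of the explicit constant vector used in the tests below:

define <4 x double> @zero_mask_sketch(<4 x double> %a) {
  ; Lanes 0-1 come from the zero operand, lanes 2-3 from %a, so the expected
  ; lowering is either vxorpd+vblendpd or a vperm2f128 with a zero selector.
  %s = shufflevector <4 x double> zeroinitializer, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}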
-define <4 x double> @vperm2z_0x08(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x08:
+define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_zz01:
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
@@ -276,8 +332,17 @@ define <4 x double> @vperm2z_0x08(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x18(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x18:
+define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_zz23:
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
@@ -286,8 +351,16 @@ define <4 x double> @vperm2z_0x18(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x28(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x28:
+define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_zz45:
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
@@ -295,8 +368,17 @@ define <4 x double> @vperm2z_0x28(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x38(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x38:
+define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_zz67:
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
@@ -305,8 +387,17 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x80(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x80:
+define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_01zz:
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
@@ -315,8 +406,16 @@ define <4 x double> @vperm2z_0x80(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x81(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x81:
+define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_23zz:
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
@@ -324,8 +423,17 @@ define <4 x double> @vperm2z_0x81(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x82(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x82:
+define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_45zz:
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
@@ -334,8 +442,16 @@ define <4 x double> @vperm2z_0x82(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x83(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x83:
+define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_67zz:
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
@@ -345,8 +461,8 @@ define <4 x double> @vperm2z_0x83(<4 x double> %a) {
;; With AVX2, select the integer version of the instruction. Use an add to force the domain selection.
-define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: vperm2z_int_0x83:
+define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1: ## BB#0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -356,7 +472,7 @@ define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: vperm2z_int_0x83:
+; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2: ## BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -366,3 +482,174 @@ define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
ret <4 x i64> %c
}
+;;; Memory folding cases
+
+define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld0_hi0_lo1_4f64:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld0_hi0_lo1_4f64:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %a = load <4 x double>, <4 x double> * %pa
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
+ ret <4 x double> %res
+}
+
+define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld1_hi0_hi1_4f64:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld1_hi0_hi1_4f64:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b = load <4 x double>, <4 x double> * %pb
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
+ ret <4 x double> %res
+}
+
+define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld0_hi0_lo1_8f32:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld0_hi0_lo1_8f32:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %a = load <8 x float>, <8 x float> * %pa
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <8 x float> %res
+}
+
+define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld1_hi0_hi1_8f32:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld1_hi0_hi1_8f32:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b = load <8 x float>, <8 x float> * %pb
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <8 x float> %res
+}
+
+define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld0_hi0_lo1_4i64:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld0_hi0_lo1_4i64:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %a = load <4 x i64>, <4 x i64> * %pa
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld1_hi0_hi1_4i64:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld1_hi0_hi1_4i64:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b = load <4 x i64>, <4 x i64> * %pb
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld0_hi0_lo1_8i32:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld0_hi0_lo1_8i32:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %a = load <8 x i32>, <8 x i32> * %pa
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld1_hi0_hi1_8i32:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld1_hi0_hi1_8i32:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b = load <8 x i32>, <8 x i32> * %pb
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i32> %res
+}
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index a16dc70e81c6..3c52aaf71adc 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,4 +1,9 @@
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck --check-prefix=FASTYMM %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s
+
+; FASTYMM-NOT: vzeroupper
+; BTVER2-NOT: vzeroupper
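A minimal sketch (editorial, not part of this patch) of the kind of function these RUN lines target: a 256-bit value is live before a call to a 128-bit-only callee, so the default lowering is normally expected to insert vzeroupper before the call, while the FASTYMM and BTVER2 runs above should not emit it. The function name is hypothetical; @do_sse is the callee declared just below in this file.

define <4 x float> @vzeroupper_sketch(<8 x float> %v) nounwind {
  ; Extract the high 128-bit lane of the YMM argument and pass it to an
  ; XMM-only callee; vzeroupper would normally be expected before this call.
  %hi = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %call = call <4 x float> @do_sse(<4 x float> %hi)
  ret <4 x float> %call
}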
declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index 176292768253..7c16ec800a5e 100755
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -4,8 +4,9 @@
define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
; CHECK-LABEL: trunc4:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%B = trunc <4 x i64> %A to <4 x i32>
@@ -17,6 +18,7 @@ define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
; CHECK: ## BB#0:
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%B = trunc <8 x i32> %A to <8 x i16>
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..430628c3f800
--- /dev/null
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -0,0 +1,3388 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
+
+define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_abs_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpabsb %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_abs_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpabsb %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg = bitcast <4 x i64> %a0 to <32 x i8>
+ %call = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %arg)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_abs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpabsw %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_abs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpabsw %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg = bitcast <4 x i64> %a0 to <16 x i16>
+ %call = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %arg)
+ %res = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_abs_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpabsd %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_abs_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpabsd %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg = bitcast <4 x i64> %a0 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %arg)
+ %res = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_add_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = add <32 x i8> %arg0, %arg1
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_add_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = add <16 x i16> %arg0, %arg1
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_add_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = add <8 x i32> %arg0, %arg1
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_add_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = add <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_alignr_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_alignr_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test2_mm256_alignr_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
+; X32-NEXT: retl
+;
+; X64-LABEL: test2_mm256_alignr_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_and_si256:
+; X32: # BB#0:
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_and_si256:
+; X64: # BB#0:
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = and <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_andnot_si256:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_andnot_si256:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %res = and <4 x i64> %not, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_avg_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_avg_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_avg_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_avg_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_blend_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blend_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %res = bitcast <16 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_blend_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blend_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %res = bitcast <4 x i32> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_blend_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blend_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+ %res = bitcast <8 x i32> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_blendv_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blendv_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
+ %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res = bitcast <16 x i8> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = bitcast <4 x i32> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = bitcast <8 x i32> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastsi128_si256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastsi128_si256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
+; X32-LABEL: test_mm256_broadcastsi128_si256_mem:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
+; X64: # BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %a0 = load <2 x i64>, <2 x i64>* %p0
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %res
+}
+
+define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
+; X32-LABEL: test_mm256_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %res = bitcast <8 x i16> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
+ %res = bitcast <16 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_bslli_epi128:
+; X32: # BB#0:
+; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_bslli_epi128:
+; X64: # BB#0:
+; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_bsrli_epi128:
+; X32: # BB#0:
+; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_bsrli_epi128:
+; X64: # BB#0:
+; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpeq_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpeq_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp eq <32 x i8> %arg0, %arg1
+ %res = sext <32 x i1> %cmp to <32 x i8>
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpeq_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpeq_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp eq <16 x i16> %arg0, %arg1
+ %res = sext <16 x i1> %cmp to <16 x i16>
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpeq_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpeq_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp eq <8 x i32> %arg0, %arg1
+ %res = sext <8 x i1> %cmp to <8 x i32>
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpeq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpeq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %cmp = icmp eq <4 x i64> %a0, %a1
+ %res = sext <4 x i1> %cmp to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpgt_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpgt_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp sgt <32 x i8> %arg0, %arg1
+ %res = sext <32 x i1> %cmp to <32 x i8>
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpgt_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpgt_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp sgt <16 x i16> %arg0, %arg1
+ %res = sext <16 x i1> %cmp to <16 x i16>
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpgt_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpgt_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp sgt <8 x i32> %arg0, %arg1
+ %res = sext <8 x i1> %cmp to <8 x i32>
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpgt_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpgt_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %cmp = icmp sgt <4 x i64> %a0, %a1
+ %res = sext <4 x i1> %cmp to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbw %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext = sext <16 x i8> %arg0 to <16 x i16>
+ %res = bitcast <16 x i16> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbd %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %ext = sext <8 x i8> %shuf to <8 x i32>
+ %res = bitcast <8 x i32> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbq %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext = sext <4 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi16_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi16_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext = sext <8 x i16> %arg0 to <8 x i32>
+ %res = bitcast <8 x i32> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi16_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxwq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi16_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxwq %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext = sext <4 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi32_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxdq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi32_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxdq %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = sext <4 x i32> %arg0 to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext = zext <16 x i8> %arg0 to <16 x i16>
+ %res = bitcast <16 x i16> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %ext = zext <8 x i8> %shuf to <8 x i32>
+ %res = bitcast <8 x i32> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext = zext <4 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu16_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu16_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext = zext <8 x i16> %arg0 to <8 x i32>
+ %res = bitcast <8 x i32> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu16_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu16_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext = zext <4 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu32_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu32_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = zext <4 x i32> %arg0 to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extracti128_si256:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extracti128_si256:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hadd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hadd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hadds_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadds_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hsub_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsub_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hsub_epi32:
+; X32: # BB#0:
+; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsub_epi32:
+; X64: # BB#0:
+; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hsubs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsubs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i32gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32 *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm_mask_i32gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i32gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast i32 *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i32gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
+; X32-NEXT: vmovdqa %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i32gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
+; X64-NEXT: vmovdqa %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32 *%a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
+ %bc = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
+
+define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
+; X32-LABEL: test_mm256_mask_i32gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i32gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast i32 *%a1 to i8*
+ %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
+ %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
+ %bc = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i32gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64 *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
+
+define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm_mask_i32gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i32gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_i32gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
+; X32-NEXT: vmovdqa %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i32gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
+; X64-NEXT: vmovdqa %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64 *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
+; X32-LABEL: test_mm256_mask_i32gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i32gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
+ ret <4 x i64> %res
+}
+
+define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i32gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
+ %sext = sext <2 x i1> %cmp to <2 x i64>
+ %mask = bitcast <2 x i64> %sext to <2 x double>
+ %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
+
+define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
+; X32-LABEL: test_mm_mask_i32gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i32gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast double *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_i32gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X32-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
+; X32-NEXT: vmovapd %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i32gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
+; X64-NEXT: vmovapd %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
+ %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
+
+define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
+; X32-LABEL: test_mm256_mask_i32gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i32gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast double *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
+ ret <4 x double> %res
+}
+
+define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i32gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %mask = bitcast <4 x i32> %sext to <4 x float>
+ %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
+ ret <4 x float> %call
+}
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
+; X32-LABEL: test_mm_mask_i32gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i32gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast float *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
+ ret <4 x float> %call
+}
+
+define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i32gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
+; X32-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
+; X32-NEXT: vmovaps %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i32gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
+; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
+; X64-NEXT: vmovaps %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float *%a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
+ %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
+ ret <8 x float> %call
+}
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
+
+define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
+; X32-LABEL: test_mm256_mask_i32gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i32gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast float *%a1 to i8*
+ %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
+ %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
+ ret <8 x float> %call
+}
+
+define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i64gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i64gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32 *%a0 to i8*
+ %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm_mask_i64gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i64gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast i32 *%a1 to i8*
+ %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i64gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i64gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast i32 *%a0 to i8*
+ %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm256_mask_i64gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i64gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast i32 *%a1 to i8*
+ %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i64gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i64gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64 *%a0 to i8*
+ %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
+ ret <2 x i64> %call
+}
+declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
+
+define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm_mask_i64gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i64gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
+ ret <2 x i64> %call
+}
+
+define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i64gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
+; X32-NEXT: vmovdqa %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i64gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
+; X64-NEXT: vmovdqa %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64 *%a0 to i8*
+ %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
+ ret <4 x i64> %call
+}
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
+; X32-LABEL: test_mm256_mask_i64gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i64gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
+ ret <4 x i64> %call
+}
+
+define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i64gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i64gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double *%a0 to i8*
+ %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
+ %sext = sext <2 x i1> %cmp to <2 x i64>
+ %mask = bitcast <2 x i64> %sext to <2 x double>
+ %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
+ ret <2 x double> %call
+}
+declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
+
+define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
+; X32-LABEL: test_mm_mask_i64gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i64gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast double *%a1 to i8*
+ %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
+ ret <2 x double> %call
+}
+
+define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i64gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X32-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
+; X32-NEXT: vmovapd %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i64gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
+; X64-NEXT: vmovapd %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double *%a0 to i8*
+ %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
+ %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
+ ret <4 x double> %call
+}
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
+; X32-LABEL: test_mm256_mask_i64gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i64gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
+ ret <4 x double> %call
+}
+
+define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i64gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i64gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float *%a0 to i8*
+ %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %mask = bitcast <4 x i32> %sext to <4 x float>
+ %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
+ ret <4 x float> %call
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
+; X32-LABEL: test_mm_mask_i64gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i64gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast float *%a1 to i8*
+ %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
+ ret <4 x float> %call
+}
+
+define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i64gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
+; X32-NEXT: vmovaps %xmm1, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i64gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float *%a0 to i8*
+ %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %mask = bitcast <4 x i32> %sext to <4 x float>
+ %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
+ ret <4 x float> %call
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
+; X32-LABEL: test_mm256_mask_i64gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i64gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg1 = bitcast float *%a1 to i8*
+ %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
+ ret <4 x float> %call
+}
+
+define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test0_mm256_inserti128_si256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test0_mm256_inserti128_si256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT: retq
+ %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test1_mm256_inserti128_si256:
+; X32: # BB#0:
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test1_mm256_inserti128_si256:
+; X64: # BB#0:
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_madd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_madd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_maddubs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maddubs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_maskload_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskload_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32* %a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
+
+define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_maskload_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskload_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32* %a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
+
+define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_maskload_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskload_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to i8*
+ %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
+
+define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_maskload_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskload_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to i8*
+ %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
+
+define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maskstore_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskstore_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
+ ret void
+}
+declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone
+
+define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
+; X32-LABEL: test_mm256_maskstore_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskstore_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
+ call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
+ ret void
+}
+declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone
+
+define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maskstore_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskstore_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to i8*
+ call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
+ ret void
+}
+declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone
+
+define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
+; X32-LABEL: test_mm256_maskstore_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskstore_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to i8*
+ call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
+ ret void
+}
+declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp sgt <32 x i8> %arg0, %arg1
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+ %bc = bitcast <32 x i8> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp sgt <16 x i16> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+ %bc = bitcast <16 x i16> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp sgt <8 x i32> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+ %bc = bitcast <8 x i32> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp ugt <32 x i8> %arg0, %arg1
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+ %bc = bitcast <32 x i8> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp ugt <16 x i16> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+ %bc = bitcast <16 x i16> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp ugt <8 x i32> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+ %bc = bitcast <8 x i32> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp slt <32 x i8> %arg0, %arg1
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+ %bc = bitcast <32 x i8> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp slt <16 x i16> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+ %bc = bitcast <16 x i16> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp slt <8 x i32> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+ %bc = bitcast <8 x i32> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp ult <32 x i8> %arg0, %arg1
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+ %bc = bitcast <32 x i8> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp ult <16 x i16> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+ %bc = bitcast <16 x i16> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp ult <8 x i32> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+ %bc = bitcast <8 x i32> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_movemask_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpmovmskb %ymm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movemask_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpmovmskb %ymm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mpsadbw_epu8:
+; X32: # BB#0:
+; X32-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mpsadbw_epu8:
+; X64: # BB#0:
+; X64-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
+ %bc = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
+
+define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mul_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mul_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %arg0, <8 x i32> %arg1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mul_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mul_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %arg0, <8 x i32> %arg1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mulhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mulhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mulhi_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mulhi_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mulhrs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mulhrs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mullo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mullo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = mul <16 x i16> %arg0, %arg1
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mullo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mullo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = mul <8 x i32> %arg0, %arg1
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_or_si256:
+; X32: # BB#0:
+; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_or_si256:
+; X64: # BB#0:
+; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = or <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_packs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_packs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_packs_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_packs_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
+ %res = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_packus_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_packus_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_packus_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_packus_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
+ %res = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_permute2x128_si256:
+; X32: # BB#0:
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2x128_si256:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 49)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_permute4x64_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute4x64_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_permute4x64_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute4x64_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ ret <4 x double> %res
+}
+
+define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_permutevar8x32_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutevar8x32_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
+ %res = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_permutevar8x32_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutevar8x32_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
+
+define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sad_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sad_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
+ %res = bitcast <8 x i32> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_shuffle_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_shufflehi_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shufflehi_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
+ %res = bitcast <16 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_shufflelo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shufflelo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
+ %res = bitcast <16 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sign_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sign_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sign_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sign_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %res = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sign_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sign_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
+ %res = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sll_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sll_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sll_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sll_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sll_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sll_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_slli_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsllw $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_slli_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsllw $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_slli_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpslld $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_slli_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpslld $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_slli_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsllq $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_slli_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsllq $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_slli_si256:
+; X32: # BB#0:
+; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_slli_si256:
+; X64: # BB#0:
+; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sllv_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sllv_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sllv_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sllv_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sllv_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sllv_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sllv_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sllv_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sra_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sra_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sra_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sra_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srai_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsraw $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srai_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsraw $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srai_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrad $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srai_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrad $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srav_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srav_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_srav_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srav_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_srl_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srl_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_srl_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srl_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_srl_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srl_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srli_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsrlw $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srli_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srli_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrld $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srli_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrld $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srli_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsrlq $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srli_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsrlq $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srli_si256:
+; X32: # BB#0:
+; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srli_si256:
+; X64: # BB#0:
+; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srlv_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srlv_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_srlv_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srlv_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srlv_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srlv_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_srlv_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srlv_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_stream_load_si256(<4 x i64>* %a0) {
+; X32-LABEL: test_mm256_stream_load_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovntdqa (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_stream_load_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovntdqa (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64>* %a0 to i8*
+ %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
+
+define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = sub <32 x i8> %arg0, %arg1
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = sub <16 x i16> %arg0, %arg1
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = sub <8 x i32> %arg0, %arg1
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = sub <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_xor_si256:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_xor_si256:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = xor <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index 36b6da5ef960..b6b8447beda1 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -1,7 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mattr=avx2 | FileCheck %s
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpblendw
+; CHECK-LABEL: test_x86_avx2_pblendw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; CHECK-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -9,7 +13,10 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpblendd
+; CHECK-LABEL: test_x86_avx2_pblendd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -17,7 +24,10 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpblendd
+; CHECK-LABEL: test_x86_avx2_pblendd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -25,7 +35,10 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind
define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vmpsadbw
+; CHECK-LABEL: test_x86_avx2_mpsadbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -33,7 +46,10 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind re
define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
- ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+; CHECK-LABEL: test_x86_avx2_psll_dq_bs:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -41,7 +57,10 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
- ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+; CHECK-LABEL: test_x86_avx2_psrl_dq_bs:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -49,7 +68,10 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
- ; CHECK: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; CHECK-LABEL: test_x86_avx2_psll_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -57,7 +79,10 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
- ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
+; CHECK-LABEL: test_x86_avx2_psrl_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -65,9 +90,11 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_avx2_vextracti128(<4 x i64> %a0) {
-; CHECK-LABEL: test_x86_avx2_vextracti128:
-; CHECK: vextracti128
-
+; CHECK-LABEL: test_x86_avx2_vextracti128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64> %a0, i8 7)
ret <2 x i64> %res
}
@@ -75,9 +102,10 @@ declare <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64>, i8) nounwind readnone
define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx2_vinserti128:
-; CHECK: vinserti128
-
+; CHECK-LABEL: test_x86_avx2_vinserti128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64> %a0, <2 x i64> %a1, i8 7)
ret <4 x i64> %res
}
@@ -85,10 +113,10 @@ declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind
define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
- ; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
- ; CHECK-NEXT: retl
+; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retl
%res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
ret <4 x double> %res
}
@@ -96,10 +124,10 @@ declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind
define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
- ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
- ; CHECK-NEXT: retl
+; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
ret <4 x float> %res
}
@@ -107,10 +135,10 @@ declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readon
define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
- ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
- ; CHECK-NEXT: retl
+; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
ret <8 x float> %res
}
@@ -203,3 +231,284 @@ define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
+
+
+define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: retl
+ %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
+
+; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode, since we don't have 256-bit integer instructions.
+define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
+ ; The add operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_dq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vpaddb LCPI33_0, %ymm0, %ymm0
+; CHECK-NEXT: vmovdqu %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+define <32 x i8> @mm256_max_epi8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_max_epi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_max_epi16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_max_epi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_max_epi32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_max_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_max_epu8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_max_epu8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_max_epu16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_max_epu16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_max_epu32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_max_epu32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_min_epi8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_min_epi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_min_epi16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_min_epi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_min_epi32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_min_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_min_epu8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_min_epu8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_min_epu16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_min_epu16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_min_epu32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 606aca9dc02b..2a04de5fe907 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1,7 +1,17 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mattr=avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=AVX512VL
define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpackssdw
+; AVX2-LABEL: test_x86_avx2_packssdw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_packssdw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -9,7 +19,15 @@ declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readno
define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpacksswb
+; AVX2-LABEL: test_x86_avx2_packsswb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_packsswb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -17,7 +35,15 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpackuswb
+; AVX2-LABEL: test_x86_avx2_packuswb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_packuswb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -25,7 +51,15 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpaddsb
+; AVX2-LABEL: test_x86_avx2_padds_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -33,7 +67,15 @@ declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpaddsw
+; AVX2-LABEL: test_x86_avx2_padds_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -41,7 +83,15 @@ declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpaddusb
+; AVX2-LABEL: test_x86_avx2_paddus_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_paddus_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -49,7 +99,15 @@ declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnon
define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpaddusw
+; AVX2-LABEL: test_x86_avx2_paddus_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_paddus_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -57,7 +115,15 @@ declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind read
define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpavgb
+; AVX2-LABEL: test_x86_avx2_pavg_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pavg_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -65,7 +131,15 @@ declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpavgw
+; AVX2-LABEL: test_x86_avx2_pavg_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pavg_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -73,7 +147,15 @@ declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readno
define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmaddwd
+; AVX2-LABEL: test_x86_avx2_pmadd_wd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -81,7 +163,15 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmaxs_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmaxsw
+; AVX2-LABEL: test_x86_avx2_pmaxs_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -89,7 +179,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pmaxu_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpmaxub
+; AVX2-LABEL: test_x86_avx2_pmaxu_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -97,7 +195,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmins_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpminsw
+; AVX2-LABEL: test_x86_avx2_pmins_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmins_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -105,7 +211,15 @@ declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pminu_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpminub
+; AVX2-LABEL: test_x86_avx2_pminu_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminu_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -113,7 +227,16 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
- ; CHECK: vpmovmskb
+; AVX2-LABEL: test_x86_avx2_pmovmskb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmovmskb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmovmskb %ymm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -121,7 +244,15 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmulhw
+; AVX2-LABEL: test_x86_avx2_pmulh_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulh_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -129,7 +260,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmulhuw
+; AVX2-LABEL: test_x86_avx2_pmulhu_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -137,7 +276,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read
define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpmuludq
+; AVX2-LABEL: test_x86_avx2_pmulu_dq:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulu_dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -145,7 +292,15 @@ declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnon
define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpsadbw
+; AVX2-LABEL: test_x86_avx2_psad_bw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psad_bw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -153,7 +308,15 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpslld
+; AVX2-LABEL: test_x86_avx2_psll_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -161,7 +324,15 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpsllq
+; AVX2-LABEL: test_x86_avx2_psll_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -169,7 +340,15 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpsllw
+; AVX2-LABEL: test_x86_avx2_psll_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -177,7 +356,15 @@ declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) {
- ; CHECK: vpslld
+; AVX2-LABEL: test_x86_avx2_pslli_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpslld $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpslld $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -185,7 +372,15 @@ declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) {
- ; CHECK: vpsllq
+; AVX2-LABEL: test_x86_avx2_pslli_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllq $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -193,7 +388,15 @@ declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) {
- ; CHECK: vpsllw
+; AVX2-LABEL: test_x86_avx2_pslli_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -201,7 +404,15 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsrad
+; AVX2-LABEL: test_x86_avx2_psra_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psra_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -209,7 +420,15 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpsraw
+; AVX2-LABEL: test_x86_avx2_psra_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psra_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -217,7 +436,15 @@ declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) {
- ; CHECK: vpsrad
+; AVX2-LABEL: test_x86_avx2_psrai_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrad $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrai_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrad $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -225,7 +452,15 @@ declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) {
- ; CHECK: vpsraw
+; AVX2-LABEL: test_x86_avx2_psrai_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsraw $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrai_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsraw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -233,7 +468,15 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsrld
+; AVX2-LABEL: test_x86_avx2_psrl_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -241,7 +484,15 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpsrlq
+; AVX2-LABEL: test_x86_avx2_psrl_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -249,7 +500,15 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpsrlw
+; AVX2-LABEL: test_x86_avx2_psrl_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -257,7 +516,15 @@ declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) {
- ; CHECK: vpsrld
+; AVX2-LABEL: test_x86_avx2_psrli_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrld $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrld $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -265,7 +532,15 @@ declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) {
- ; CHECK: vpsrlq
+; AVX2-LABEL: test_x86_avx2_psrli_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -273,7 +548,15 @@ declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) {
- ; CHECK: vpsrlw
+; AVX2-LABEL: test_x86_avx2_psrli_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -281,7 +564,15 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpsubsb
+; AVX2-LABEL: test_x86_avx2_psubs_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psubs_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -289,7 +580,15 @@ declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpsubsw
+; AVX2-LABEL: test_x86_avx2_psubs_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psubs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -297,7 +596,15 @@ declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpsubusb
+; AVX2-LABEL: test_x86_avx2_psubus_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psubus_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -305,7 +612,15 @@ declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnon
define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpsubusw
+; AVX2-LABEL: test_x86_avx2_psubus_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psubus_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -313,7 +628,15 @@ declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind read
define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
- ; CHECK: vpabsb
+; AVX2-LABEL: test_x86_avx2_pabs_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpabsb %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pabs_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsb %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -321,7 +644,15 @@ declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_pabs_d(<8 x i32> %a0) {
- ; CHECK: vpabsd
+; AVX2-LABEL: test_x86_avx2_pabs_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pabs_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsd %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -329,7 +660,15 @@ declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
- ; CHECK: vpabsw
+; AVX2-LABEL: test_x86_avx2_pabs_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpabsw %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pabs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsw %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -337,7 +676,15 @@ declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vphaddd
+; AVX2-LABEL: test_x86_avx2_phadd_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phadd_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -345,7 +692,15 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vphaddsw
+; AVX2-LABEL: test_x86_avx2_phadd_sw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phadd_sw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -353,7 +708,15 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read
define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vphaddw
+; AVX2-LABEL: test_x86_avx2_phadd_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phadd_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -361,7 +724,15 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn
define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vphsubd
+; AVX2-LABEL: test_x86_avx2_phsub_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phsub_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -369,7 +740,15 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vphsubsw
+; AVX2-LABEL: test_x86_avx2_phsub_sw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phsub_sw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -377,7 +756,15 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read
define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vphsubw
+; AVX2-LABEL: test_x86_avx2_phsub_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phsub_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -385,7 +772,15 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpmaddubsw
+; AVX2-LABEL: test_x86_avx2_pmadd_ub_sw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -393,7 +788,15 @@ declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind rea
define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmulhrsw
+; AVX2-LABEL: test_x86_avx2_pmul_hr_sw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -401,7 +804,15 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re
define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpshufb
+; AVX2-LABEL: test_x86_avx2_pshuf_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pshuf_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
   %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -409,7 +820,15 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpsignb
+; AVX2-LABEL: test_x86_avx2_psign_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psign_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -417,7 +836,15 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpsignd
+; AVX2-LABEL: test_x86_avx2_psign_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psign_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
   %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -425,7 +852,15 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpsignw
+; AVX2-LABEL: test_x86_avx2_psign_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psign_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -433,8 +868,17 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn
define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
- ; CHECK: movl
- ; CHECK: vmovntdqa
+; AVX2-LABEL: test_x86_avx2_movntdqa:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vmovntdqa (%eax), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_movntdqa:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmovntdqa (%eax), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -442,7 +886,15 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vmpsadbw
+; AVX2-LABEL: test_x86_avx2_mpsadbw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_mpsadbw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -450,7 +902,15 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpackusdw
+; AVX2-LABEL: test_x86_avx2_packusdw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_packusdw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -458,7 +918,15 @@ declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readno
define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
- ; CHECK: vpblendvb
+; AVX2-LABEL: test_x86_avx2_pblendvb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pblendvb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -466,7 +934,15 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpblendw
+; AVX2-LABEL: test_x86_avx2_pblendw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pblendw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -474,7 +950,15 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r
define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpmaxsb
+; AVX2-LABEL: test_x86_avx2_pmaxsb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxsb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -482,7 +966,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpmaxsd
+; AVX2-LABEL: test_x86_avx2_pmaxsd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxsd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -490,7 +982,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpmaxud
+; AVX2-LABEL: test_x86_avx2_pmaxud:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxud:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -498,7 +998,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmaxuw
+; AVX2-LABEL: test_x86_avx2_pmaxuw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -506,7 +1014,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpminsb
+; AVX2-LABEL: test_x86_avx2_pminsb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminsb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -514,7 +1030,15 @@ declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpminsd
+; AVX2-LABEL: test_x86_avx2_pminsd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminsd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -522,7 +1046,15 @@ declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpminud
+; AVX2-LABEL: test_x86_avx2_pminud:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminud:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -530,111 +1062,22 @@ declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpminuw
+; AVX2-LABEL: test_x86_avx2_pminuw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
-define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
- ; CHECK: vpmovsxbd
- %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
- ; CHECK: vpmovsxbq
- %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
- ; CHECK: vpmovsxbw
- %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
- ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
- ; CHECK: vpmovsxdq
- %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
- ; CHECK: vpmovsxwd
- %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
- ; CHECK: vpmovsxwq
- %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
- ; CHECK: vpmovzxbd
- %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
- ; CHECK: vpmovzxbq
- %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
- ; CHECK: vpmovzxbw
- %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
- ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
- ; CHECK: vpmovzxdq
- %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
- ; CHECK: vpmovzxwd
- %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
- ; CHECK: vpmovzxwq
- %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
-
-
define <4 x i64> @test_x86_avx2_pmul.dq(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpmuldq
   %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -642,7 +1085,15 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpblendd
+; AVX2-LABEL: test_x86_avx2_pblendd_128:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pblendd_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -650,29 +1101,53 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpblendd
+; AVX2-LABEL: test_x86_avx2_pblendd_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pblendd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
+; Check that the arguments are swapped between the intrinsic definition
+; and its lowering. Indeed, the offsets are the first source in
+; the instruction.
define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) {
- ; Check that the arguments are swapped between the intrinsic definition
- ; and its lowering. Indeed, the offsets are the first source in
- ; the instruction.
- ; CHECK: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-LABEL: test_x86_avx2_permd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_permd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+; Check that the arguments are swapped between the intrinsic definition
+; and its lowering. Indeed, the offsets are the first source in
+; the instruction.
define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) {
- ; Check that the arguments are swapped between the intrinsic definition
- ; and its lowering. Indeed, the offsets are the first source in
- ; the instruction.
- ; CHECK: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-LABEL: test_x86_avx2_permps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_permps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -680,7 +1155,15 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado
define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
- ; CHECK: vperm2i128
+; AVX2-LABEL: test_x86_avx2_vperm2i128:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_vperm2i128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -688,7 +1171,17 @@ declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind r
define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) {
- ; CHECK: vpmaskmovq
+; AVX2-LABEL: test_x86_avx2_maskload_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskload_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -696,7 +1189,17 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
define <4 x i64> @test_x86_avx2_maskload_q_256(i8* %a0, <4 x i64> %a1) {
- ; CHECK: vpmaskmovq
+; AVX2-LABEL: test_x86_avx2_maskload_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskload_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -704,7 +1207,17 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonl
define <4 x i32> @test_x86_avx2_maskload_d(i8* %a0, <4 x i32> %a1) {
- ; CHECK: vpmaskmovd
+; AVX2-LABEL: test_x86_avx2_maskload_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskload_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -712,7 +1225,17 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
define <8 x i32> @test_x86_avx2_maskload_d_256(i8* %a0, <8 x i32> %a1) {
- ; CHECK: vpmaskmovd
+; AVX2-LABEL: test_x86_avx2_maskload_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskload_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -720,7 +1243,17 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonl
define void @test_x86_avx2_maskstore_q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
- ; CHECK: vpmaskmovq
+; AVX2-LABEL: test_x86_avx2_maskstore_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskstore_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx2.maskstore.q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2)
ret void
}
@@ -728,7 +1261,18 @@ declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
- ; CHECK: vpmaskmovq
+; AVX2-LABEL: test_x86_avx2_maskstore_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskstore_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
ret void
}
@@ -736,7 +1280,17 @@ declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
define void @test_x86_avx2_maskstore_d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
- ; CHECK: vpmaskmovd
+; AVX2-LABEL: test_x86_avx2_maskstore_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskstore_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx2.maskstore.d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2)
ret void
}
@@ -744,7 +1298,18 @@ declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
- ; CHECK: vpmaskmovd
+; AVX2-LABEL: test_x86_avx2_maskstore_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskstore_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
ret void
}
@@ -752,7 +1317,15 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsllvd
+; AVX2-LABEL: test_x86_avx2_psllv_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -760,7 +1333,15 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpsllvd
+; AVX2-LABEL: test_x86_avx2_psllv_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -768,7 +1349,15 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read
define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpsllvq
+; AVX2-LABEL: test_x86_avx2_psllv_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -776,7 +1365,15 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
- ; CHECK: vpsllvq
+; AVX2-LABEL: test_x86_avx2_psllv_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -784,7 +1381,15 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read
define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsrlvd
+; AVX2-LABEL: test_x86_avx2_psrlv_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -792,7 +1397,15 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpsrlvd
+; AVX2-LABEL: test_x86_avx2_psrlv_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -800,7 +1413,15 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read
define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpsrlvq
+; AVX2-LABEL: test_x86_avx2_psrlv_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -808,7 +1429,15 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
- ; CHECK: vpsrlvq
+; AVX2-LABEL: test_x86_avx2_psrlv_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -816,33 +1445,79 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read
define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsravd
+; AVX2-LABEL: test_x86_avx2_psrav_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
+define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
+; AVX2-LABEL: test_x86_avx2_psrav_d_const:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
+; AVX2-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [2,9,4294967284,23]
+; AVX512VL-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpsravd
+; AVX2-LABEL: test_x86_avx2_psrav_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
-; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
-define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
- ; CHECK: vmovdqu
- ; add operation forces the execution domain.
- %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
- ret void
+define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1) {
+; AVX2-LABEL: test_x86_avx2_psrav_d_256_const:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; AVX2-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; AVX512VL-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
+ ret <8 x i32> %res
}
-declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
-define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
- <4 x i32> %idx, <2 x double> %mask) {
- ; CHECK: vgatherdpd
+define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_pd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
ret <2 x double> %res
@@ -850,9 +1525,18 @@ define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
<4 x i32>, <2 x double>, i8) nounwind readonly
-define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
- <4 x i32> %idx, <4 x double> %mask) {
- ; CHECK: vgatherdpd
+define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1, <4 x i32> %idx, <4 x double> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_pd_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ;
ret <4 x double> %res
@@ -860,9 +1544,18 @@ define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
<4 x i32>, <4 x double>, i8) nounwind readonly
-define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
- <2 x i64> %idx, <2 x double> %mask) {
- ; CHECK: vgatherqpd
+define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1, <2 x i64> %idx, <2 x double> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_pd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
ret <2 x double> %res
@@ -870,9 +1563,18 @@ define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
<2 x i64>, <2 x double>, i8) nounwind readonly
-define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
- <4 x i64> %idx, <4 x double> %mask) {
- ; CHECK: vgatherqpd
+define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1, <4 x i64> %idx, <4 x double> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_pd_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
ret <4 x double> %res
@@ -880,9 +1582,18 @@ define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
<4 x i64>, <4 x double>, i8) nounwind readonly
-define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
- <4 x i32> %idx, <4 x float> %mask) {
- ; CHECK: vgatherdps
+define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1, <4 x i32> %idx, <4 x float> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_ps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -890,9 +1601,18 @@ define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
<4 x i32>, <4 x float>, i8) nounwind readonly
-define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
- <8 x i32> %idx, <8 x float> %mask) {
- ; CHECK: vgatherdps
+define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1, <8 x i32> %idx, <8 x float> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_ps_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
ret <8 x float> %res
@@ -900,9 +1620,18 @@ define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
<8 x i32>, <8 x float>, i8) nounwind readonly
-define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
- <2 x i64> %idx, <4 x float> %mask) {
- ; CHECK: vgatherqps
+define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1, <2 x i64> %idx, <4 x float> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_ps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -910,9 +1639,19 @@ define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
<2 x i64>, <4 x float>, i8) nounwind readonly
-define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1,
- <4 x i64> %idx, <4 x float> %mask) {
- ; CHECK: vgatherqps
+define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_ps_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -920,9 +1659,18 @@ define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1,
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*,
<4 x i64>, <4 x float>, i8) nounwind readonly
-define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1,
- <4 x i32> %idx, <2 x i64> %mask) {
- ; CHECK: vpgatherdq
+define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1, <4 x i32> %idx, <2 x i64> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0,
i8* %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ;
ret <2 x i64> %res
@@ -930,9 +1678,18 @@ define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1,
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*,
<4 x i32>, <2 x i64>, i8) nounwind readonly
-define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1,
- <4 x i32> %idx, <4 x i64> %mask) {
- ; CHECK: vpgatherdq
+define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1, <4 x i32> %idx, <4 x i64> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0,
i8* %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ;
ret <4 x i64> %res
@@ -940,9 +1697,18 @@ define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1,
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*,
<4 x i32>, <4 x i64>, i8) nounwind readonly
-define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1,
- <2 x i64> %idx, <2 x i64> %mask) {
- ; CHECK: vpgatherqq
+define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1, <2 x i64> %idx, <2 x i64> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
i8* %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ;
ret <2 x i64> %res
@@ -950,9 +1716,18 @@ define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1,
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*,
<2 x i64>, <2 x i64>, i8) nounwind readonly
-define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1,
- <4 x i64> %idx, <4 x i64> %mask) {
- ; CHECK: vpgatherqq
+define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1, <4 x i64> %idx, <4 x i64> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0,
i8* %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ;
ret <4 x i64> %res
@@ -960,9 +1735,18 @@ define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1,
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*,
<4 x i64>, <4 x i64>, i8) nounwind readonly
-define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1,
- <4 x i32> %idx, <4 x i32> %mask) {
- ; CHECK: vpgatherdd
+define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1, <4 x i32> %idx, <4 x i32> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0,
i8* %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -970,9 +1754,18 @@ define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1,
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*,
<4 x i32>, <4 x i32>, i8) nounwind readonly
-define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1,
- <8 x i32> %idx, <8 x i32> %mask) {
- ; CHECK: vpgatherdd
+define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1, <8 x i32> %idx, <8 x i32> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0,
i8* %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ;
ret <8 x i32> %res
@@ -980,9 +1773,18 @@ define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1,
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*,
<8 x i32>, <8 x i32>, i8) nounwind readonly
-define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1,
- <2 x i64> %idx, <4 x i32> %mask) {
- ; CHECK: vpgatherqd
+define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1, <2 x i64> %idx, <4 x i32> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0,
i8* %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -990,9 +1792,19 @@ define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1,
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
<2 x i64>, <4 x i32>, i8) nounwind readonly
-define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1,
- <4 x i64> %idx, <4 x i32> %mask) {
- ; CHECK: vpgatherqd
+define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -1001,13 +1813,25 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*,
<4 x i64>, <4 x i32>, i8) nounwind readonly
; PR13298
-define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a,
- <8 x i32> %idx, <8 x float> %mask,
- float* nocapture %out) {
-; CHECK: test_gather_mask
-; CHECK: vmovaps %ymm2, [[DEST:%.*]]
-; CHECK: vgatherdps [[DEST]]
+define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a, <8 x i32> %idx, <8 x float> %mask, float* nocapture %out) {
;; gather with mask
+; AVX2-LABEL: test_gather_mask:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX2-NEXT: vmovaps %ymm2, %ymm3
+; AVX2-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0
+; AVX2-NEXT: vmovups %ymm2, (%eax)
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_gather_mask:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: vmovaps %ymm2, %ymm3
+; AVX512VL-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0
+; AVX512VL-NEXT: vmovups %ymm2, (%eax)
+; AVX512VL-NEXT: retl
%a_i8 = bitcast float* %a to i8*
%res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
i8* %a_i8, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
diff --git a/test/CodeGen/X86/avx2-logic.ll b/test/CodeGen/X86/avx2-logic.ll
index 3d4fcec6078e..e187933f66be 100644
--- a/test/CodeGen/X86/avx2-logic.ll
+++ b/test/CodeGen/X86/avx2-logic.ll
@@ -53,35 +53,6 @@ define <32 x i8> @vpblendvb(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y) {
ret <32 x i8> %min
}
-define <8 x i32> @signd(<8 x i32> %a, <8 x i32> %b) nounwind {
-entry:
-; CHECK-LABEL: signd:
-; CHECK: psignd
-; CHECK-NOT: sub
-; CHECK: ret
- %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
- %sub = sub nsw <8 x i32> zeroinitializer, %a
- %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
- %1 = and <8 x i32> %a, %0
- %2 = and <8 x i32> %b.lobit, %sub
- %cond = or <8 x i32> %1, %2
- ret <8 x i32> %cond
-}
-
-define <8 x i32> @blendvb(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) nounwind {
-entry:
-; CHECK-LABEL: blendvb:
-; CHECK: pblendvb
-; CHECK: ret
- %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
- %sub = sub nsw <8 x i32> zeroinitializer, %a
- %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
- %1 = and <8 x i32> %c, %0
- %2 = and <8 x i32> %a, %b.lobit
- %cond = or <8 x i32> %1, %2
- ret <8 x i32> %cond
-}
-
define <8 x i32> @allOnes() nounwind {
; CHECK: vpcmpeqd
; CHECK-NOT: vinsert
diff --git a/test/CodeGen/X86/avx2-nontemporal.ll b/test/CodeGen/X86/avx2-nontemporal.ll
index 058358f13b86..55c966f6f884 100644
--- a/test/CodeGen/X86/avx2-nontemporal.ll
+++ b/test/CodeGen/X86/avx2-nontemporal.ll
@@ -1,18 +1,69 @@
-; RUN: llc < %s -march=x86 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X64
-define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E) {
-; CHECK: vmovntps %y
+define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E, <8 x i32> %F, <16 x i16> %G, <32 x i8> %H) nounwind {
+; X32-LABEL: f:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-32, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: vmovdqa 104(%ebp), %ymm3
+; X32-NEXT: vmovdqa 72(%ebp), %ymm4
+; X32-NEXT: vmovdqa 40(%ebp), %ymm5
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: vaddps .LCPI0_0, %ymm0, %ymm0
+; X32-NEXT: vmovntps %ymm0, (%eax)
+; X32-NEXT: vpaddq .LCPI0_1, %ymm2, %ymm0
+; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: vaddpd .LCPI0_2, %ymm1, %ymm0
+; X32-NEXT: vmovntpd %ymm0, (%eax)
+; X32-NEXT: vpaddd .LCPI0_3, %ymm5, %ymm0
+; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: vpaddw .LCPI0_4, %ymm4, %ymm0
+; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: vpaddb .LCPI0_5, %ymm3, %ymm0
+; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: f:
+; X64: # BB#0:
+; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovntps %ymm0, (%rdi)
+; X64-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm0
+; X64-NEXT: vmovntdq %ymm0, (%rdi)
+; X64-NEXT: vaddpd {{.*}}(%rip), %ymm1, %ymm0
+; X64-NEXT: vmovntpd %ymm0, (%rdi)
+; X64-NEXT: vpaddd {{.*}}(%rip), %ymm3, %ymm0
+; X64-NEXT: vmovntdq %ymm0, (%rdi)
+; X64-NEXT: vpaddw {{.*}}(%rip), %ymm4, %ymm0
+; X64-NEXT: vmovntdq %ymm0, (%rdi)
+; X64-NEXT: vpaddb {{.*}}(%rip), %ymm5, %ymm0
+; X64-NEXT: vmovntdq %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%cast = bitcast i8* %B to <8 x float>*
- %A2 = fadd <8 x float> %A, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x4200000000000000>
+ %A2 = fadd <8 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
-; CHECK: vmovntdq %y
%cast1 = bitcast i8* %B to <4 x i64>*
%E2 = add <4 x i64> %E, <i64 1, i64 2, i64 3, i64 4>
store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
-; CHECK: vmovntpd %y
%cast2 = bitcast i8* %B to <4 x double>*
- %C2 = fadd <4 x double> %C, <double 0x0, double 0x0, double 0x0, double 0x4200000000000000>
+ %C2 = fadd <4 x double> %C, <double 1.0, double 2.0, double 3.0, double 4.0>
store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
+ %cast3 = bitcast i8* %B to <8 x i32>*
+ %F2 = add <8 x i32> %F, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ store <8 x i32> %F2, <8 x i32>* %cast3, align 32, !nontemporal !0
+ %cast4 = bitcast i8* %B to <16 x i16>*
+ %G2 = add <16 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+ store <16 x i16> %G2, <16 x i16>* %cast4, align 32, !nontemporal !0
+ %cast5 = bitcast i8* %B to <32 x i8>*
+ %H2 = add <32 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
+ store <32 x i8> %H2, <32 x i8>* %cast5, align 32, !nontemporal !0
ret void
}
diff --git a/test/CodeGen/X86/avx2-phaddsub.ll b/test/CodeGen/X86/avx2-phaddsub.ll
index 3f9c95cfd070..88c70ad84fa0 100644
--- a/test/CodeGen/X86/avx2-phaddsub.ll
+++ b/test/CodeGen/X86/avx2-phaddsub.ll
@@ -1,71 +1,88 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s
-; CHECK-LABEL: phaddw1:
-; CHECK: vphaddw
define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: phaddw1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
%b = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
%r = add <16 x i16> %a, %b
ret <16 x i16> %r
}
-; CHECK-LABEL: phaddw2:
-; CHECK: vphaddw
define <16 x i16> @phaddw2(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: phaddw2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
%b = shufflevector <16 x i16> %y, <16 x i16> %x, <16 x i32> <i32 16, i32 18, i32 20, i32 22, i32 0, i32 2, i32 4, i32 6, i32 24, i32 26, i32 28, i32 30, i32 8, i32 10, i32 12, i32 14>
%r = add <16 x i16> %a, %b
ret <16 x i16> %r
}
-; CHECK-LABEL: phaddd1:
-; CHECK: vphaddd
define <8 x i32> @phaddd1(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: phaddd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
%r = add <8 x i32> %a, %b
ret <8 x i32> %r
}
-; CHECK-LABEL: phaddd2:
-; CHECK: vphaddd
define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: phaddd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
%b = shufflevector <8 x i32> %y, <8 x i32> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
%r = add <8 x i32> %a, %b
ret <8 x i32> %r
}
-; CHECK-LABEL: phaddd3:
-; CHECK: vphaddd
define <8 x i32> @phaddd3(<8 x i32> %x) {
+; CHECK-LABEL: phaddd3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = add <8 x i32> %a, %b
ret <8 x i32> %r
}
-; CHECK-LABEL: phsubw1:
-; CHECK: vphsubw
define <16 x i16> @phsubw1(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: phsubw1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
%b = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
%r = sub <16 x i16> %a, %b
ret <16 x i16> %r
}
-; CHECK-LABEL: phsubd1:
-; CHECK: vphsubd
define <8 x i32> @phsubd1(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: phsubd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
%r = sub <8 x i32> %a, %b
ret <8 x i32> %r
}
-; CHECK-LABEL: phsubd2:
-; CHECK: vphsubd
define <8 x i32> @phsubd2(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: phsubd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 undef, i32 8, i32 undef, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 undef, i32 9, i32 11, i32 5, i32 7, i32 undef, i32 15>
%r = sub <8 x i32> %a, %b
diff --git a/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll b/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll
deleted file mode 100644
index 6bd6a5041d41..000000000000
--- a/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 | FileCheck %s
-
-define <16 x i16> @test_lvm_x86_avx2_pmovsxbw(<16 x i8>* %a) {
-; CHECK-LABEL: test_lvm_x86_avx2_pmovsxbw
-; CHECK: vpmovsxbw (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %1)
- ret <16 x i16> %2
-}
-
-define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbd
-; CHECK: vpmovsxbd (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %1)
- ret <8 x i32> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbq
-; CHECK: vpmovsxbq (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %1)
- ret <4 x i64> %2
-}
-
-define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwd
-; CHECK: vpmovsxwd (%rdi), %ymm0
- %1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %1)
- ret <8 x i32> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwq
-; CHECK: vpmovsxwq (%rdi), %ymm0
- %1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %1)
- ret <4 x i64> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxdq
-; CHECK: vpmovsxdq (%rdi), %ymm0
- %1 = load <4 x i32>, <4 x i32>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %1)
- ret <4 x i64> %2
-}
-
-define <16 x i16> @test_lvm_x86_avx2_pmovzxbw(<16 x i8>* %a) {
-; CHECK-LABEL: test_lvm_x86_avx2_pmovzxbw
-; CHECK: vpmovzxbw (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %1)
- ret <16 x i16> %2
-}
-
-define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbd
-; CHECK: vpmovzxbd (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %1)
- ret <8 x i32> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbq
-; CHECK: vpmovzxbq (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %1)
- ret <4 x i64> %2
-}
-
-define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwd
-; CHECK: vpmovzxwd (%rdi), %ymm0
- %1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %1)
- ret <8 x i32> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwq
-; CHECK: vpmovzxwq (%rdi), %ymm0
- %1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %1)
- ret <4 x i64> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxdq
-; CHECK: vpmovzxdq (%rdi), %ymm0
- %1 = load <4 x i32>, <4 x i32>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %1)
- ret <4 x i64> %2
-}
-
-declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>)
-declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>)
-declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>)
-declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>)
-declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>)
-declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>)
-declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>)
-declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>)
-declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>)
-declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>)
-declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>)
-declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>)
diff --git a/test/CodeGen/X86/avx2-pmovxrm.ll b/test/CodeGen/X86/avx2-pmovxrm.ll
new file mode 100644
index 000000000000..1d0626f66eea
--- /dev/null
+++ b/test/CodeGen/X86/avx2-pmovxrm.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define <16 x i16> @test_llvm_x86_avx2_pmovsxbw(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxbw:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxbw (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxbw:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxbw (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = sext <16 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxbd:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxbd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxbd:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxbd (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = sext <8 x i8> %2 to <8 x i32>
+ ret <8 x i32> %3
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxbq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxbq (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxbq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxbq (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxwd:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxwd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxwd:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxwd (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a, align 1
+ %2 = sext <8 x i16> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxwq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxwq (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxwq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxwq (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a, align 1
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxdq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxdq (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxdq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxdq (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %a, align 1
+ %2 = sext <4 x i32> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <16 x i16> @test_llvm_x86_avx2_pmovzxbw(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxbw:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxbw:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = zext <16 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxbd:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxbd:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = zext <8 x i8> %2 to <8 x i32>
+ ret <8 x i32> %3
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxbq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxbq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxwd:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxwd:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a, align 1
+ %2 = zext <8 x i16> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxwq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxwq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a, align 1
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxdq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxdq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %a, align 1
+ %2 = zext <4 x i32> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 8fd50ae3015d..2ecf2fa5a6e7 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -442,8 +442,7 @@ define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable re
; X32-LABEL: load_splat_2i64_2i64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
@@ -494,14 +493,12 @@ define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwta
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps (%eax), %xmm0
-; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64: ## BB#0: ## %entry
-; X64-NEXT: vmovaps (%rdi), %xmm0
-; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
@@ -643,7 +640,7 @@ define void @crash() nounwind alwaysinline {
; X32-NEXT: je LBB31_1
; X32-NEXT: ## BB#2: ## %ret
; X32-NEXT: retl
-; X32-NEXT: .align 4, 0x90
+; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: LBB31_1: ## %footer349VF
; X32-NEXT: ## =>This Inner Loop Header: Depth=1
; X32-NEXT: jmp LBB31_1
@@ -655,7 +652,7 @@ define void @crash() nounwind alwaysinline {
; X64-NEXT: je LBB31_1
; X64-NEXT: ## BB#2: ## %ret
; X64-NEXT: retq
-; X64-NEXT: .align 4, 0x90
+; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: LBB31_1: ## %footer349VF
; X64-NEXT: ## =>This Inner Loop Header: Depth=1
; X64-NEXT: jmp LBB31_1
diff --git a/test/CodeGen/X86/avx2-vbroadcasti128.ll b/test/CodeGen/X86/avx2-vbroadcasti128.ll
new file mode 100644
index 000000000000..2f11735af046
--- /dev/null
+++ b/test/CodeGen/X86/avx2-vbroadcasti128.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
+; X32-LABEL: test_broadcast_2f64_4f64:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovapd (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vaddpd LCPI0_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2f64_4f64:
+; X64: ## BB#0:
+; X64-NEXT: vmovapd (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <2 x double>, <2 x double> *%p
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
+ ret <4 x double> %3
+}
+
+define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
+; X32-LABEL: test_broadcast_2i64_4i64:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovdqa (%eax), %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpaddq LCPI1_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2i64_4i64:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64> *%p
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
+ ret <4 x i64> %3
+}
+
+define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
+; X32-LABEL: test_broadcast_4f32_8f32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vaddps LCPI2_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4f32_8f32:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x float>, <4 x float> *%p
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
+ ret <8 x float> %3
+}
+
+define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
+; X32-LABEL: test_broadcast_4i32_8i32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovdqa (%eax), %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpaddd LCPI3_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4i32_8i32:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32> *%p
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ ret <8 x i32> %3
+}
+
+define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
+; X32-LABEL: test_broadcast_8i16_16i16:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovdqa (%eax), %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpaddw LCPI4_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_8i16_16i16:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16> *%p
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
+ ret <16 x i16> %3
+}
+
+define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
+; X32-LABEL: test_broadcast_16i8_32i8:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovdqa (%eax), %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpaddb LCPI5_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_16i8_32i8:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8> *%p
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
+ ret <32 x i8> %3
+}
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index b92b78035009..c9ab80bc5499 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -1,266 +1,266 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
; AVX2 Logical Shift Left
define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sllw_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sllw_1:
-; CHECK-NOT: vpsllw $0, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sllw_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpaddw %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sllw_2:
-; CHECK: vpaddw %ymm0, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sllw_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sllw_3:
-; CHECK: vpsllw $15, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
+; CHECK-LABEL: test_slld_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_slld_1:
-; CHECK-NOT: vpslld $0, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
+; CHECK-LABEL: test_slld_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_slld_2:
-; CHECK: vpaddd %ymm0, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_vpslld_var(i32 %shift) {
+; CHECK-LABEL: test_vpslld_var:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
+; CHECK-NEXT: vpslld %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%amt = insertelement <8 x i32> undef, i32 %shift, i32 0
%tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
ret <8 x i32> %tmp
}
-; CHECK-LABEL: test_vpslld_var:
-; CHECK: vpslld %xmm0, %ymm1, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
+; CHECK-LABEL: test_slld_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_slld_3:
-; CHECK: vpslld $31, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
+; CHECK-LABEL: test_sllq_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_sllq_1:
-; CHECK-NOT: vpsllq $0, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
+; CHECK-LABEL: test_sllq_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_sllq_2:
-; CHECK: vpaddq %ymm0, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
+; CHECK-LABEL: test_sllq_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsllq $63, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_sllq_3:
-; CHECK: vpsllq $63, %ymm0, %ymm0
-; CHECK: ret
-
; AVX2 Arithmetic Shift
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sraw_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sraw_1:
-; CHECK-NOT: vpsraw $0, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sraw_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsraw $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sraw_2:
-; CHECK: vpsraw $1, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sraw_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sraw_3:
-; CHECK: vpsraw $15, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srad_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srad_1:
-; CHECK-NOT: vpsrad $0, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srad_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrad $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srad_2:
-; CHECK: vpsrad $1, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srad_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrad $31, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srad_3:
-; CHECK: vpsrad $31, %ymm0, %ymm0
-; CHECK: ret
-
; SSE Logical Shift Right
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
+; CHECK-LABEL: test_srlw_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_srlw_1:
-; CHECK-NOT: vpsrlw $0, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
+; CHECK-LABEL: test_srlw_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_srlw_2:
-; CHECK: vpsrlw $1, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
+; CHECK-LABEL: test_srlw_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrlw $15, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_srlw_3:
-; CHECK: vpsrlw $15, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srld_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srld_1:
-; CHECK-NOT: vpsrld $0, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srld_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrld $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srld_2:
-; CHECK: vpsrld $1, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srld_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srld_3:
-; CHECK: vpsrld $31, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
+; CHECK-LABEL: test_srlq_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_srlq_1:
-; CHECK-NOT: vpsrlq $0, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
+; CHECK-LABEL: test_srlq_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrlq $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_srlq_2:
-; CHECK: vpsrlq $1, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
+; CHECK-LABEL: test_srlq_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_srlq_3:
-; CHECK: vpsrlq $63, %ymm0, %ymm0
-; CHECK: ret
-
-; CHECK-LABEL: @srl_trunc_and_v4i64
-; CHECK: vpand
-; CHECK-NEXT: vpsrlvd
-; CHECK: ret
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
+; CHECK-LABEL: srl_trunc_and_v4i64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
%trunc = trunc <4 x i64> %and to <4 x i32>
%sra = lshr <4 x i32> %x, %trunc
@@ -272,156 +272,171 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
;
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: shl_8i16
-; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK: retq
+; CHECK-LABEL: shl_8i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%shl = shl <8 x i16> %r, %a
ret <8 x i16> %shl
}
define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: shl_16i16
-; CHECK: vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; CHECK-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: shl_16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; CHECK-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shl = shl <16 x i16> %r, %a
ret <16 x i16> %shl
}
define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: shl_32i8
-; CHECK: vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpsllw $2, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: shl_32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
+; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpsllw $2, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shl = shl <32 x i8> %r, %a
ret <32 x i8> %shl
}
define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: ashr_8i16
-; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK: retq
+; CHECK-LABEL: ashr_8i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%ashr = ashr <8 x i16> %r, %a
ret <8 x i16> %ashr
}
define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: ashr_16i16
-; CHECK: vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; CHECK-NEXT: vpsravd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: ashr_16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; CHECK-NEXT: vpsravd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
%ashr = ashr <16 x i16> %r, %a
ret <16 x i16> %ashr
}
define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: ashr_32i8
-; CHECK: vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-NEXT: vpsraw $4, %ymm3, %ymm4
-; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; CHECK-NEXT: vpsraw $2, %ymm3, %ymm4
-; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; CHECK-NEXT: vpsraw $1, %ymm3, %ymm4
-; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-NEXT: vpsraw $4, %ymm0, %ymm3
-; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: vpsraw $2, %ymm0, %ymm3
-; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: vpsraw $1, %ymm0, %ymm3
-; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
-; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: ashr_32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
+; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-NEXT: vpsraw $4, %ymm3, %ymm4
+; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpsraw $2, %ymm3, %ymm4
+; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpsraw $1, %ymm3, %ymm4
+; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-NEXT: vpsraw $4, %ymm0, %ymm3
+; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpsraw $2, %ymm0, %ymm3
+; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpsraw $1, %ymm0, %ymm3
+; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
+; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%ashr = ashr <32 x i8> %r, %a
ret <32 x i8> %ashr
}
define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: lshr_8i16
-; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK: retq
+; CHECK-LABEL: lshr_8i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%lshr = lshr <8 x i16> %r, %a
ret <8 x i16> %lshr
}
define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: lshr_16i16
-; CHECK: vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; CHECK-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: lshr_16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; CHECK-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
%lshr = lshr <16 x i16> %r, %a
ret <16 x i16> %lshr
}
define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: lshr_32i8
-; CHECK: vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw $2, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: lshr_32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
+; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw $2, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%lshr = lshr <32 x i8> %r, %a
ret <32 x i8> %lshr
}
diff --git a/test/CodeGen/X86/avx2-vperm.ll b/test/CodeGen/X86/avx2-vperm.ll
index d576d0e3741e..cba8bbe4af40 100755
--- a/test/CodeGen/X86/avx2-vperm.ll
+++ b/test/CodeGen/X86/avx2-vperm.ll
@@ -1,34 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone {
+; CHECK-LABEL: perm_cl_int_8x32:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
entry:
-; CHECK: perm_cl_int_8x32
-; CHECK: vpermd
%B = shufflevector <8 x i32> %A, <8 x i32> undef, <8 x i32> <i32 0, i32 7, i32 2, i32 1, i32 2, i32 7, i32 6, i32 0>
ret <8 x i32> %B
}
define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone {
+; CHECK-LABEL: perm_cl_fp_8x32:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
entry:
-; CHECK: perm_cl_fp_8x32
-; CHECK: vpermps
%B = shufflevector <8 x float> %A, <8 x float> undef, <8 x i32> <i32 undef, i32 7, i32 2, i32 undef, i32 4, i32 undef, i32 1, i32 6>
ret <8 x float> %B
}
define <4 x i64> @perm_cl_int_4x64(<4 x i64> %A) nounwind readnone {
+; CHECK-LABEL: perm_cl_int_4x64:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; CHECK-NEXT: retq
entry:
-; CHECK: perm_cl_int_4x64
-; CHECK: vpermq
%B = shufflevector <4 x i64> %A, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
ret <4 x i64> %B
}
define <4 x double> @perm_cl_fp_4x64(<4 x double> %A) nounwind readnone {
+; CHECK-LABEL: perm_cl_fp_4x64:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; CHECK-NEXT: retq
entry:
-; CHECK: perm_cl_fp_4x64
-; CHECK: vpermpd
%B = shufflevector <4 x double> %A, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
ret <4 x double> %B
}
diff --git a/test/CodeGen/X86/avx512-any_extend_load.ll b/test/CodeGen/X86/avx512-any_extend_load.ll
new file mode 100644
index 000000000000..b4336a86f6b4
--- /dev/null
+++ b/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+
+
+define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
+; ALL-LABEL: any_extend_load_v8i64:
+; ALL: # BB#0:
+; ALL-NEXT: vpmovzxbq (%rdi), %zmm0
+; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; ALL-NEXT: vpmovqb %zmm0, (%rdi)
+; ALL-NEXT: retq
+ %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
+ %1 = zext <8 x i8> %wide.load to <8 x i64>
+ %2 = add nuw nsw <8 x i64> %1, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
+ %3 = xor <8 x i64> %2, zeroinitializer
+ %4 = trunc <8 x i64> %3 to <8 x i8>
+ store <8 x i8> %4, <8 x i8>* %ptr, align 1
+ ret void
+}
+
+define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
+; KNL-LABEL: any_extend_load_v8i32:
+; KNL: # BB#0:
+; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: any_extend_load_v8i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovzxbd (%rdi), %ymm0
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; SKX-NEXT: vpmovdb %ymm0, (%rdi)
+; SKX-NEXT: retq
+ %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
+ %1 = zext <8 x i8> %wide.load to <8 x i32>
+ %2 = add nuw nsw <8 x i32> %1, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ %3 = xor <8 x i32> %2, zeroinitializer
+ %4 = trunc <8 x i32> %3 to <8 x i8>
+ store <8 x i8> %4, <8 x i8>* %ptr, align 1
+ ret void
+}
+
+define void @any_extend_load_v8i16(<8 x i8> * %ptr) {
+; KNL-LABEL: any_extend_load_v8i16:
+; KNL: # BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: any_extend_load_v8i16:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovzxbw (%rdi), %xmm0
+; SKX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT: vpmovwb %xmm0, (%rdi)
+; SKX-NEXT: retq
+ %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
+ %1 = zext <8 x i8> %wide.load to <8 x i16>
+ %2 = add nuw nsw <8 x i16> %1, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+ %3 = xor <8 x i16> %2, zeroinitializer
+ %4 = trunc <8 x i16> %3 to <8 x i8>
+ store <8 x i8> %4, <8 x i8>* %ptr, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 9220e4f269cd..62dece137cc0 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -94,10 +94,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512F-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
@@ -107,10 +107,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512VL-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512VL-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512VL-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512VL-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
@@ -120,10 +120,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512BW-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
@@ -140,6 +140,128 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
ret <8 x i64>%z
}
+define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
+; AVX512F-LABEL: imulq256:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512F-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: imulq256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512VL-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: imulq256:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512BW-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512BW-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: imulq256:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512DQ-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; SKX-LABEL: imulq256:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0
+; SKX-NEXT: retq
+ %z = mul <4 x i64>%x, %y
+ ret <4 x i64>%z
+}
+
+define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
+; AVX512F-LABEL: imulq128:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
+; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512F-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: imulq128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
+; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX512VL-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: imulq128:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: imulq128:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
+; AVX512DQ-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX512DQ-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX512DQ-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX512DQ-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; SKX-LABEL: imulq128:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0
+; SKX-NEXT: retq
+ %z = mul <2 x i64>%x, %y
+ ret <2 x i64>%z
+}
+
define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: mulpd512:
; CHECK: ## BB#0: ## %entry
@@ -553,6 +675,7 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
; AVX512F-LABEL: test_mask_vminpd:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512F-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -560,13 +683,14 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
;
; AVX512VL-LABEL: test_mask_vminpd:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxord %ymm4, %ymm4, %ymm4
; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; AVX512VL-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_mask_vminpd:
; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512BW-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -574,6 +698,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
;
; AVX512DQ-LABEL: test_mask_vminpd:
; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512DQ-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -581,7 +706,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
;
; SKX-LABEL: test_mask_vminpd:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; SKX-NEXT: vpxord %ymm4, %ymm4, %ymm4
; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -613,6 +738,7 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
; AVX512F-LABEL: test_mask_vmaxpd:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512F-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -620,13 +746,14 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
;
; AVX512VL-LABEL: test_mask_vmaxpd:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxord %ymm4, %ymm4, %ymm4
; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; AVX512VL-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_mask_vmaxpd:
; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -634,6 +761,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
;
; AVX512DQ-LABEL: test_mask_vmaxpd:
; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512DQ-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -641,7 +769,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
;
; SKX-LABEL: test_mask_vmaxpd:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; SKX-NEXT: vpxord %ymm4, %ymm4, %ymm4
; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-bugfix-23634.ll b/test/CodeGen/X86/avx512-bugfix-23634.ll
index c31a13ad3114..0dcfb7c169f3 100644
--- a/test/CodeGen/X86/avx512-bugfix-23634.ll
+++ b/test/CodeGen/X86/avx512-bugfix-23634.ll
@@ -1,13 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-; CHECK-LABEL: f_fu
-; CHECK-NOT: vpblend
-; CHECK: vmovdqa32 {{.*}} {%k1}
-
define void @f_fu(float* %ret, float* %aa, float %b) {
+; CHECK-LABEL: f_fu:
+; CHECK: ## BB#0: ## %allocas
+; CHECK-NEXT: vcvttss2si %xmm0, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %zmm0
+; CHECK-NEXT: vcvttps2dq (%rsi), %zmm1
+; CHECK-NEXT: vpsrld $31, %zmm0, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm2
+; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2
+; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpblendmd {{.*}}(%rip), %zmm1, %zmm1 {%k1}
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT: vmovups %zmm0, (%rdi)
+; CHECK-NEXT: retq
allocas:
%ptr_cast_for_load = bitcast float* %aa to <16 x float>*
%ptr_masked_load.39 = load <16 x float>, <16 x float>* %ptr_cast_for_load, align 4
@@ -23,13 +36,13 @@ allocas:
%v1.i = select <16 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>, <16 x i32> %a_load_to_int32
- %foo_test = add <16 x i32> %div_v019_load_, %b_load_to_int32_broadcast
+ %foo_test = add <16 x i32> %div_v019_load_, %b_load_to_int32_broadcast
- %add_struct_offset_y_struct_offset33_x = add <16 x i32> %foo_test, %v1.i
+ %add_struct_offset_y_struct_offset33_x = add <16 x i32> %foo_test, %v1.i
%val = sitofp <16 x i32> %add_struct_offset_y_struct_offset33_x to <16 x float>
%ptrcast = bitcast float* %ret to <16 x float>*
store <16 x float> %val, <16 x float>* %ptrcast, align 4
ret void
-} \ No newline at end of file
+}
diff --git a/test/CodeGen/X86/avx512-bugfix-26264.ll b/test/CodeGen/X86/avx512-bugfix-26264.ll
new file mode 100644
index 000000000000..b3e1b17076bb
--- /dev/null
+++ b/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw < %s | FileCheck %s --check-prefix=AVX512BW
+
+define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
+; AVX512BW-LABEL: test_load_32f64:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
+; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovaps %zmm2, %zmm1
+; AVX512BW-NEXT: vmovaps %zmm3, %zmm2
+; AVX512BW-NEXT: vmovaps %zmm4, %zmm3
+; AVX512BW-NEXT: retq
+ %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
+ ret <32 x double> %res
+}
+
+define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64> %src0) {
+; AVX512BW-LABEL: test_load_32i64:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
+; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovaps %zmm2, %zmm1
+; AVX512BW-NEXT: vmovaps %zmm3, %zmm2
+; AVX512BW-NEXT: vmovaps %zmm4, %zmm3
+; AVX512BW-NEXT: retq
+ %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
+ ret <32 x i64> %res
+}
+
+declare <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0)
+declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index 0f89aa71162e..980b87187d98 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -4,7 +4,8 @@
define <16 x i32> @test2(<16 x i32> %x) {
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
ret <16 x i32>%res
@@ -15,8 +16,8 @@ define <16 x float> @test3(<4 x float> %a) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vmovss %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index a61aeba5aff9..fce592a5318b 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -1,13 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX
-; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL_X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=KNL_X32
define <16 x i1> @test1() {
-; ALL_X64-LABEL: test1:
-; ALL_X64: ## BB#0:
-; ALL_X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; ALL_X64-NEXT: retq
+; KNL-LABEL: test1:
+; KNL: ## BB#0:
+; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; SKX-NEXT: retq
;
; KNL_X32-LABEL: test1:
; KNL_X32: ## BB#0:
@@ -25,7 +30,8 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -47,7 +53,8 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpbroadcastd LCPI1_0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <16 x i1>%a, %b
@@ -63,7 +70,8 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -86,8 +94,8 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL_X32-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpbroadcastd LCPI2_1, %zmm0
-; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <8 x i1>%a, %b
@@ -102,11 +110,10 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {
;
; SKX-LABEL: test4:
; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k0
-; SKX-NEXT: vpslld $31, %xmm1, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k0 {%k1}
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
;
@@ -128,6 +135,7 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: callq _func8xi1
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-NEXT: vpslld $31, %ymm0, %ymm0
@@ -143,7 +151,7 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: callq _func8xi1
-; SKX-NEXT: vpmovzxwd %xmm0, %ymm0
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: vpslld $31, %ymm0, %ymm0
; SKX-NEXT: vpsrad $31, %ymm0, %ymm0
; SKX-NEXT: popq %rax
@@ -156,7 +164,8 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: calll L_func8xi1$stub
+; KNL_X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_X32-NEXT: calll _func8xi1
; KNL_X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_X32-NEXT: vpslld $31, %ymm0, %ymm0
; KNL_X32-NEXT: vpsrad $31, %ymm0, %ymm0
@@ -177,10 +186,11 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: Ltmp1:
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: callq _func16xi1
-; KNL-NEXT: vpmovzxbd %xmm0, %zmm0
+; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vpsrad $31, %zmm0, %zmm0
; KNL-NEXT: popq %rax
@@ -194,7 +204,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: callq _func16xi1
-; SKX-NEXT: vpmovzxbd %xmm0, %zmm0
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: vpslld $31, %zmm0, %zmm0
; SKX-NEXT: vpsrad $31, %zmm0, %zmm0
; SKX-NEXT: popq %rax
@@ -206,10 +216,11 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL_X32-NEXT: Ltmp1:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; KNL_X32-NEXT: vpbroadcastd LCPI5_0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
-; KNL_X32-NEXT: calll L_func16xi1$stub
-; KNL_X32-NEXT: vpmovzxbd %xmm0, %zmm0
+; KNL_X32-NEXT: calll _func16xi1
+; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_X32-NEXT: vpsrad $31, %zmm0, %zmm0
; KNL_X32-NEXT: addl $12, %esp
@@ -254,7 +265,7 @@ define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) {
; KNL_X32-NEXT: Ltmp2:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; KNL_X32-NEXT: calll L_func4xi1$stub
+; KNL_X32-NEXT: calll _func4xi1
; KNL_X32-NEXT: vpslld $31, %xmm0, %xmm0
; KNL_X32-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL_X32-NEXT: addl $12, %esp
@@ -273,14 +284,15 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: callq _func8xi1
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: movb $85, %al
-; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
@@ -309,15 +321,15 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: calll L_func8xi1$stub
+; KNL_X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_X32-NEXT: calll _func8xi1
; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL_X32-NEXT: vpsllvq LCPI7_0, %zmm0, %zmm0
; KNL_X32-NEXT: movb $85, %al
-; KNL_X32-NEXT: movzbl %al, %eax
; KNL_X32-NEXT: kmovw %eax, %k1
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL_X32-NEXT: vpbroadcastd LCPI7_1, %zmm0
-; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: addl $12, %esp
; KNL_X32-NEXT: retl
@@ -328,14 +340,23 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
}
define <16 x i8> @test8(<16 x i8> %a1, <16 x i8> %a2, i1 %cond) {
-; ALL_X64-LABEL: test8:
-; ALL_X64: ## BB#0:
-; ALL_X64-NEXT: testb $1, %dil
-; ALL_X64-NEXT: jne LBB8_2
-; ALL_X64-NEXT: ## BB#1:
-; ALL_X64-NEXT: vmovaps %zmm1, %zmm0
-; ALL_X64-NEXT: LBB8_2:
-; ALL_X64-NEXT: retq
+; KNL-LABEL: test8:
+; KNL: ## BB#0:
+; KNL-NEXT: testb $1, %dil
+; KNL-NEXT: jne LBB8_2
+; KNL-NEXT: ## BB#1:
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: LBB8_2:
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test8:
+; SKX: ## BB#0:
+; SKX-NEXT: testb $1, %dil
+; SKX-NEXT: jne LBB8_2
+; SKX-NEXT: ## BB#1:
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: LBB8_2:
+; SKX-NEXT: retq
;
; KNL_X32-LABEL: test8:
; KNL_X32: ## BB#0:
@@ -358,7 +379,7 @@ define i1 @test9(double %a, double %b) {
;
; KNL_X32-LABEL: test9:
; KNL_X32: ## BB#0:
-; KNL_X32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0
+; KNL_X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; KNL_X32-NEXT: vucomisd {{[0-9]+}}(%esp), %xmm0
; KNL_X32-NEXT: setb %al
; KNL_X32-NEXT: retl
@@ -464,7 +485,7 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; KNL_X32-NEXT: movl %edi, (%esp)
; KNL_X32-NEXT: calll _test11
-; KNL_X32-NEXT: movb %al, %bl
+; KNL_X32-NEXT: movl %eax, %ebx
; KNL_X32-NEXT: movzbl %bl, %eax
; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; KNL_X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index 6e0d18558c51..fceb9c14b7df 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -1,8 +1,19 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
-; CHECK-LABEL: test1
-; CHECK: vucomisd {{.*}}encoding: [0x62
define double @test1(double %a, double %b) nounwind {
+; ALL-LABEL: test1:
+; ALL: ## BB#0:
+; ALL-NEXT: vucomisd %xmm1, %xmm0
+; ALL-NEXT: jne LBB0_1
+; ALL-NEXT: jnp LBB0_2
+; ALL-NEXT: LBB0_1: ## %l1
+; ALL-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
+; ALL-NEXT: LBB0_2: ## %l2
+; ALL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%tobool = fcmp une double %a, %b
br i1 %tobool, label %l1, label %l2
@@ -14,9 +25,17 @@ l2:
ret double %c1
}
-; CHECK-LABEL: test2
-; CHECK: vucomiss {{.*}}encoding: [0x62
define float @test2(float %a, float %b) nounwind {
+; ALL-LABEL: test2:
+; ALL: ## BB#0:
+; ALL-NEXT: vucomiss %xmm0, %xmm1
+; ALL-NEXT: jbe LBB1_2
+; ALL-NEXT: ## BB#1: ## %l1
+; ALL-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
+; ALL-NEXT: LBB1_2: ## %l2
+; ALL-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%tobool = fcmp olt float %a, %b
br i1 %tobool, label %l1, label %l2
@@ -29,18 +48,35 @@ l2:
}
; FIXME: Can use vcmpeqss and extract from the mask here in AVX512.
-; CHECK-LABEL: test3
-; CHECK: vucomiss {{.*}}encoding: [0x62
define i32 @test3(float %a, float %b) {
+; ALL-LABEL: test3:
+; ALL: ## BB#0:
+; ALL-NEXT: vucomiss %xmm1, %xmm0
+; ALL-NEXT: setnp %al
+; ALL-NEXT: sete %cl
+; ALL-NEXT: andb %al, %cl
+; ALL-NEXT: movzbl %cl, %eax
+; ALL-NEXT: retq
%cmp10.i = fcmp oeq float %a, %b
%conv11.i = zext i1 %cmp10.i to i32
ret i32 %conv11.i
}
-; CHECK-LABEL: test5
-; CHECK: ret
define float @test5(float %p) #0 {
+; ALL-LABEL: test5:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vucomiss %xmm1, %xmm0
+; ALL-NEXT: jne LBB3_1
+; ALL-NEXT: jnp LBB3_2
+; ALL-NEXT: LBB3_1: ## %if.end
+; ALL-NEXT: seta %al
+; ALL-NEXT: movzbl %al, %eax
+; ALL-NEXT: leaq {{.*}}(%rip), %rcx
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: LBB3_2: ## %return
+; ALL-NEXT: retq
entry:
%cmp = fcmp oeq float %p, 0.000000e+00
br i1 %cmp, label %return, label %if.end
@@ -55,21 +91,25 @@ return: ; preds = %if.end, %entry
ret float %retval.0
}
-; CHECK-LABEL: test6
-; CHECK: cmpl
-; CHECK-NOT: kmov
-; CHECK: ret
define i32 @test6(i32 %a, i32 %b) {
+; ALL-LABEL: test6:
+; ALL: ## BB#0:
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: sete %al
+; ALL-NEXT: retq
%cmp = icmp eq i32 %a, %b
%res = zext i1 %cmp to i32
ret i32 %res
}
-; CHECK-LABEL: test7
-; CHECK: vucomisd
-; CHECK-NOT: kmov
-; CHECK: ret
define i32 @test7(double %x, double %y) #2 {
+; ALL-LABEL: test7:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: vucomisd %xmm1, %xmm0
+; ALL-NEXT: setne %al
+; ALL-NEXT: retq
entry:
%0 = fcmp one double %x, %y
%or = zext i1 %0 to i32
@@ -77,6 +117,16 @@ entry:
}
define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
+; ALL-LABEL: test8:
+; ALL: ## BB#0:
+; ALL-NEXT: testl %edx, %edx
+; ALL-NEXT: movl $1, %eax
+; ALL-NEXT: cmovel %eax, %edx
+; ALL-NEXT: cmpl $-2147483648, %esi ## imm = 0x80000000
+; ALL-NEXT: cmovnel %edx, %eax
+; ALL-NEXT: cmpl $-1, %edi
+; ALL-NEXT: cmovnel %edx, %eax
+; ALL-NEXT: retq
%tmp1 = icmp eq i32 %a1, -1
%tmp2 = icmp eq i32 %a2, -2147483648
%tmp3 = and i1 %tmp1, %tmp2
@@ -86,11 +136,17 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
ret i32 %res
}
-; CHECK-LABEL: test9
-; CHECK: testb
-; CHECK-NOT: kmov
-; CHECK: ret
define i32 @test9(i64 %a) {
+; ALL-LABEL: test9:
+; ALL: ## BB#0:
+; ALL-NEXT: testb $1, %dil
+; ALL-NEXT: jne LBB7_2
+; ALL-NEXT: ## BB#1: ## %A
+; ALL-NEXT: movl $6, %eax
+; ALL-NEXT: retq
+; ALL-NEXT: LBB7_2: ## %B
+; ALL-NEXT: movl $7, %eax
+; ALL-NEXT: retq
%b = and i64 %a, 1
%cmp10.i = icmp eq i64 %b, 0
br i1 %cmp10.i, label %A, label %B
@@ -99,3 +155,35 @@ A:
B:
ret i32 7
}
+
+define i32 @test10(i64 %b, i64 %c, i1 %d) {
+; ALL-LABEL: test10:
+; ALL: ## BB#0:
+; ALL-NEXT: andl $1, %edx
+; ALL-NEXT: kmovw %edx, %k0
+; ALL-NEXT: cmpq %rsi, %rdi
+; ALL-NEXT: sete %al
+; ALL-NEXT: kmovw %eax, %k1
+; ALL-NEXT: korw %k1, %k0, %k1
+; ALL-NEXT: kxorw %k1, %k0, %k0
+; ALL-NEXT: kmovw %k0, %eax
+; ALL-NEXT: testb %al, %al
+; ALL-NEXT: je LBB8_1
+; ALL-NEXT: ## BB#2: ## %if.end.i
+; ALL-NEXT: movl $6, %eax
+; ALL-NEXT: retq
+; ALL-NEXT: LBB8_1: ## %if.then.i
+; ALL-NEXT: movl $5, %eax
+; ALL-NEXT: retq
+
+ %cmp8.i = icmp eq i64 %b, %c
+ %or1 = or i1 %d, %cmp8.i
+ %xor1 = xor i1 %d, %or1
+ br i1 %xor1, label %if.end.i, label %if.then.i
+
+if.then.i:
+ ret i32 5
+
+if.end.i:
+ ret i32 6
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 586a29545014..914f859927be 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,228 +1,511 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
-; CHECK-LABEL: sitof32
-; CHECK: vcvtdq2ps %zmm
-; CHECK: ret
define <16 x float> @sitof32(<16 x i32> %a) nounwind {
+; ALL-LABEL: sitof32:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%b = sitofp <16 x i32> %a to <16 x float>
ret <16 x float> %b
}
-; CHECK-LABEL: sltof864
-; CHECK: vcvtqq2pd
define <8 x double> @sltof864(<8 x i64> %a) {
+; KNL-LABEL: sltof864:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; KNL-NEXT: vpextrq $1, %xmm1, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vmovq %xmm1, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; KNL-NEXT: vpextrq $1, %xmm2, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm2, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; KNL-NEXT: vpextrq $1, %xmm2, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm2, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof864:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0
+; SKX-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
-; CHECK-LABEL: sltof464
-; CHECK: vcvtqq2pd
define <4 x double> @sltof464(<4 x i64> %a) {
+; KNL-LABEL: sltof464:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpextrq $1, %xmm1, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vmovq %xmm1, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof464:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtqq2pd %ymm0, %ymm0
+; SKX-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %b
}
-; CHECK-LABEL: sltof2f32
-; CHECK: vcvtqq2ps
define <2 x float> @sltof2f32(<2 x i64> %a) {
+; KNL-LABEL: sltof2f32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof2f32:
+; SKX: ## BB#0:
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0
+; SKX-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
ret <2 x float>%b
}
-; CHECK-LABEL: sltof4f32_mem
-; CHECK: vcvtqq2psy (%rdi)
define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
+; KNL-LABEL: sltof4f32_mem:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof4f32_mem:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtqq2psy (%rdi), %xmm0
+; SKX-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
}
-; CHECK-LABEL: f64tosl
-; CHECK: vcvttpd2qq
define <4 x i64> @f64tosl(<4 x double> %a) {
+; KNL-LABEL: f64tosl:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; KNL-NEXT: vcvttsd2si %xmm1, %rax
+; KNL-NEXT: vmovq %rax, %xmm2
+; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; KNL-NEXT: vcvttsd2si %xmm1, %rax
+; KNL-NEXT: vmovq %rax, %xmm1
+; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; KNL-NEXT: vcvttsd2si %xmm0, %rax
+; KNL-NEXT: vmovq %rax, %xmm2
+; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; KNL-NEXT: vcvttsd2si %xmm0, %rax
+; KNL-NEXT: vmovq %rax, %xmm0
+; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: f64tosl:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttpd2qq %ymm0, %ymm0
+; SKX-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i64>
ret <4 x i64> %b
}
-; CHECK-LABEL: f32tosl
-; CHECK: vcvttps2qq
define <4 x i64> @f32tosl(<4 x float> %a) {
+; KNL-LABEL: f32tosl:
+; KNL: ## BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; KNL-NEXT: vcvttss2si %xmm1, %rax
+; KNL-NEXT: vmovq %rax, %xmm1
+; KNL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; KNL-NEXT: vcvttss2si %xmm2, %rax
+; KNL-NEXT: vmovq %rax, %xmm2
+; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; KNL-NEXT: vcvttss2si %xmm0, %rax
+; KNL-NEXT: vmovq %rax, %xmm2
+; KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vcvttss2si %xmm0, %rax
+; KNL-NEXT: vmovq %rax, %xmm0
+; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: f32tosl:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttps2qq %xmm0, %ymm0
+; SKX-NEXT: retq
%b = fptosi <4 x float> %a to <4 x i64>
ret <4 x i64> %b
}
-; CHECK-LABEL: sltof432
-; CHECK: vcvtqq2ps
define <4 x float> @sltof432(<4 x i64> %a) {
+; KNL-LABEL: sltof432:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof432:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0
+; SKX-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
-; CHECK-LABEL: ultof432
-; CHECK: vcvtuqq2ps
define <4 x float> @ultof432(<4 x i64> %a) {
+; KNL-LABEL: ultof432:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ultof432:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; SKX-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
-; CHECK-LABEL: ultof64
-; CHECK: vcvtuqq2pd
define <8 x double> @ultof64(<8 x i64> %a) {
+; KNL-LABEL: ultof64:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; KNL-NEXT: vpextrq $1, %xmm1, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vmovq %xmm1, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm1
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; KNL-NEXT: vpextrq $1, %xmm2, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm2, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; KNL-NEXT: vpextrq $1, %xmm2, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm2, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm0
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ultof64:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0
+; SKX-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
-; CHECK-LABEL: fptosi00
-; CHECK: vcvttps2dq %zmm
-; CHECK: ret
define <16 x i32> @fptosi00(<16 x float> %a) nounwind {
+; ALL-LABEL: fptosi00:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: retq
%b = fptosi <16 x float> %a to <16 x i32>
ret <16 x i32> %b
}
-; CHECK-LABEL: fptoui00
-; CHECK: vcvttps2udq
-; CHECK: ret
define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
+; ALL-LABEL: fptoui00:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttps2udq %zmm0, %zmm0
+; ALL-NEXT: retq
%b = fptoui <16 x float> %a to <16 x i32>
ret <16 x i32> %b
}
-; CHECK-LABEL: fptoui_256
-; CHECK: vcvttps2udq
-; CHECK: ret
define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
+; KNL-LABEL: fptoui_256:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptoui_256:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttps2udq %ymm0, %ymm0
+; SKX-NEXT: retq
%b = fptoui <8 x float> %a to <8 x i32>
ret <8 x i32> %b
}
-; CHECK-LABEL: fptoui_128
-; CHECK: vcvttps2udq
-; CHECK: ret
define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
+; KNL-LABEL: fptoui_128:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptoui_128:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttps2udq %xmm0, %xmm0
+; SKX-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
-; CHECK-LABEL: fptoui01
-; CHECK: vcvttpd2udq
-; CHECK: ret
define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
+; ALL-LABEL: fptoui01:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; ALL-NEXT: retq
%b = fptoui <8 x double> %a to <8 x i32>
ret <8 x i32> %b
}
-; CHECK-LABEL: sitof64
-; CHECK: vcvtdq2pd %ymm
-; CHECK: ret
+define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
+; KNL-LABEL: fptoui_256d:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptoui_256d:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttpd2udq %ymm0, %xmm0
+; SKX-NEXT: retq
+ %b = fptoui <4 x double> %a to <4 x i32>
+ ret <4 x i32> %b
+}
+
define <8 x double> @sitof64(<8 x i32> %a) {
+; ALL-LABEL: sitof64:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%b = sitofp <8 x i32> %a to <8 x double>
ret <8 x double> %b
}
-; CHECK-LABEL: fptosi01
-; CHECK: vcvttpd2dq %zmm
-; CHECK: ret
define <8 x i32> @fptosi01(<8 x double> %a) {
+; ALL-LABEL: fptosi01:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; ALL-NEXT: retq
%b = fptosi <8 x double> %a to <8 x i32>
ret <8 x i32> %b
}
-; CHECK-LABEL: fptosi03
-; CHECK: vcvttpd2dq %ymm
-; CHECK: ret
define <4 x i32> @fptosi03(<4 x double> %a) {
+; KNL-LABEL: fptosi03:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptosi03:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; SKX-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
-; CHECK-LABEL: fptrunc00
-; CHECK: vcvtpd2ps %zmm
-; CHECK-NEXT: vcvtpd2ps %zmm
-; CHECK-NEXT: vinsertf
-; CHECK: ret
define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
+; KNL-LABEL: fptrunc00:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2ps %zmm0, %ymm0
+; KNL-NEXT: vcvtpd2ps %zmm1, %ymm1
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptrunc00:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0
+; SKX-NEXT: vcvtpd2ps %zmm1, %ymm1
+; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT: retq
%a = fptrunc <16 x double> %b to <16 x float>
ret <16 x float> %a
}
-; CHECK-LABEL: fptrunc01
-; CHECK: vcvtpd2ps %ymm
define <4 x float> @fptrunc01(<4 x double> %b) {
+; KNL-LABEL: fptrunc01:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2psy %ymm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptrunc01:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0
+; SKX-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
ret <4 x float> %a
}
-; CHECK-LABEL: fptrunc02
-; CHECK: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
+; KNL-LABEL: fptrunc02:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL-NEXT: vcvtpd2psy %ymm0, %xmm0
+; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptrunc02:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
}
-; CHECK-LABEL: fpext00
-; CHECK: vcvtps2pd %ymm0, %zmm0
-; CHECK: ret
define <8 x double> @fpext00(<8 x float> %b) nounwind {
+; ALL-LABEL: fpext00:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%a = fpext <8 x float> %b to <8 x double>
ret <8 x double> %a
}
-; CHECK-LABEL: fpext01
-; CHECK: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
-; CHECK: ret
define <4 x double> @fpext01(<4 x float> %b, <4 x double>%b1, <4 x double>%a1) {
+; KNL-LABEL: fpext01:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtps2pd %xmm0, %ymm0
+; KNL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; KNL-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fpext01:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd %ymm2, %ymm1, %k1
+; SKX-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = fpext <4 x float> %b to <4 x double>
%mask = fcmp ogt <4 x double>%a1, %b1
%c = select <4 x i1>%mask, <4 x double>%a, <4 x double>zeroinitializer
ret <4 x double> %c
}
-; CHECK-LABEL: funcA
-; CHECK: vcvtsi2sdq (%rdi){{.*}} encoding: [0x62
-; CHECK: ret
define double @funcA(i64* nocapture %e) {
+; ALL-LABEL: funcA:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; ALL-NEXT: retq
entry:
%tmp1 = load i64, i64* %e, align 8
%conv = sitofp i64 %tmp1 to double
ret double %conv
}
-; CHECK-LABEL: funcB
-; CHECK: vcvtsi2sdl (%{{.*}} encoding: [0x62
-; CHECK: ret
define double @funcB(i32* %e) {
+; ALL-LABEL: funcB:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; ALL-NEXT: retq
entry:
%tmp1 = load i32, i32* %e, align 4
%conv = sitofp i32 %tmp1 to double
ret double %conv
}
-; CHECK-LABEL: funcC
-; CHECK: vcvtsi2ssl (%{{.*}} encoding: [0x62
-; CHECK: ret
define float @funcC(i32* %e) {
+; ALL-LABEL: funcC:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; ALL-NEXT: retq
entry:
%tmp1 = load i32, i32* %e, align 4
%conv = sitofp i32 %tmp1 to float
ret float %conv
}
-; CHECK-LABEL: i64tof32
-; CHECK: vcvtsi2ssq (%{{.*}} encoding: [0x62
-; CHECK: ret
define float @i64tof32(i64* %e) {
+; ALL-LABEL: i64tof32:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; ALL-NEXT: retq
entry:
%tmp1 = load i64, i64* %e, align 8
%conv = sitofp i64 %tmp1 to float
ret float %conv
}
-; CHECK-LABEL: fpext
-; CHECK: vcvtss2sd {{.*}} encoding: [0x62
-; CHECK: ret
define void @fpext() {
+; ALL-LABEL: fpext:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: retq
entry:
%f = alloca float, align 4
%d = alloca double, align 8
@@ -232,12 +515,13 @@ entry:
ret void
}
-; CHECK-LABEL: fpround_scalar
-; CHECK: vmovsd {{.*}} encoding: [0x62
-; CHECK: vcvtsd2ss {{.*}} encoding: [0x62
-; CHECK: vmovss {{.*}} encoding: [0x62
-; CHECK: ret
define void @fpround_scalar() nounwind uwtable {
+; ALL-LABEL: fpround_scalar:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: retq
entry:
%f = alloca float, align 4
%d = alloca double, align 8
@@ -247,179 +531,258 @@ entry:
ret void
}
-; CHECK-LABEL: long_to_double
-; CHECK: vmovq {{.*}} encoding: [0x62
-; CHECK: ret
define double @long_to_double(i64 %x) {
+; ALL-LABEL: long_to_double:
+; ALL: ## BB#0:
+; ALL-NEXT: vmovq %rdi, %xmm0
+; ALL-NEXT: retq
%res = bitcast i64 %x to double
ret double %res
}
-; CHECK-LABEL: double_to_long
-; CHECK: vmovq {{.*}} encoding: [0x62
-; CHECK: ret
define i64 @double_to_long(double %x) {
+; ALL-LABEL: double_to_long:
+; ALL: ## BB#0:
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: retq
%res = bitcast double %x to i64
ret i64 %res
}
-; CHECK-LABEL: int_to_float
-; CHECK: vmovd {{.*}} encoding: [0x62
-; CHECK: ret
define float @int_to_float(i32 %x) {
+; ALL-LABEL: int_to_float:
+; ALL: ## BB#0:
+; ALL-NEXT: vmovd %edi, %xmm0
+; ALL-NEXT: retq
%res = bitcast i32 %x to float
ret float %res
}
-; CHECK-LABEL: float_to_int
-; CHECK: vmovd {{.*}} encoding: [0x62
-; CHECK: ret
define i32 @float_to_int(float %x) {
+; ALL-LABEL: float_to_int:
+; ALL: ## BB#0:
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: retq
%res = bitcast float %x to i32
ret i32 %res
}
define <16 x double> @uitof64(<16 x i32> %a) nounwind {
-; CHECK-LABEL: uitof64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm2
-; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm0
-; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm1
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: uitof64:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtudq2pd %ymm0, %zmm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vcvtudq2pd %ymm0, %zmm1
+; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: uitof64:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtudq2pd %ymm0, %zmm2
+; SKX-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX-NEXT: vcvtudq2pd %ymm0, %zmm1
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x double>
ret <16 x double> %b
}
-; CHECK-LABEL: uitof64_256
-; CHECK: vcvtudq2pd
-; CHECK: ret
define <4 x double> @uitof64_256(<4 x i32> %a) nounwind {
+; KNL-LABEL: uitof64_256:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: uitof64_256:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0
+; SKX-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x double>
ret <4 x double> %b
}
-; CHECK-LABEL: uitof32
-; CHECK: vcvtudq2ps
-; CHECK: ret
define <16 x float> @uitof32(<16 x i32> %a) nounwind {
+; ALL-LABEL: uitof32:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x float>
ret <16 x float> %b
}
-; CHECK-LABEL: uitof32_256
-; CHECK: vcvtudq2ps
-; CHECK: ret
define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
+; KNL-LABEL: uitof32_256:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: uitof32_256:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0
+; SKX-NEXT: retq
%b = uitofp <8 x i32> %a to <8 x float>
ret <8 x float> %b
}
-; CHECK-LABEL: uitof32_128
-; CHECK: vcvtudq2ps
-; CHECK: ret
define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
+; KNL-LABEL: uitof32_128:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: uitof32_128:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0
+; SKX-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
-; CHECK-LABEL: @fptosi02
-; CHECK: vcvttss2si {{.*}} encoding: [0x62
-; CHECK: ret
define i32 @fptosi02(float %a) nounwind {
+; ALL-LABEL: fptosi02:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttss2si %xmm0, %eax
+; ALL-NEXT: retq
%b = fptosi float %a to i32
ret i32 %b
}
-; CHECK-LABEL: @fptoui02
-; CHECK: vcvttss2usi {{.*}} encoding: [0x62
-; CHECK: ret
define i32 @fptoui02(float %a) nounwind {
+; ALL-LABEL: fptoui02:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttss2usi %xmm0, %eax
+; ALL-NEXT: retq
%b = fptoui float %a to i32
ret i32 %b
}
-; CHECK-LABEL: @uitofp02
-; CHECK: vcvtusi2ss
-; CHECK: ret
define float @uitofp02(i32 %a) nounwind {
+; ALL-LABEL: uitofp02:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
+; ALL-NEXT: retq
%b = uitofp i32 %a to float
ret float %b
}
-; CHECK-LABEL: @uitofp03
-; CHECK: vcvtusi2sd
-; CHECK: ret
define double @uitofp03(i32 %a) nounwind {
+; ALL-LABEL: uitofp03:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
+; ALL-NEXT: retq
%b = uitofp i32 %a to double
ret double %b
}
-; CHECK-LABEL: @sitofp_16i1_float
-; CHECK: vpmovm2d
-; CHECK: vcvtdq2ps
define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
+; KNL-LABEL: sitofp_16i1_float:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sitofp_16i1_float:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; SKX-NEXT: vpmovm2d %k0, %zmm0
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0
+; SKX-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = sitofp <16 x i1> %mask to <16 x float>
ret <16 x float> %1
}
-; CHECK-LABEL: @sitofp_16i8_float
-; CHECK: vpmovsxbd
-; CHECK: vcvtdq2ps
define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
+; ALL-LABEL: sitofp_16i8_float:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%1 = sitofp <16 x i8> %a to <16 x float>
ret <16 x float> %1
}
-; CHECK-LABEL: @sitofp_16i16_float
-; CHECK: vpmovsxwd
-; CHECK: vcvtdq2ps
define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
+; ALL-LABEL: sitofp_16i16_float:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%1 = sitofp <16 x i16> %a to <16 x float>
ret <16 x float> %1
}
-; CHECK-LABEL: @sitofp_8i16_double
-; CHECK: vpmovsxwd
-; CHECK: vcvtdq2pd
define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
+; ALL-LABEL: sitofp_8i16_double:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%1 = sitofp <8 x i16> %a to <8 x double>
ret <8 x double> %1
}
-; CHECK-LABEL: sitofp_8i8_double
-; CHECK: vpmovzxwd
-; CHECK: vpslld
-; CHECK: vpsrad
-; CHECK: vcvtdq2pd
define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
+; ALL-LABEL: sitofp_8i8_double:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: vpslld $24, %ymm0, %ymm0
+; ALL-NEXT: vpsrad $24, %ymm0, %ymm0
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%1 = sitofp <8 x i8> %a to <8 x double>
ret <8 x double> %1
}
-
-; CHECK-LABEL: @sitofp_8i1_double
-; CHECK: vpmovm2d
-; CHECK: vcvtdq2pd
define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
+; KNL-LABEL: sitofp_8i1_double:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sitofp_8i1_double:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; SKX-NEXT: vpmovm2d %k0, %ymm0
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0
+; SKX-NEXT: retq
%cmpres = fcmp ogt <8 x double> %a, zeroinitializer
%1 = sitofp <8 x i1> %cmpres to <8 x double>
ret <8 x double> %1
}
-; CHECK-LABEL: @uitofp_16i8
-; CHECK: vpmovzxbd
-; CHECK: vcvtudq2ps
define <16 x float> @uitofp_16i8(<16 x i8>%a) {
+; ALL-LABEL: uitofp_16i8:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%b = uitofp <16 x i8> %a to <16 x float>
ret <16 x float>%b
}
-; CHECK-LABEL: @uitofp_16i16
-; CHECK: vpmovzxwd
-; CHECK: vcvtudq2ps
define <16 x float> @uitofp_16i16(<16 x i16>%a) {
+; ALL-LABEL: uitofp_16i16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%b = uitofp <16 x i16> %a to <16 x float>
ret <16 x float>%b
}
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index bc1509684475..faac7b20fd61 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -15,7 +15,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = zext <8 x i8> %a to <8 x i16>
@@ -59,7 +59,7 @@ define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbw (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; SKX-NEXT: retq
%a = load <16 x i8>,<16 x i8> *%i,align 1
%x = zext <16 x i8> %a to <16 x i16>
@@ -90,15 +90,10 @@ define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
}
define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
-; KNL-LABEL: zext_16x8_to_16x16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: retq
-;
-; SKX-LABEL: zext_16x8_to_16x16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpmovzxbw %xmm0, %ymm0
-; SKX-NEXT: retq
+; ALL-LABEL: zext_16x8_to_16x16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; ALL-NEXT: retq
%x = zext <16 x i8> %a to <16 x i16>
ret <16 x i16> %x
}
@@ -117,7 +112,7 @@ define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
-; SKX-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; SKX-NEXT: retq
%x = zext <16 x i8> %a to <16 x i16>
%ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
@@ -175,7 +170,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
-; SKX-NEXT: vpmovzxbw (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
; SKX-NEXT: retq
%a = load <32 x i8>,<32 x i8> *%i,align 1
%x = zext <32 x i8> %a to <32 x i16>
@@ -223,7 +218,7 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
;
; SKX-LABEL: zext_32x8_to_32x16:
; SKX: ## BB#0:
-; SKX-NEXT: vpmovzxbw %ymm0, %zmm0
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; SKX-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
ret <32 x i16> %x
@@ -250,7 +245,7 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1
-; SKX-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; SKX-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
@@ -314,8 +309,8 @@ define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; SKX-LABEL: zext_4x8mem_to_4x32:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
%x = zext <4 x i8> %a to <4 x i32>
@@ -335,7 +330,7 @@ define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; SKX-LABEL: sext_4x8mem_to_4x32:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
@@ -353,13 +348,14 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x8mem_to_8x32:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = zext <8 x i8> %a to <8 x i32>
@@ -376,6 +372,7 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovsxbd (%rdi), %ymm0
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x8mem_to_8x32:
@@ -396,14 +393,14 @@ define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxbd (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x8mem_to_16x32:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; SKX-NEXT: retq
%a = load <16 x i8>,<16 x i8> *%i,align 1
%x = zext <16 x i8> %a to <16 x i32>
@@ -438,14 +435,14 @@ define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x8_to_16x32_mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
-; SKX-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: retq
%x = zext <16 x i8> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
@@ -475,7 +472,7 @@ define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
; ALL-LABEL: zext_16x8_to_16x32:
; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxbd %xmm0, %zmm0
+; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; ALL-NEXT: retq
%x = zext <16 x i8> %i to <16 x i32>
ret <16 x i32> %x
@@ -503,8 +500,8 @@ define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind re
; SKX-LABEL: zext_2x8mem_to_2x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
%x = zext <2 x i8> %a to <2 x i64>
@@ -524,7 +521,7 @@ define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwin
; SKX-LABEL: sext_2x8mem_to_2x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
@@ -555,8 +552,8 @@ define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; SKX-LABEL: zext_4x8mem_to_4x64:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
%x = zext <4 x i8> %a to <4 x i64>
@@ -577,7 +574,7 @@ define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwin
; SKX-LABEL: sext_4x8mem_to_4x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
@@ -602,14 +599,14 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxbq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x8mem_to_8x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = zext <8 x i8> %a to <8 x i64>
@@ -660,8 +657,8 @@ define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind
; SKX-LABEL: zext_4x16mem_to_4x32:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = zext <4 x i16> %a to <4 x i32>
@@ -681,7 +678,7 @@ define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounw
; SKX-LABEL: sext_4x16mem_to_4x32mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
@@ -710,13 +707,14 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16mem_to_8x32:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SKX-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = zext <8 x i16> %a to <8 x i32>
@@ -733,6 +731,7 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw
; KNL-NEXT: vpmovsxwd (%rdi), %ymm0
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x16mem_to_8x32mask:
@@ -766,13 +765,14 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16_to_8x32mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
-; SKX-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: retq
%x = zext <8 x i16> %a to <8 x i32>
%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
@@ -780,15 +780,10 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
}
define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone {
-; KNL-LABEL: zext_8x16_to_8x32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: retq
-;
-; SKX-LABEL: zext_8x16_to_8x32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpmovzxwd %xmm0, %ymm0
-; SKX-NEXT: retq
+; ALL-LABEL: zext_8x16_to_8x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: retq
%x = zext <8 x i16> %a to <8 x i32>
ret <8 x i32> %x
}
@@ -799,14 +794,14 @@ define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) noun
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x16mem_to_16x32:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; SKX-NEXT: retq
%a = load <16 x i16>,<16 x i16> *%i,align 1
%x = zext <16 x i16> %a to <16 x i32>
@@ -850,14 +845,14 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x16_to_16x32mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
-; SKX-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; SKX-NEXT: retq
%x = zext <16 x i16> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
@@ -867,7 +862,7 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone {
; ALL-LABEL: zext_16x16_to_16x32:
; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxwd %ymm0, %zmm0
+; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT: retq
%x = zext <16 x i16> %a to <16 x i32>
ret <16 x i32> %x
@@ -886,8 +881,8 @@ define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind
; SKX-LABEL: zext_2x16mem_to_2x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SKX-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
%x = zext <2 x i16> %a to <2 x i64>
@@ -908,7 +903,7 @@ define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounw
; SKX-LABEL: sext_2x16mem_to_2x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
@@ -940,8 +935,8 @@ define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind
; SKX-LABEL: zext_4x16mem_to_4x64:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = zext <4 x i16> %a to <4 x i64>
@@ -962,7 +957,7 @@ define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounw
; SKX-LABEL: sext_4x16mem_to_4x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
@@ -987,14 +982,14 @@ define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16mem_to_8x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; SKX-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = zext <8 x i16> %a to <8 x i64>
@@ -1039,14 +1034,14 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16_to_8x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
-; SKX-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: retq
%x = zext <8 x i16> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
@@ -1056,7 +1051,7 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone {
; ALL-LABEL: zext_8x16_to_8x64:
; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxwq %xmm0, %zmm0
+; ALL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; ALL-NEXT: retq
%ret = zext <8 x i16> %a to <8 x i64>
ret <8 x i64> %ret
@@ -1075,8 +1070,8 @@ define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind
; SKX-LABEL: zext_2x32mem_to_2x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
-; SKX-NEXT: vpmovzxdq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero
; SKX-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
%x = zext <2 x i32> %a to <2 x i64>
@@ -1097,7 +1092,7 @@ define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounw
; SKX-LABEL: sext_2x32mem_to_2x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
@@ -1129,8 +1124,8 @@ define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind
; SKX-LABEL: zext_4x32mem_to_4x64:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxdq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SKX-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
%x = zext <4 x i32> %a to <4 x i64>
@@ -1151,7 +1146,7 @@ define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounw
; SKX-LABEL: sext_4x32mem_to_4x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
@@ -1192,8 +1187,8 @@ define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind
; SKX-LABEL: zext_4x32_to_4x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vpmovd2m %xmm1, %k1
-; SKX-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SKX-NEXT: retq
%x = zext <4 x i32> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
@@ -1206,14 +1201,14 @@ define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x32mem_to_8x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SKX-NEXT: retq
%a = load <8 x i32>,<8 x i32> *%i,align 1
%x = zext <8 x i32> %a to <8 x i64>
@@ -1267,14 +1262,14 @@ define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x32_to_8x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
-; SKX-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; SKX-NEXT: retq
%x = zext <8 x i32> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
@@ -1312,8 +1307,7 @@ define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
; KNL-LABEL: zext_8i1_to_8xi64:
; KNL: ## BB#0:
-; KNL-NEXT: movzbl %dil, %eax
-; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
@@ -1334,6 +1328,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_16i8_to_16i1:
@@ -1341,6 +1336,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
%mask_b = trunc <16 x i8>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
@@ -1348,19 +1344,13 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
}
define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
-; KNL-LABEL: trunc_16i32_to_16i1:
-; KNL: ## BB#0:
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_16i32_to_16i1:
-; SKX: ## BB#0:
-; SKX-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX-NEXT: vpmovd2m %zmm0, %k0
-; SKX-NEXT: kmovw %k0, %eax
-; SKX-NEXT: retq
+; ALL-LABEL: trunc_16i32_to_16i1:
+; ALL: ## BB#0:
+; ALL-NEXT: vpslld $31, %zmm0, %zmm0
+; ALL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; ALL-NEXT: kmovw %k0, %eax
+; ALL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: retq
%mask_b = trunc <16 x i32>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
ret i16 %mask
@@ -1377,10 +1367,9 @@ define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
; SKX-LABEL: trunc_4i32_to_4i1:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpslld $31, %xmm1, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1}
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
%mask_a = trunc <4 x i32>%a to <4 x i1>
@@ -1398,6 +1387,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_8i16_to_8i1:
@@ -1405,6 +1395,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%mask_b = trunc <8 x i16>%a to <8 x i1>
%mask = bitcast <8 x i1> %mask_b to i8
@@ -1414,9 +1405,12 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL-LABEL: sext_8i1_8i32:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; KNL-NEXT: knotw %k0, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: retq
;
@@ -1442,6 +1436,7 @@ define i16 @trunc_i32_to_i1(i32 %a) {
; ALL-NEXT: kmovw %eax, %k1
; ALL-NEXT: korw %k0, %k1, %k0
; ALL-NEXT: kmovw %k0, %eax
+; ALL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; ALL-NEXT: retq
%a_i = trunc i32 %a to i1
%maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
@@ -1454,6 +1449,7 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8i1_8i16:
@@ -1470,7 +1466,8 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
; KNL-LABEL: sext_16i1_16i32:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16i1_16i32:
@@ -1532,265 +1529,264 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %r15d
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r12d
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kmovw %k1, %edi
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kmovw %k1, %r10d
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kmovw %k1, %r11d
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kmovw %k1, %ebx
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k1, %ebp
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r12d
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %eax, %xmm4
+; KNL-NEXT: vmovd %r15d, %xmm4
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $14, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $15, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $2, %r12d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftlw $13, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $12, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftlw $11, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $10, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $6, %esi, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: kshiftlw $9, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $7, %edi, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $8, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $8, %r8d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %edi
+; KNL-NEXT: kshiftlw $7, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $9, %r9d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $6, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $10, %r10d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kshiftlw $5, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $11, %r11d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $4, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $12, %ebx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $13, %ebp, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $2, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $1, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; KNL-NEXT: kshiftlw $0, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %eax, %xmm5
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm5
; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $2, %r12d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $13, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $3, %edi, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %edi
+; KNL-NEXT: vpinsrb $3, %edx, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r12d
; KNL-NEXT: kshiftlw $12, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $4, %esi, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $11, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $5, %r13d, %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
; KNL-NEXT: kmovw %k0, %r13d
; KNL-NEXT: kshiftlw $10, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $6, %r8d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
; KNL-NEXT: kshiftlw $9, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $7, %r10d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vpinsrb $7, %edi, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: kshiftlw $8, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $8, %r11d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: vpinsrb $8, %r8d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %edi
; KNL-NEXT: kshiftlw $7, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $9, %ebx, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: vpinsrb $9, %r9d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r8d
; KNL-NEXT: kshiftlw $6, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: vpinsrb $10, %r10d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r9d
; KNL-NEXT: kshiftlw $5, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $11, %r14d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %ebx
; KNL-NEXT: kshiftlw $4, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $12, %r15d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: vpinsrb $12, %ebp, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %ebp
; KNL-NEXT: kshiftlw $3, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $13, %r9d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: vpinsrb $13, %r11d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: kshiftlw $2, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $14, %r12d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r11d
; KNL-NEXT: kshiftlw $1, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: vptestmd %zmm6, %zmm6, %k0
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: vptestmd %zmm7, %zmm7, %k0
; KNL-NEXT: kshiftlw $0, %k1, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %ecx, %xmm5
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: vmovd %eax, %xmm6
+; KNL-NEXT: kmovw %k1, %r15d
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $2, %edi, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r12d
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: vpinsrb $3, %edx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $5, %r8d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $6, %r10d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $7, %r11d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %esi
-; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %edi
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $8, %ebx, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $9, %ebp, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $10, %r14d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: vpinsrb $10, %ebx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %ebx
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $11, %r15d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: vpinsrb $11, %ebp, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %ebp
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r10d
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $13, %r12d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: vpinsrb $13, %r11d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r11d
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $14, %r9d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r14d
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $15, %edx, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6
; KNL-NEXT: kmovw %k1, %r15d
-; KNL-NEXT: vptestmd %zmm7, %zmm7, %k1
; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %eax, %xmm6
+; KNL-NEXT: vmovd %r12d, %xmm7
; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $14, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $15, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $2, %edi, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: kshiftlw $13, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: kshiftlw $12, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r8d
-; KNL-NEXT: kshiftlw $11, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $5, %r13d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r13d
-; KNL-NEXT: kshiftlw $10, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k0, %edi
-; KNL-NEXT: kshiftlw $9, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $7, %ebx, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %ebx
-; KNL-NEXT: kshiftlw $8, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $8, %ebp, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %ebp
-; KNL-NEXT: kshiftlw $7, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $9, %r10d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: kshiftlw $6, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $10, %r11d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r11d
-; KNL-NEXT: kshiftlw $5, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $11, %esi, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: kshiftlw $4, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r14d
-; KNL-NEXT: kshiftlw $3, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $13, %r9d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: kshiftlw $2, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $14, %r15d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r15d
-; KNL-NEXT: kshiftlw $1, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $15, %r12d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $0, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %edx, %xmm7
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vpinsrb $1, %eax, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $3, %r8d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $5, %edi, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $6, %ebx, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $7, %ebp, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $9, %r11d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $10, %esi, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $11, %r14d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $12, %r9d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $13, %r15d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $2, %edx, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $3, %r13d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $4, %eax, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $5, %esi, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $6, %edi, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $7, %r8d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $8, %r9d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $9, %ebx, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $11, %r10d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $12, %r11d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $13, %r14d, %xmm7, %xmm7
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
@@ -1803,8 +1799,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2
-; KNL-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm4
-; KNL-NEXT: vpinsrb $15, %edx, %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $14, %r15d, %xmm7, %xmm4
+; KNL-NEXT: vpinsrb $15, %r12d, %xmm4, %xmm4
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
@@ -1821,15 +1817,206 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %zmm2, %zmm2
; SKX-NEXT: vpmovb2m %zmm2, %k1
-; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; SKX-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; SKX-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: kshiftrq $32, %k1, %k1
-; SKX-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm3, %zmm0
-; SKX-NEXT: vmovaps %zmm2, %zmm1
+; SKX-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z}
; SKX-NEXT: retq
%ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
ret <64 x i16> %ret
}
+define <16 x i16> @shuffle_zext_16x8_to_16x16(<16 x i8> %a) nounwind readnone {
+; ALL-LABEL: shuffle_zext_16x8_to_16x16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; ALL-NEXT: retq
+ %1 = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16, i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 13, i32 16, i32 14, i32 16, i32 15, i32 16>
+ %2 = bitcast <32 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <16 x i16> @zext_32x8_to_16x16(<32 x i8> %a) {
+; ALL-LABEL: zext_32x8_to_16x16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; ALL-NEXT: retq
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 1, i32 32, i32 2, i32 32, i32 3, i32 32, i32 4, i32 32, i32 5, i32 32, i32 6, i32 32, i32 7, i32 32, i32 8, i32 32, i32 9, i32 32, i32 10, i32 32, i32 11, i32 32, i32 12, i32 32, i32 13, i32 32, i32 14, i32 32, i32 15, i32 32>
+ %2 = bitcast <32 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @zext_32x8_to_8x32(<32 x i8> %a) {
+; ALL-LABEL: zext_32x8_to_8x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; ALL-NEXT: retq
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
+ %2 = bitcast <32 x i8> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @zext_32x8_to_4x64(<32 x i8> %a) {
+; ALL-LABEL: zext_32x8_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: retq
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <8 x i32> @zext_16x16_to_8x32(<16 x i16> %a) {
+; ALL-LABEL: zext_16x16_to_8x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: retq
+ %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16>
+ %2 = bitcast <16 x i16> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @zext_16x16_to_4x64(<16 x i16> %a) {
+; ALL-LABEL: zext_16x16_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; ALL-NEXT: retq
+ %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
+ %2 = bitcast <16 x i16> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) {
+; ALL-LABEL: zext_8x32_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
+ %2 = bitcast <8 x i32> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
+; KNL-LABEL: zext_64xi1_to_64xi8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_64xi1_to_64xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <64 x i8> %x, %y
+ %1 = zext <64 x i1> %mask to <64 x i8>
+ ret <64 x i8> %1
+}
+
+define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
+; KNL-LABEL: zext_32xi1_to_32xi16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_32xi1_to_32xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <32 x i16> %x, %y
+ %1 = zext <32 x i1> %mask to <32 x i16>
+ ret <32 x i16> %1
+}
+
+define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
+; KNL-LABEL: zext_16xi1_to_16xi16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16xi1_to_16xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <16 x i16> %x, %y
+ %1 = zext <16 x i1> %mask to <16 x i16>
+ ret <16 x i16> %1
+}
+
+
+define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
+; KNL-LABEL: zext_32xi1_to_32xi8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_32xi1_to_32xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <32 x i16> %x, %y
+ %1 = zext <32 x i1> %mask to <32 x i8>
+ ret <32 x i8> %1
+}
+
+define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
+; KNL-LABEL: zext_4xi1_to_4x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; KNL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpsrld $31, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4xi1_to_4x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SKX-NEXT: vpandq %xmm2, %xmm1, %xmm1
+; SKX-NEXT: vpandq %xmm2, %xmm0, %xmm0
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <4 x i8> %x, %y
+ %1 = zext <4 x i1> %mask to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
+; KNL-LABEL: zext_2xi1_to_2xi64:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_2xi1_to_2xi64:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SKX-NEXT: vpandq %xmm2, %xmm1, %xmm1
+; SKX-NEXT: vpandq %xmm2, %xmm0, %xmm0
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <2 x i8> %x, %y
+ %1 = zext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %1
+}
diff --git a/test/CodeGen/X86/avx512-extract-subvector.ll b/test/CodeGen/X86/avx512-extract-subvector.ll
index 703f7832588c..8bd57c0fc1da 100644
--- a/test/CodeGen/X86/avx512-extract-subvector.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector.ll
@@ -14,6 +14,7 @@ define <8 x i16> @extract_subvector128_v32i16(<32 x i16> %x) nounwind {
define <8 x i16> @extract_subvector128_v32i16_first_element(<32 x i16> %x) nounwind {
; SKX-LABEL: extract_subvector128_v32i16_first_element:
; SKX: ## BB#0:
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %r1
@@ -31,6 +32,7 @@ define <16 x i8> @extract_subvector128_v64i8(<64 x i8> %x) nounwind {
define <16 x i8> @extract_subvector128_v64i8_first_element(<64 x i8> %x) nounwind {
; SKX-LABEL: extract_subvector128_v64i8_first_element:
; SKX: ## BB#0:
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %r1
@@ -54,3 +56,291 @@ define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind {
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
ret <32 x i8> %r1
}
+
+define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v8f64_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextractf64x2 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v8f32_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextractf32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v4i64_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextracti64x2 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %1 = bitcast i64* %addr to <2 x i64>*
+ store <2 x i64> %0, <2 x i64>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v8i32_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast i32* %addr to <4 x i32>*
+ store <4 x i32> %0, <4 x i32>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v16i16_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = bitcast i16* %addr to <8 x i16>*
+ store <8 x i16> %0, <8 x i16>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v32i8_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %1 = bitcast i8* %addr to <16 x i8>*
+ store <16 x i8> %0, <16 x i8>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v4f64_store_lo(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v4f64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovupd %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v4f32_store_lo(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v4f32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v2i64_store_lo(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v2i64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu64 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast i64* %addr to <2 x i64>*
+ store <2 x i64> %0, <2 x i64>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v4i32_store_lo(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v4i32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast i32* %addr to <4 x i32>*
+ store <4 x i32> %0, <4 x i32>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v8i16_store_lo(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v8i16_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast i16* %addr to <8 x i16>*
+ store <8 x i16> %0, <8 x i16>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v16i8_store_lo(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v16i8_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = bitcast i8* %addr to <16 x i8>*
+ store <16 x i8> %0, <16 x i8>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v2f64_store_lo(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v2f64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovupd %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v4f32_store_lo(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v4f32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v2i64_store_lo(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v2i64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu64 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast i64* %addr to <2 x i64>*
+ store <2 x i64> %0, <2 x i64>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v4i32_store_lo(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v4i32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast i32* %addr to <4 x i32>*
+ store <4 x i32> %0, <4 x i32>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v8i16_store_lo(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v8i16_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast i16* %addr to <8 x i16>*
+ store <8 x i16> %0, <8 x i16>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v16i8_store_lo(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v16i8_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = bitcast i8* %addr to <16 x i8>*
+ store <16 x i8> %0, <16 x i8>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v4f64_store_lo(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v4f64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovupd %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast double* %addr to <4 x double>*
+ store <4 x double> %0, <4 x double>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v8f32_store_lo(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v8f32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast float* %addr to <8 x float>*
+ store <8 x float> %0, <8 x float>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v4i64_store_lo(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v4i64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu64 %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast i64* %addr to <4 x i64>*
+ store <4 x i64> %0, <4 x i64>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v8i32_store_lo(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v8i32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast i32* %addr to <8 x i32>*
+ store <8 x i32> %0, <8 x i32>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v16i16_store_lo(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v16i16_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = bitcast i16* %addr to <16 x i16>*
+ store <16 x i16> %0, <16 x i16>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v32i8_store_lo(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v32i8_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %1 = bitcast i8* %addr to <32 x i8>*
+ store <32 x i8> %0, <32 x i8>* %1, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index c30fc909f09b..d8026cd987c2 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,78 +1,104 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_x86_vfnmadd_ps_z
- ; CHECK: vfnmadd213ps %zmm
+; CHECK-LABEL: test_x86_vfnmadd_ps_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd_ps
- ; CHECK: vfnmadd213ps %zmm
+; CHECK-LABEL: test_mask_vfnmadd_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_x86_vfnmadd_pd_z
- ; CHECK: vfnmadd213pd %zmm
+; CHECK-LABEL: test_x86_vfnmadd_pd_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd_pd
- ; CHECK: vfnmadd213pd %zmm
+; CHECK-LABEL: test_mask_vfnmadd_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_x86_vfnmsubps_z
- ; CHECK: vfnmsub213ps %zmm
+; CHECK-LABEL: test_x86_vfnmsubps_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub_ps
- ; CHECK: vfnmsub213ps %zmm
+; CHECK-LABEL: test_mask_vfnmsub_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_x86_vfnmsubpd_z
- ; CHECK: vfnmsub213pd %zmm
+; CHECK-LABEL: test_x86_vfnmsubpd_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub_pd
- ; CHECK: vfnmsub213pd %zmm
+; CHECK-LABEL: test_mask_vfnmsub_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_x86_vfmaddsubps_z
- ; CHECK: vfmaddsub213ps %zmm
+; CHECK-LABEL: test_x86_vfmaddsubps_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
; CHECK-LABEL: test_mask_fmaddsub_ps:
-; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
ret <16 x float> %res
}
@@ -80,16 +106,21 @@ define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16
declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_x86_vfmaddsubpd_z
- ; CHECK: vfmaddsub213pd %zmm
+; CHECK-LABEL: test_x86_vfmaddsubpd_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmaddsub_pd
- ; CHECK: vfmaddsub213pd %zmm
+; CHECK-LABEL: test_mask_vfmaddsub_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
@@ -97,8 +128,7 @@ define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1,
define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -115,8 +145,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -133,8 +162,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -200,8 +228,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -231,71 +258,96 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0,
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne
- ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn
- ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp
- ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz
- ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current
- ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne
- ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn
- ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp
- ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz
- ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current
- ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
@@ -305,8 +357,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -336,71 +387,96 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <1
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne
- ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn
- ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp
- ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz
- ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current
- ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne
- ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn
- ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp
- ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz
- ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current
- ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
@@ -408,8 +484,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0,
define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -426,8 +501,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -444,8 +518,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -508,71 +581,96 @@ define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <1
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne
- ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn
- ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp
- ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz
- ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current
- ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne
- ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn
- ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp
- ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz
- ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current
- ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
@@ -580,8 +678,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0
define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -598,8 +695,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double>, <8 x do
define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -646,8 +742,7 @@ define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <
define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll
index 9279441a23c7..b2d08355a851 100644
--- a/test/CodeGen/X86/avx512-fma.ll
+++ b/test/CodeGen/X86/avx512-fma.ll
@@ -67,11 +67,17 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8
}
define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
-; ALL-LABEL: test_x86_fmsub_213:
-; ALL: ## BB#0:
-; ALL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; ALL-NEXT: vmovaps %zmm1, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: test_x86_fmsub_213:
+; KNL: ## BB#0:
+; KNL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmsub_213:
+; SKX: ## BB#0:
+; SKX-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
ret double %res
@@ -86,7 +92,8 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
;
; SKX-LABEL: test_x86_fmsub_213_m:
; SKX: ## BB#0:
-; SKX-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0
+; SKX-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
%a2 = load double , double *%a2_ptr
%x = fmul double %a0, %a1
@@ -95,11 +102,17 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
}
define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
-; ALL-LABEL: test_x86_fmsub_231_m:
-; ALL: ## BB#0:
-; ALL-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
-; ALL-NEXT: vmovaps %zmm1, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: test_x86_fmsub_231_m:
+; KNL: ## BB#0:
+; KNL-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmsub_231_m:
+; SKX: ## BB#0:
+; SKX-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%a2 = load double , double *%a2_ptr
%x = fmul double %a0, %a2
%res = fsub double %x, %a1
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 9ba18192f5d2..d6bc66b591b2 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -183,7 +183,7 @@ define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
ret <8 x float> %res;
@@ -281,7 +281,7 @@ define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,2), %xmm0 {%k1}
@@ -314,7 +314,7 @@ define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,2), %ymm0 {%k1}
@@ -332,7 +332,7 @@ define <8 x i32>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
@@ -350,7 +350,7 @@ define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1,
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,2), %xmm0 {%k1}
@@ -369,7 +369,7 @@ define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
@@ -386,7 +386,7 @@ define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1,
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1}
@@ -404,7 +404,7 @@ define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
@@ -422,7 +422,7 @@ define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %xmm0 {%k1}
@@ -455,7 +455,7 @@ define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %ymm0 {%k1}
@@ -488,7 +488,7 @@ define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1,
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,2), %xmm0 {%k1}
@@ -507,7 +507,7 @@ define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
@@ -524,7 +524,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1,
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm0 {%k1}
@@ -542,7 +542,7 @@ define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
diff --git a/test/CodeGen/X86/avx512-inc-dec.ll b/test/CodeGen/X86/avx512-inc-dec.ll
index f04ca878f434..5183c9d0fb8f 100644
--- a/test/CodeGen/X86/avx512-inc-dec.ll
+++ b/test/CodeGen/X86/avx512-inc-dec.ll
@@ -2,7 +2,7 @@
;CHECK-LABEL: test
;CHECK-NOT: dec
-;CHECK_NOT: enc
+;CHECK-NOT: enc
;CHECK: ret
define i32 @test(i32 %a, i32 %b) {
%a1 = add i32 %a, -1
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 41ec62c7e047..2c42aca33e45 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,11 +1,25 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL --check-prefix=CHECK %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=CHECK %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s
-;CHECK-LABEL: test1:
-;CHECK: vinsertps
-;CHECK: vinsertf32x4
-;CHECK: ret
define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
+; KNL-LABEL: test1:
+; KNL: ## BB#0:
+; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; KNL-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test1:
+; SKX: ## BB#0:
+; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; SKX-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; SKX-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -15,19 +29,19 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; KNL-LABEL: test2:
; KNL: ## BB#0:
-; KNL-NEXT: vmovhpd (%rdi), %xmm0, %xmm2
+; KNL-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm2
-; KNL-NEXT: vmovsd %xmm1, %xmm2, %xmm1
+; KNL-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
; KNL-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test2:
; SKX: ## BB#0:
-; SKX-NEXT: vmovhpd (%rdi), %xmm0, %xmm2
+; SKX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; SKX-NEXT: vinsertf64x2 $0, %xmm2, %zmm0, %zmm0
; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm2
-; SKX-NEXT: vmovsd %xmm1, %xmm2, %xmm1
+; SKX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
; SKX-NEXT: vinsertf64x2 $3, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%rrr = load double, double* %br
@@ -36,11 +50,20 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
ret <8 x double> %rrr3
}
-;CHECK-LABEL: test3:
-;CHECK: vextractf32x4 $1
-;CHECK: vinsertf32x4 $0
-;CHECK: ret
define <16 x float> @test3(<16 x float> %x) nounwind {
+; KNL-LABEL: test3:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractf32x4 $1, %zmm0, %xmm1
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
+; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test3:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractf32x4 $1, %zmm0, %xmm1
+; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
+; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
@@ -67,70 +90,140 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
ret <8 x i64> %rrr2
}
-;CHECK-LABEL: test5:
-;CHECK: vextractps
-;CHECK: ret
define i32 @test5(<4 x float> %x) nounwind {
+; KNL-LABEL: test5:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractps $3, %xmm0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test5:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractps $3, %xmm0, %eax
+; SKX-NEXT: retq
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
}
-;CHECK-LABEL: test6:
-;CHECK: vextractps {{.*}}, (%rdi)
-;CHECK: ret
define void @test6(<4 x float> %x, float* %out) nounwind {
+; KNL-LABEL: test6:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractps $3, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test6:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractps $3, %xmm0, (%rdi)
+; SKX-NEXT: retq
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
}
-;CHECK-LABEL: test7
-;CHECK: vmovd
-;CHECK: vpermps %zmm
-;CHECK: ret
define float @test7(<16 x float> %x, i32 %ind) nounwind {
+; KNL-LABEL: test7:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test7:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovd %edi, %xmm1
+; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: retq
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
-;CHECK-LABEL: test8
-;CHECK: vmovq
-;CHECK: vpermpd %zmm
-;CHECK: ret
define double @test8(<8 x double> %x, i32 %ind) nounwind {
+; KNL-LABEL: test8:
+; KNL: ## BB#0:
+; KNL-NEXT: movslq %edi, %rax
+; KNL-NEXT: vmovq %rax, %xmm1
+; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test8:
+; SKX: ## BB#0:
+; SKX-NEXT: movslq %edi, %rax
+; SKX-NEXT: vmovq %rax, %xmm1
+; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: retq
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
-;CHECK-LABEL: test9
-;CHECK: vmovd
-;CHECK: vpermps %ymm
-;CHECK: ret
define float @test9(<8 x float> %x, i32 %ind) nounwind {
+; KNL-LABEL: test9:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test9:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovd %edi, %xmm1
+; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; SKX-NEXT: retq
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
-;CHECK-LABEL: test10
-;CHECK: vmovd
-;CHECK: vpermd %zmm
-;CHECK: vmovd %xmm0, %eax
-;CHECK: ret
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
+; KNL-LABEL: test10:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test10:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovd %edi, %xmm1
+; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; SKX-NEXT: vmovd %xmm0, %eax
+; SKX-NEXT: retq
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
-;CHECK-LABEL: test11
-;CHECK: vpcmpltud
-;CHECK: kshiftlw $11
-;CHECK: kshiftrw $15
-;CHECK: testb
-;CHECK: je
-;CHECK: ret
-;CHECK: ret
define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
+; KNL-LABEL: test11:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; KNL-NEXT: kshiftlw $11, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: je LBB10_2
+; KNL-NEXT: ## BB#1: ## %A
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+; KNL-NEXT: LBB10_2: ## %B
+; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test11:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftlw $11, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: je LBB10_2
+; SKX-NEXT: ## BB#1: ## %A
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+; SKX-NEXT: LBB10_2: ## %B
+; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; SKX-NEXT: retq
%cmp_res = icmp ult <16 x i32> %a, %b
%ia = extractelement <16 x i1> %cmp_res, i32 4
br i1 %ia, label %A, label %B
@@ -141,73 +234,144 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
ret <16 x i32>%c
}
-;CHECK-LABEL: test12
-;CHECK: vpcmpgtq
-;CHECK: kshiftlw $15
-;CHECK: kshiftrw $15
-;CHECK: testb
-;CHECK: ret
-
define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
-
+; KNL-LABEL: test12:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
+; KNL-NEXT: vpcmpgtq %zmm1, %zmm3, %k1
+; KNL-NEXT: kunpckbw %k0, %k1, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: cmoveq %rsi, %rdi
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test12:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
+; SKX-NEXT: vpcmpgtq %zmm1, %zmm3, %k1
+; SKX-NEXT: kunpckbw %k0, %k1, %k0
+; SKX-NEXT: kshiftlw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: cmoveq %rsi, %rdi
+; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <16 x i64> %a, %b
%extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
%res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
ret i64 %res
}
-;CHECK-LABEL: test13
-;CHECK: cmpl %esi, %edi
-;CHECK: setb %al
-;CHECK: andl $1, %eax
-;CHECK: kmovw %eax, %k0
-;CHECK: movw $-4
-;CHECK: korw
define i16 @test13(i32 %a, i32 %b) {
+; KNL-LABEL: test13:
+; KNL: ## BB#0:
+; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: setb %al
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: movw $-4, %ax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test13:
+; SKX: ## BB#0:
+; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: setb %al
+; SKX-NEXT: kmovw %eax, %k0
+; SKX-NEXT: movw $-4, %ax
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: korw %k0, %k1, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq
%cmp_res = icmp ult i32 %a, %b
%maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
%res = bitcast <16 x i1> %maskv to i16
ret i16 %res
}
-;CHECK-LABEL: test14
-;CHECK: vpcmpgtq
-;KNL: kshiftlw $11
-;KNL: kshiftrw $15
-;KNL: testb
-;SKX: kshiftlb $3
-;SKX: kshiftrb $7
-;SKX: testb
-;CHECK: ret
-
define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
-
+; KNL-LABEL: test14:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT: kshiftlw $11, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: cmoveq %rsi, %rdi
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test14:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
+; SKX-NEXT: kshiftlb $3, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: cmoveq %rsi, %rdi
+; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <8 x i64> %a, %b
%extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
%res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
ret i64 %res
}
-;CHECK-LABEL: test15
-;CHECK: movb (%rdi), %al
-;CHECK: andb $1, %al
-;CHECK: movw $-1, %ax
-;CHECK: cmovew
define i16 @test15(i1 *%addr) {
+; KNL-LABEL: test15:
+; KNL: ## BB#0:
+; KNL-NEXT: movb (%rdi), %al
+; KNL-NEXT: xorl %ecx, %ecx
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: movw $-1, %ax
+; KNL-NEXT: cmovew %cx, %ax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test15:
+; SKX: ## BB#0:
+; SKX-NEXT: movb (%rdi), %al
+; SKX-NEXT: xorl %ecx, %ecx
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: movw $-1, %ax
+; SKX-NEXT: cmovew %cx, %ax
+; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 1
%x1 = insertelement <16 x i1> undef, i1 %x, i32 10
%x2 = bitcast <16 x i1>%x1 to i16
ret i16 %x2
}
-;CHECK-LABEL: test16
-;CHECK: movb (%rdi), %al
-;CHECK: andw $1, %ax
-;CHECK: kmovw
-;CHECK: kshiftlw $10
-;CHECK: korw
-;CHECK: ret
define i16 @test16(i1 *%addr, i16 %a) {
+; KNL-LABEL: test16:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test16:
+; SKX: ## BB#0:
+; SKX-NEXT: movzbl (%rdi), %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: kmovd %eax, %k0
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kshiftlw $10, %k0, %k0
+; SKX-NEXT: korw %k0, %k1, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i16 %a to <16 x i1>
%x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
@@ -215,15 +379,30 @@ define i16 @test16(i1 *%addr, i16 %a) {
ret i16 %x2
}
-;CHECK-LABEL: test17
-;KNL: movb (%rdi), %al
-;KNL: andw $1, %ax
-;KNL: kshiftlw $4
-;KNL: korw
-;SKX: kshiftlb $4
-;SKX: korb
-;CHECK: ret
define i8 @test17(i1 *%addr, i8 %a) {
+; KNL-LABEL: test17:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kshiftlw $4, %k0, %k0
+; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test17:
+; SKX: ## BB#0:
+; SKX-NEXT: movzbl (%rdi), %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: kmovd %eax, %k0
+; SKX-NEXT: kmovb %esi, %k1
+; SKX-NEXT: kshiftlb $4, %k0, %k0
+; SKX-NEXT: korb %k0, %k1, %k0
+; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i8 %a to <8 x i1>
%x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
@@ -232,6 +411,13 @@ define i8 @test17(i1 *%addr, i8 %a) {
}
define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
+; KNL-LABEL: extract_v8i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v8i64:
; SKX: ## BB#0:
; SKX-NEXT: vpextrq $1, %xmm0, %rax
@@ -245,10 +431,17 @@ define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
}
define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
+; KNL-LABEL: extract_v4i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v4i64:
; SKX: ## BB#0:
; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
; SKX-NEXT: retq
%r1 = extractelement <4 x i64> %x, i32 1
@@ -258,6 +451,12 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
}
define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
+; KNL-LABEL: extract_v2i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v2i64:
; SKX: ## BB#0:
; SKX-NEXT: vmovq %xmm0, %rax
@@ -270,6 +469,13 @@ define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
}
define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
+; KNL-LABEL: extract_v16i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrd $1, %xmm0, %eax
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; KNL-NEXT: vpextrd $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v16i32:
; SKX: ## BB#0:
; SKX-NEXT: vpextrd $1, %xmm0, %eax
@@ -283,10 +489,17 @@ define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
}
define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
+; KNL-LABEL: extract_v8i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrd $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrd $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v8i32:
; SKX: ## BB#0:
; SKX-NEXT: vpextrd $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
; SKX-NEXT: retq
%r1 = extractelement <8 x i32> %x, i32 1
@@ -296,6 +509,12 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
}
define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
+; KNL-LABEL: extract_v4i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrd $1, %xmm0, %eax
+; KNL-NEXT: vpextrd $3, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v4i32:
; SKX: ## BB#0:
; SKX-NEXT: vpextrd $1, %xmm0, %eax
@@ -308,11 +527,20 @@ define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
}
define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
+; KNL-LABEL: extract_v32i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrw $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrw $1, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v32i16:
; SKX: ## BB#0:
; SKX-NEXT: vpextrw $1, %xmm0, %eax
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <32 x i16> %x, i32 1
%r2 = extractelement <32 x i16> %x, i32 9
@@ -321,11 +549,20 @@ define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
}
define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
+; KNL-LABEL: extract_v16i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrw $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrw $1, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v16i16:
; SKX: ## BB#0:
; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <16 x i16> %x, i32 1
%r2 = extractelement <16 x i16> %x, i32 9
@@ -334,10 +571,18 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
}
define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
+; KNL-LABEL: extract_v8i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrw $1, %xmm0, %eax
+; KNL-NEXT: vpextrw $3, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v8i16:
; SKX: ## BB#0:
; SKX-NEXT: vpextrw $1, %xmm0, %eax
; SKX-NEXT: vpextrw $3, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <8 x i16> %x, i32 1
%r2 = extractelement <8 x i16> %x, i32 3
@@ -346,11 +591,20 @@ define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
}
define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
+; KNL-LABEL: extract_v64i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrb $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrb $1, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v64i8:
; SKX: ## BB#0:
; SKX-NEXT: vpextrb $1, %xmm0, %eax
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <64 x i8> %x, i32 1
%r2 = extractelement <64 x i8> %x, i32 17
@@ -359,11 +613,20 @@ define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
}
define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
+; KNL-LABEL: extract_v32i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrb $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrb $1, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v32i8:
; SKX: ## BB#0:
; SKX-NEXT: vpextrb $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <32 x i8> %x, i32 1
%r2 = extractelement <32 x i8> %x, i32 17
@@ -372,10 +635,18 @@ define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
}
define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
+; KNL-LABEL: extract_v16i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrb $1, %xmm0, %eax
+; KNL-NEXT: vpextrb $3, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v16i8:
; SKX: ## BB#0:
; SKX-NEXT: vpextrb $1, %xmm0, %eax
; SKX-NEXT: vpextrb $3, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <16 x i8> %x, i32 1
%r2 = extractelement <16 x i8> %x, i32 3
@@ -384,6 +655,15 @@ define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
}
define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
+; KNL-LABEL: insert_v8i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm1
+; KNL-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
+; KNL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v8i64:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
@@ -399,11 +679,20 @@ define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
}
define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
+; KNL-LABEL: insert_v4i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v4i64:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
; SKX-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -414,6 +703,12 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
}
define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
+; KNL-LABEL: insert_v2i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
+; KNL-NEXT: vpinsrq $3, %rdi, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v2i64:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
@@ -426,6 +721,15 @@ define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
}
define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
+; KNL-LABEL: insert_v16i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm1
+; KNL-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1
+; KNL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v16i32:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
@@ -454,7 +758,7 @@ define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
; SKX: ## BB#0:
; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -520,7 +824,7 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
; SKX: ## BB#0:
; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -573,11 +877,20 @@ define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
}
define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
+; KNL-LABEL: insert_v32i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v32i8:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -646,7 +959,7 @@ define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
;
; SKX-LABEL: test_insert_128_v8f64:
; SKX: ## BB#0:
-; SKX-NEXT: vunpcklpd %xmm1, %xmm0, %xmm1
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; SKX-NEXT: vinsertf64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%r = insertelement <8 x double> %x, double %y, i32 1
@@ -656,13 +969,13 @@ define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
; KNL-LABEL: test_insert_128_v16f32:
; KNL: ## BB#0:
-; KNL-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm1
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_insert_128_v16f32:
; SKX: ## BB#0:
-; SKX-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm1
+; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%r = insertelement <16 x float> %x, float %y, i32 1
@@ -679,7 +992,7 @@ define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
;
; SKX-LABEL: test_insert_128_v16i16:
; SKX: ## BB#0:
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -697,7 +1010,7 @@ define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
;
; SKX-LABEL: test_insert_128_v32i8:
; SKX: ## BB#0:
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-intel-ocl.ll b/test/CodeGen/X86/avx512-intel-ocl.ll
index 2e1b27e4aecf..69e06f547ced 100644
--- a/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -15,7 +15,8 @@ declare i32 @func_int(i32, i32)
; X32-LABEL: testf16_inp
; X32: vaddps {{.*}}, {{%zmm[0-1]}}
-; X32: movl %eax, (%esp)
+; Push is not deemed profitable if we're realigning the stack.
+; X32: {{pushl|movl}} %eax
; X32: call
; X32: ret
@@ -68,10 +69,10 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
; X64-LABEL: test_prolog_epilog
-; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
-; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
-; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
-; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: kmovq %k7, {{.*}}(%rsp) ## 8-byte Spill
+; X64: kmovq %k6, {{.*}}(%rsp) ## 8-byte Spill
+; X64: kmovq %k5, {{.*}}(%rsp) ## 8-byte Spill
+; X64: kmovq %k4, {{.*}}(%rsp) ## 8-byte Spill
; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
; X64: call
@@ -102,4 +103,4 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a
%mask1 = xor <16 x i1> %cmp_res, %mask
%c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
ret <16 x float> %c
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..7a0424bd2eeb
--- /dev/null
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -0,0 +1,1134 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
+
+define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm512_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastd %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastd %xmm0, %zmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
+ %res1 = bitcast <16 x i32> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm512_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %zmm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm512_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %zmm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm512_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %zmm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <8 x double> @test_mm512_movddup_pd(<8 x double> %a0) {
+; X32-LABEL: test_mm512_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_movddup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_mask_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_movddup_pd(i8 %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_maskz_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
+; X32-LABEL: test_mm512_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_mask_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_maskz_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
+; X32-LABEL: test_mm512_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_mask_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_maskz_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
+; X32-LABEL: test_mm512_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_mask_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_maskz_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
+; X32-LABEL: test_mm512_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_mask_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_maskz_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
+; X32-LABEL: test_mm512_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
+; X32-LABEL: test_mm512_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_mask_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_maskz_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
+; X32-LABEL: test_mm512_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ %res1 = bitcast <16 x i32> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+; X32-LABEL: test_mm512_mask_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_maskz_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = bitcast <16 x i32> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = bitcast <16 x i32> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+!0 = !{i32 1}
+
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..7d0535546dfa
--- /dev/null
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -0,0 +1,1089 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
+
+define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask) {
+; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly
+
+define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res2, %res3
+ ret <8 x double> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res2, %res3
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
+; CHECK-LABEL: test_store1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovups %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+ call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
+
+define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
+; CHECK-LABEL: test_store2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovupd %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+ call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
+
+define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
+; CHECK-LABEL: test_mask_store_aligned_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovaps %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+ call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
+
+define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
+; CHECK-LABEL: test_mask_store_aligned_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovapd %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+ call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovdqu32 %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)
+
+define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)
+
+define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
+
+define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
+
+define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
+
+define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
+
+declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu32 (%rsi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res2, %res1
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res2, %res1
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8*, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> %res, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res2, %res1
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8*, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> %res, i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res2, %res1
+ ret <8 x i64> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm2 = zmm0[0,1,3,2,5,4,6,6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
+
+define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
+
+declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpunpckldq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx512_pslli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
+; CHECK-LABEL: test_x86_avx512_pslli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx512_psrli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
+; CHECK-LABEL: test_x86_avx512_psrli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx512_psrai_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
+; CHECK-LABEL: test_x86_avx512_psrai_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)
+
+define void @test_storent_q_512(<8 x i64> %data, i8* %ptr) {
+; CHECK-LABEL: test_storent_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovntdq %zmm0, (%rdi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
+ ret void
+}
+
+declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)
+
+define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
+; CHECK-LABEL: test_storent_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovntpd %zmm0, (%rdi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
+ ret void
+}
+
+declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>)
+
+define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
+; CHECK-LABEL: test_storent_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
+ ret void
+}
+
+define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_xor_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_or_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_or_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_and_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_and_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_xor_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_or_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_and_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 7179f742cc66..65ed77374388 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -7,11 +7,9 @@ define i32 @test_kortestz(i16 %a0, i16 %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kortestw %k0, %k1
; CHECK-NEXT: sete %al
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
ret i32 %res
@@ -69,6 +67,7 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) {
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kunpckbw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
ret i16 %res
@@ -126,26 +125,6 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
}
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
-define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_rsqrt14_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrsqrt14ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
-define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_rcp14_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_pd_512:
; CHECK: ## BB#0:
@@ -424,12 +403,154 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
- %res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
- ret i64 %res
+
+ %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
+ %res1 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 3)
+ %res2 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 1)
+ %res3 = add i64 %res, %res1
+ %res4 = add i64 %res3, %res2
+ ret i64 %res4
}
-declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone
+declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
+
+define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtsd2si %xmm0, %rcx
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+
+ %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
+ %res1 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 3)
+ %res2 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 1)
+ %res3 = add i64 %res, %res1
+ %res4 = add i64 %res3, %res2
+ ret i64 %res4
+}
+declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
+
+define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtss2usi %xmm0, %rcx
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+
+ %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
+ %res1 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 3)
+ %res2 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 1)
+ %res3 = add i64 %res, %res1
+ %res4 = add i64 %res3, %res2
+ ret i64 %res4
+}
+declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
+
+define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtss2si64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtss2si %xmm0, %rcx
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+
+ %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
+ %res1 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 3)
+ %res2 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 1)
+ %res3 = add i64 %res, %res1
+ %res4 = add i64 %res3, %res2
+ ret i64 %res4
+}
+declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+
+ %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 3)
+ %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 1)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+
+ %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 3)
+ %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 1)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+
+ %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 3)
+ %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 1)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtss2si32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtss2si %xmm0, %ecx
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+
+ %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 3)
+ %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 1)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone
define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512:
@@ -482,13 +603,20 @@ define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
-
-define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
+define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) {
; CHECK-LABEL: test_x86_vcvtps2ph_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
+ store <16 x i16> %res1, <16 x i16> * %dst
+ %res = add <16 x i16> %res2, %res3
ret <16 x i16> %res
}
@@ -514,100 +642,6 @@ define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
}
declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
-define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
-; CHECK: kmovw %edi, %k1
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
-
- %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
-
-
-define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
-
- %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res2, %res3
- ret <8 x double> %res4
-}
-declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly
-
-define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
- %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
- %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
- %res3 = add <16 x i32> %res, %res1
- %res4 = add <16 x i32> %res2, %res3
- ret <16 x i32> %res4
-}
-declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_x86_pbroadcastd_i32_512(i32 %a0) {
-; CHECK-LABEL: test_x86_pbroadcastd_i32_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastd %edi, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1]
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly
-
-define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
- %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
- %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
- %res3 = add <8 x i64> %res, %res1
- %res4 = add <8 x i64> %res2, %res3
- ret <8 x i64> %res4
-}
-declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
-; CHECK-LABEL: test_x86_pbroadcastq_i64_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1]
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
-
define <16 x i32> @test_conflict_d(<16 x i32> %a) {
; CHECK-LABEL: test_conflict_d:
; CHECK: ## BB#0:
@@ -643,8 +677,7 @@ define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_conflict_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -689,8 +722,7 @@ define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_lzcnt_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -698,70 +730,12 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
ret <8 x i64> %res
}
-define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
-; CHECK-LABEL: test_x86_mask_blend_ps_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vblendmps %zmm1, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
- ret <16 x float> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly
-
-define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
-; CHECK-LABEL: test_x86_mask_blend_pd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vblendmpd %zmm1, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
-}
-
-define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
-; CHECK-LABEL: test_x86_mask_blend_pd_512_memop:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %b = load <8 x double>, <8 x double>* %ptr
- %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a, <8 x double> %b, i8 %mask) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8) nounwind readonly
-
-define <16 x i32> @test_x86_mask_blend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
-; CHECK-LABEL: test_x86_mask_blend_d_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32> %a1, <16 x i32> %a2, i16 %a0) ; <<16 x i32>> [#uses=1]
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
-
-define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
-; CHECK-LABEL: test_x86_mask_blend_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
-
define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_cmpps:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
ret i16 %res
@@ -773,6 +747,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
ret i8 %res
@@ -825,8 +800,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpabsq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpabsq %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
@@ -837,149 +811,41 @@ define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x
ret <8 x i64> %res2
}
-define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
+define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) {
; CHECK-LABEL: test_vptestmq:
; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
- ret i8 %res
+ %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
+ %res2 = add i8 %res1, %res
+ ret i8 %res2
}
-declare i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
+declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
-define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1) {
+define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) {
; CHECK-LABEL: test_vptestmd:
; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
- ret i16 %res
-}
-declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
-
-define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_store1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
-
-define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_store2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
-
-define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_store_aligned_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
-
-define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_store_aligned_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
-
-define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
-; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
-; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res2, %res1
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
-
-define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovups (%rdi), %zmm0
-; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
-; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res2, %res1
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
-
-define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
-; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
-; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res2, %res1
- ret <8 x double> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
-
-define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovupd (%rdi), %zmm0
-; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
-; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res2, %res1
- ret <8 x double> %res4
+ %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
+ %res2 = add i16 %res1, %res
+ ret i16 %res2
}
-
-declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
+declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_valign_q:
@@ -993,8 +859,7 @@ define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
; CHECK-LABEL: test_mask_valign_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1028,127 +893,33 @@ define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
-define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_pcmpeq_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
-
-define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_pcmpeq_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
-
-define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_pcmpgt_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
-
-define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_pcmpgt_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
-
define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %esi
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k2
+; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k3
+; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k4
+; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k5
+; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k6
+; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k7
+; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: vmovd %r8d, %xmm0
-; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k7, %eax
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -1174,29 +945,29 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask)
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %esi
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT: kmovw %k2, %eax
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: vmovd %r8d, %xmm0
-; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k7, %eax
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -1223,29 +994,29 @@ define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpequd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %esi
-; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k2
+; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k3
+; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k4
+; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k5
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k6
+; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k7
+; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: vmovd %r8d, %xmm0
-; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k7, %eax
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -1271,29 +1042,29 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpequd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %esi
-; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT: kmovw %k2, %eax
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: vmovd %r8d, %xmm0
-; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k7, %eax
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -1320,36 +1091,28 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r11d
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k2
+; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k3
+; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k4
+; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k5
+; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k6
+; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k7
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movzbl %r8b, %esi
-; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r9b, %esi
-; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r10b, %esi
-; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r11b, %esi
-; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dil, %esi
-; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: kmovw %k7, %eax
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
@@ -1374,39 +1137,30 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r11d
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k1 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movzbl %r8b, %esi
-; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r9b, %esi
-; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r10b, %esi
-; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r11b, %esi
-; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dil, %esi
-; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: kmovw %k7, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
@@ -1434,36 +1188,28 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpequq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r11d
-; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k2
+; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k3
+; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k4
+; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k5
+; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k6
+; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k7
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movzbl %r8b, %esi
-; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r9b, %esi
-; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r10b, %esi
-; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r11b, %esi
-; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dil, %esi
-; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: kmovw %k7, %eax
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
@@ -1488,39 +1234,30 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpequq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r11d
-; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k1 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movzbl %r8b, %esi
-; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r9b, %esi
-; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r10b, %esi
-; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r11b, %esi
-; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dil, %esi
-; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: kmovw %k7, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
@@ -1591,204 +1328,6 @@ define <4 x double> @test_vextractf64x4(<8 x double> %a) {
declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
-define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx512_pslli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
-; CHECK-LABEL: test_x86_avx512_pslli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
-
-define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx512_psrli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
-; CHECK-LABEL: test_x86_avx512_psrli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
-
-define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx512_psrai_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
-; CHECK-LABEL: test_x86_avx512_psrai_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
-
define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_d:
; CHECK: ## BB#0:
@@ -1833,8 +1372,7 @@ define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1845,8 +1383,7 @@ define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -1899,8 +1436,7 @@ define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1911,8 +1447,7 @@ define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -1965,8 +1500,7 @@ define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1977,8 +1511,7 @@ define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2031,8 +1564,7 @@ define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2043,8 +1575,7 @@ define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2098,8 +1629,7 @@ define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2110,8 +1640,7 @@ define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2164,8 +1693,7 @@ define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2176,8 +1704,7 @@ define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2378,8 +1905,7 @@ define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float>
define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rn:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2390,8 +1916,7 @@ define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rd:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2402,8 +1927,7 @@ define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_ru:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2414,8 +1938,7 @@ define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2423,142 +1946,6 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8
ret <8 x double> %res
}
-define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_xor_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_xor_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_or_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_or_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_and_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_and_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_xor_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_xor_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_or_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_or_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_and_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_and_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-
define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_add_epi32_rr:
; CHECK: ## BB#0:
@@ -2779,8 +2166,7 @@ define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2791,8 +2177,7 @@ define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64>
define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -2812,8 +2197,7 @@ define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2825,8 +2209,7 @@ define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x
define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -2849,8 +2232,7 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2864,8 +2246,7 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -2889,8 +2270,7 @@ define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2901,8 +2281,7 @@ define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64>
define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -2922,8 +2301,7 @@ define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2935,8 +2313,7 @@ define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x
define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -2959,8 +2336,7 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2974,8 +2350,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -2999,8 +2374,7 @@ define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -3011,8 +2385,7 @@ define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -3032,8 +2405,7 @@ define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -3045,8 +2417,7 @@ define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8
define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -3070,8 +2441,7 @@ define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -3086,8 +2456,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -3112,8 +2481,7 @@ define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -3124,8 +2492,7 @@ define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -3145,8 +2512,7 @@ define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -3158,8 +2524,7 @@ define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8
define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -3183,8 +2548,7 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -3199,8 +2563,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -4314,8 +3677,7 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32
define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -4347,8 +3709,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -4380,8 +3741,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -4411,8 +3771,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -4446,8 +3805,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
@@ -4481,8 +3839,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <
define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
@@ -4517,8 +3874,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x do
define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm2
; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
@@ -4556,8 +3912,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
@@ -4590,8 +3945,7 @@ declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x doub
define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
@@ -4617,142 +3971,6 @@ define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16
ret <16 x float> %res2
}
-declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
-; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
- %res2 = fadd <8 x double> %res, %res1
- ret <8 x double> %res2
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
-; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
- %res2 = fadd <16 x float> %res, %res1
- ret <16 x float> %res2
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
-; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
- %res2 = fadd <8 x double> %res, %res1
- ret <8 x double> %res2
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
-; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
- %res2 = fadd <16 x float> %res, %res1
- ret <16 x float> %res2
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = k1[0],zmm0[0],k1[2],zmm0[2],k1[4],zmm0[4],k1[6],zmm0[6]
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
- %res3 = add <8 x i64> %res, %res1
- %res4 = add <8 x i64> %res2, %res3
- ret <8 x i64> %res4
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res2 = add <8 x i64> %res, %res1
- ret <8 x i64> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
-; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpunpckldq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
-; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
@@ -4778,8 +3996,7 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqb %zmm0, (%rdi)
; CHECK-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
; CHECK-NEXT: retq
@@ -4861,8 +4078,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovqw %zmm0, %xmm0
@@ -4882,8 +4098,7 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqw %zmm0, (%rdi)
; CHECK-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
; CHECK-NEXT: retq
@@ -4897,8 +4112,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsqw %zmm0, %xmm0
@@ -4932,8 +4146,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusqw %zmm0, %xmm0
@@ -4967,8 +4180,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovqd %zmm0, %ymm0
@@ -4988,8 +4200,7 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqd %zmm0, (%rdi)
; CHECK-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: retq
@@ -5003,8 +4214,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovsqd %zmm0, %ymm0
@@ -5038,8 +4248,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovusqd %zmm0, %ymm0
@@ -5277,8 +4486,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>,
define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
@@ -5310,8 +4518,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -5327,8 +4534,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>
define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -5344,8 +4550,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -5377,8 +4582,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double
define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
@@ -5410,8 +4614,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -5427,8 +4630,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>
define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
@@ -5461,8 +4663,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>,
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -5505,39 +4706,6 @@ define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16
ret <16 x i32> %res2
}
-
-declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32)
-define <4 x float>@test_int_x86_avx512_mask_scalef_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vscalefss %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 8)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32)
-define <2 x double>@test_int_x86_avx512_mask_scalef_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 8)
- %res2 = fadd <2 x double> %res, %res1
- ret <2 x double> %res2
-}
-
declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
@@ -5601,8 +4769,7 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: shlb $7, %al
-; CHECK-NEXT: sarb $7, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT: retq
%res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
@@ -5623,8 +4790,7 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
; CHECK-NEXT: kandw %k2, %k1, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: shlb $7, %al
-; CHECK-NEXT: sarb $7, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
@@ -5647,8 +4813,7 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: shlb $7, %al
-; CHECK-NEXT: sarb $7, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT: retq
%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
@@ -5661,15 +4826,16 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k1
; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k1
-; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k2
-; CHECK-NEXT: kandw %k2, %k1, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: shlb $7, %al
-; CHECK-NEXT: sarb $7, %al
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k2 {%k1}
+; CHECK-NEXT: kmovw %k2, %ecx
+; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: andb %dl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
@@ -5688,7 +4854,7 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -5703,10 +4869,9 @@ declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>
define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
@@ -5726,7 +4891,7 @@ define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32>
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -5741,9 +4906,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8
define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -5758,8 +4922,7 @@ declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8
define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
@@ -5842,10 +5005,9 @@ declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double
define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} zmm2 = zmm2[0],k1[1],zmm2[3],k1[2],zmm2[5],k1[4],zmm2[6],k1[6]
-; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = k1[0],zmm0[1],k1[3],zmm0[2],k1[5],zmm0[4],k1[6],zmm0[6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vshufpd {{.*#+}} zmm3 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
@@ -5865,7 +5027,7 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufps {{.*#+}} zmm2 = zmm2[2,1],k1[1,0],zmm2[6,5],k1[5,4],zmm2[10,9],k1[9,8],zmm2[14,13],k1[13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -5875,54 +5037,12 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x
ret <16 x float> %res2
}
-declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 = zmm1[0,1,3,2,5,4,6,6]
-; CHECK-NEXT: vpermilpd {{.*#+}} zmm2 = k1[0,1,3,2,5,4,6,6]
-; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,3,2,5,4,6,6]
-; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res3, %res2
- ret <8 x double> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm1 = zmm1[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT: vpermilps {{.*#+}} zmm2 = k1[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res3, %res2
- ret <16 x float> %res4
-}
-
declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0
@@ -5957,9 +5077,9 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0,
ret <16 x float> %res4
}
-declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i8)
+declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)
-define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i8 %x4) {
+define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
@@ -5969,17 +5089,17 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i8 %x4)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i8 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i8 %x4)
+ %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
%res3 = fadd <16 x float> %res, %res1
%res4 = fadd <16 x float> %res2, %res3
ret <16 x float> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i8)
+declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i8 %x4) {
+define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
@@ -5989,9 +5109,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i8 %x4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i8 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i8 %x4)
+ %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res2, %res3
ret <16 x i32> %res4
@@ -6002,8 +5122,7 @@ declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x do
define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6023,8 +5142,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i3
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6039,9 +5157,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
ret <8 x i64> %res4
}
-declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float>, <4 x float>, <2 x double>, i8, i32)
+declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
-define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<4 x float> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
+define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
@@ -6050,15 +5168,15 @@ define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<4 x float> %x0,<4
; CHECK-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
+ %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
%res2 = fadd <2 x double> %res, %res1
ret <2 x double> %res2
}
-declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double>, <2 x double>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)
-define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<2 x double> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
+define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
@@ -6067,8 +5185,8 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<2 x double> %x0,<2
; CHECK-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
+ %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
%res2 = fadd <4 x float> %res, %res1
ret <4 x float> %res2
}
@@ -6112,8 +5230,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0
@@ -6130,8 +5247,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0
@@ -6143,73 +5259,11 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6
ret <8 x i64> %res2
}
-declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res2, %res3
- ret <8 x double> %res4
-}
-
define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd {sae}, %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vcmpeqsd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
ret i32 %res
@@ -6218,9 +5272,8 @@ define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd {sae}, %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
ret i32 %res
@@ -6229,9 +5282,8 @@ define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1)
define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vcmpeqsd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
ret i32 %res
@@ -6240,9 +5292,8 @@ define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vcmpeq_uqsd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
ret i32 %res
@@ -6251,9 +5302,8 @@ define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd {sae}, %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpltsd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
ret i32 %res
@@ -6262,9 +5312,8 @@ define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd {sae}, %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpngesd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
ret i32 %res
@@ -6273,9 +5322,8 @@ define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1)
define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpltsd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
ret i32 %res
@@ -6284,9 +5332,8 @@ define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpngesd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
ret i32 %res
@@ -6297,9 +5344,8 @@ declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpngess %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
ret i32 %res
@@ -6377,12 +5423,15 @@ declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vaddps %zmm1, %zmm0, %zmm0
-; CHECK: vaddps %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
%res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
@@ -6396,12 +5445,15 @@ declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vaddpd %zmm1, %zmm0, %zmm0
-; CHECK: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
%res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
@@ -6415,12 +5467,15 @@ declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
@@ -6434,12 +5489,15 @@ declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
@@ -6449,30 +5507,29 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x
ret <8 x i64> %res5
}
-declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8)
-define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 -1)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> zeroinitializer, i8 %x3)
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> %x2, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> zeroinitializer, i8 %x3)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res3, %res2
ret <8 x i64> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
@@ -6482,17 +5539,17 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3)
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> zeroinitializer, i16 %x3)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
@@ -6502,38 +5559,37 @@ define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i8 %x1, <
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}
-declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8)
-define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res3, %res2
ret <8 x i64> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
@@ -6543,67 +5599,46 @@ define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i8 %x1, <
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}
-declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8)
-define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res3, %res2
ret <8 x i64> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i16, <16 x i32>, i8)
-
-define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i16 %x1, <16 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpshufd $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpshufd $3, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> zeroinitializer, i8 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 -1)
- %res3 = add <16 x i32> %res, %res1
- %res4 = add <16 x i32> %res3, %res2
- ret <16 x i32> %res4
-}
-
declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
@@ -6617,14 +5652,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -6633,61 +5667,58 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %
ret <8 x i64> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vprold $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}
-declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i8, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8)
-define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res3, %res2
ret <8 x i64> %res4
}
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-
declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxbd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbd %xmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
@@ -6701,14 +5732,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxbq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
@@ -6722,14 +5752,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxdq %ymm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxdq %ymm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
@@ -6743,13 +5772,13 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i
define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxwd %ymm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwd %ymm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
@@ -6763,14 +5792,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxwq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
@@ -6784,13 +5812,13 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i1
define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
@@ -6804,14 +5832,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
@@ -6825,14 +5852,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
@@ -6847,13 +5873,13 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i
define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
@@ -6868,14 +5894,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
@@ -6884,4 +5909,532 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64>
ret <8 x i64> %res4
}
+declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
+; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
+ %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 4)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
+; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vaddps %zmm4, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 8)
+ %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 -1, i32 4)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
+; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2)
+
+define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpbroadcastd %edi, %zmm2
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm2
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)
+
+declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+ %res4 = fadd <2 x double> %res, %res1
+ %res5 = fadd <2 x double> %res2, %res3
+ %res6 = fadd <2 x double> %res4, %res5
+ ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res4 = fadd <4 x float> %res, %res1
+ %res5 = fadd <4 x float> %res2, %res3
+ %res6 = fadd <4 x float> %res4, %res5
+ ret <4 x float> %res6
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm2, %zmm5
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+ %res4 = fadd <2 x double> %res, %res1
+ %res5 = fadd <2 x double> %res2, %res3
+ %res6 = fadd <2 x double> %res4, %res5
+ ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm2, %zmm5
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res4 = fadd <4 x float> %res, %res1
+ %res5 = fadd <4 x float> %res2, %res3
+ %res6 = fadd <4 x float> %res4, %res5
+ ret <4 x float> %res6
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
+ ret < 4 x float> %res
+}
+
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
+ ret < 4 x float> %res
+}
diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll
index c973b706e8fc..d085467868ab 100644
--- a/test/CodeGen/X86/avx512-logic.ll
+++ b/test/CodeGen/X86/avx512-logic.ll
@@ -17,6 +17,22 @@ entry:
ret <16 x i32> %x
}
+define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: vpandnd:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; ALL-NEXT: vpandnd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+ i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %b2 = xor <16 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1,
+ i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %x = and <16 x i32> %a2, %b2
+ ret <16 x i32> %x
+}
+
define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpord:
; ALL: ## BB#0: ## %entry
@@ -58,6 +74,20 @@ entry:
ret <8 x i64> %x
}
+define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: vpandnq:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ %b2 = xor <8 x i64> %b, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+ %x = and <8 x i64> %a2, %b2
+ ret <8 x i64> %x
+}
+
define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vporq:
; ALL: ## BB#0: ## %entry
@@ -133,6 +163,25 @@ define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
ret <64 x i8> %res
}
+define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; KNL-LABEL: andn_v64i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: andn_v64i8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; SKX-NEXT: retq
+ %b2 = xor <64 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = and <64 x i8> %a, %b2
+ ret <64 x i8> %res
+}
+
define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: or_v64i8:
; KNL: ## BB#0:
@@ -178,6 +227,23 @@ define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
ret <32 x i16> %res
}
+define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; KNL-LABEL: andn_v32i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: andn_v32i16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; SKX-NEXT: retq
+ %b2 = xor <32 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1,
+ i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = and <32 x i16> %a, %b2
+ ret <32 x i16> %res
+}
+
define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: or_v32i16:
; KNL: ## BB#0:
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 015c70a6ba08..cb63f9108e29 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
define i16 @mask16(i16 %x) {
; CHECK-LABEL: mask16:
@@ -8,6 +8,7 @@ define i16 @mask16(i16 %x) {
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: knotw %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -15,13 +16,27 @@ define i16 @mask16(i16 %x) {
ret i16 %ret
}
+define i32 @mask16_zext(i16 %x) {
+; CHECK-LABEL: mask16_zext:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %m2 = bitcast <16 x i1> %m1 to i16
+ %ret = zext i16 %m2 to i32
+ ret i32 %ret
+}
+
define i8 @mask8(i8 %x) {
; KNL-LABEL: mask8:
; KNL: ## BB#0:
-; KNL-NEXT: movzbl %dil, %eax
-; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: mask8:
@@ -29,6 +44,7 @@ define i8 @mask8(i8 %x) {
; SKX-NEXT: kmovb %edi, %k0
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -36,6 +52,27 @@ define i8 @mask8(i8 %x) {
ret i8 %ret
}
+define i32 @mask8_zext(i8 %x) {
+; KNL-LABEL: mask8_zext:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: mask8_zext:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb %edi, %k0
+; SKX-NEXT: knotb %k0, %k0
+; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: retq
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %m2 = bitcast <8 x i1> %m1 to i8
+ %ret = zext i8 %m2 to i32
+ ret i32 %ret
+}
+
define void @mask16_mem(i16* %ptr) {
; CHECK-LABEL: mask16_mem:
; CHECK: ## BB#0:
@@ -54,9 +91,11 @@ define void @mask16_mem(i16* %ptr) {
define void @mask8_mem(i8* %ptr) {
; KNL-LABEL: mask8_mem:
; KNL: ## BB#0:
-; KNL-NEXT: kmovw (%rdi), %k0
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: kmovw %eax, %k0
; KNL-NEXT: knotw %k0, %k0
-; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: mask8_mem:
@@ -76,15 +115,34 @@ define void @mask8_mem(i8* %ptr) {
define i16 @mand16(i16 %x, i16 %y) {
; CHECK-LABEL: mand16:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: xorl %esi, %eax
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %ma = bitcast i16 %x to <16 x i1>
+ %mb = bitcast i16 %y to <16 x i1>
+ %mc = and <16 x i1> %ma, %mb
+ %md = xor <16 x i1> %ma, %mb
+ %me = or <16 x i1> %mc, %md
+ %ret = bitcast <16 x i1> %me to i16
+ ret i16 %ret
+}
+
+define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
+; CHECK-LABEL: mand16_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw (%rdi), %k0
+; CHECK-NEXT: kmovw (%rsi), %k1
; CHECK-NEXT: kandw %k1, %k0, %k2
; CHECK-NEXT: kxorw %k1, %k0, %k0
; CHECK-NEXT: korw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
- %ma = bitcast i16 %x to <16 x i1>
- %mb = bitcast i16 %y to <16 x i1>
+ %ma = load <16 x i1>, <16 x i1>* %x
+ %mb = load <16 x i1>, <16 x i1>* %y
%mc = and <16 x i1> %ma, %mb
%md = xor <16 x i1> %ma, %mb
%me = or <16 x i1> %mc, %md
@@ -98,6 +156,7 @@ define i8 @shuf_test1(i16 %v) nounwind {
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kshiftrw $8, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: shuf_test1:
@@ -105,6 +164,7 @@ define i8 @shuf_test1(i16 %v) nounwind {
; SKX-NEXT: kmovw %edi, %k0
; SKX-NEXT: kshiftrw $8, %k0, %k0
; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%v1 = bitcast i16 %v to <16 x i1>
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -119,18 +179,36 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: kshiftlw $10, %k0, %k0
; CHECK-NEXT: kshiftrw $15, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i32
ret i32 %res
-}define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+}
+
+define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: zext_test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: kshiftlw $10, %k0, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i16
ret i16 %res
-}define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+}
+
+define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: zext_test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: kshiftlw $10, %k0, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i8
@@ -232,7 +310,6 @@ define void @test7(<8 x i1> %mask) {
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: movb $85, %al
-; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -246,8 +323,7 @@ define void @test7(<8 x i1> %mask) {
; SKX-NEXT: movb $85, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: korb %k1, %k0, %k0
-; SKX-NEXT: kmovb %k0, %eax
-; SKX-NEXT: testb %al, %al
+; SKX-NEXT: ktestb %k0, %k0
; SKX-NEXT: retq
allocas:
%a= or <8 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
@@ -266,14 +342,15 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; KNL: ## BB#0:
; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: jg LBB14_1
+; KNL-NEXT: jg LBB17_1
; KNL-NEXT: ## BB#2:
; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1
-; KNL-NEXT: jmp LBB14_3
-; KNL-NEXT: LBB14_1:
+; KNL-NEXT: jmp LBB17_3
+; KNL-NEXT: LBB17_1:
; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
-; KNL-NEXT: LBB14_3:
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: LBB17_3:
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -281,12 +358,12 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; SKX: ## BB#0:
; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2
; SKX-NEXT: cmpl %esi, %edi
-; SKX-NEXT: jg LBB14_1
+; SKX-NEXT: jg LBB17_1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
-; SKX-NEXT: LBB14_1:
+; SKX-NEXT: LBB17_1:
; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
@@ -301,29 +378,30 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test9:
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: jg LBB15_1
+; KNL-NEXT: jg LBB18_1
; KNL-NEXT: ## BB#2:
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
-; KNL-NEXT: jmp LBB15_3
-; KNL-NEXT: LBB15_1:
+; KNL-NEXT: jmp LBB18_3
+; KNL-NEXT: LBB18_1:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: LBB15_3:
+; KNL-NEXT: LBB18_3:
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: cmpl %esi, %edi
-; SKX-NEXT: jg LBB15_1
+; SKX-NEXT: jg LBB18_1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm0
-; SKX-NEXT: jmp LBB15_3
-; SKX-NEXT: LBB15_1:
+; SKX-NEXT: jmp LBB18_3
+; SKX-NEXT: LBB18_1:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
-; SKX-NEXT: LBB15_3:
+; SKX-NEXT: LBB18_3:
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
@@ -340,23 +418,23 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test11:
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: jg LBB17_2
+; KNL-NEXT: jg LBB20_2
; KNL-NEXT: ## BB#1:
; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: LBB17_2:
+; KNL-NEXT: LBB20_2:
; KNL-NEXT: retq
;
; SKX-LABEL: test11:
; SKX: ## BB#0:
; SKX-NEXT: cmpl %esi, %edi
-; SKX-NEXT: jg LBB17_1
+; SKX-NEXT: jg LBB20_1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpslld $31, %xmm1, %xmm0
-; SKX-NEXT: jmp LBB17_3
-; SKX-NEXT: LBB17_1:
+; SKX-NEXT: jmp LBB20_3
+; SKX-NEXT: LBB20_1:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: LBB17_3:
-; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: LBB20_3:
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
%mask = icmp sgt i32 %a1, %b1
@@ -399,7 +477,8 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
; KNL-NEXT: movw $1, %cx
; KNL-NEXT: cmovgw %ax, %cx
; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -420,6 +499,7 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
}
define <64 x i8> @test16(i64 %x) {
+;
; KNL-LABEL: test16:
; KNL: ## BB#0:
; KNL-NEXT: pushq %rbp
@@ -430,432 +510,34 @@ define <64 x i8> @test16(i64 %x) {
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: Ltmp2:
; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: pushq %r15
-; KNL-NEXT: pushq %r14
-; KNL-NEXT: pushq %r13
-; KNL-NEXT: pushq %r12
-; KNL-NEXT: pushq %rbx
; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: Ltmp3:
-; KNL-NEXT: .cfi_offset %rbx, -56
-; KNL-NEXT: Ltmp4:
-; KNL-NEXT: .cfi_offset %r12, -48
-; KNL-NEXT: Ltmp5:
-; KNL-NEXT: .cfi_offset %r13, -40
-; KNL-NEXT: Ltmp6:
-; KNL-NEXT: .cfi_offset %r14, -32
-; KNL-NEXT: Ltmp7:
-; KNL-NEXT: .cfi_offset %r15, -24
-; KNL-NEXT: movq %rdi, %rax
-; KNL-NEXT: shrq $32, %rax
-; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
-; KNL-NEXT: movl $271, %eax ## imm = 0x10F
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: movl %edi, %ecx
-; KNL-NEXT: andl $1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
-; KNL-NEXT: movl $257, %ecx ## imm = 0x101
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $258, %ecx ## imm = 0x102
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $259, %ecx ## imm = 0x103
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $260, %ecx ## imm = 0x104
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $261, %ecx ## imm = 0x105
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $262, %ecx ## imm = 0x106
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $263, %ecx ## imm = 0x107
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $264, %ecx ## imm = 0x108
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $265, %ecx ## imm = 0x109
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $266, %ecx ## imm = 0x10A
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $267, %ecx ## imm = 0x10B
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $268, %ecx ## imm = 0x10C
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $269, %ecx ## imm = 0x10D
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $270, %ecx ## imm = 0x10E
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: movl %edi, (%rsp)
+; KNL-NEXT: shrq $32, %rdi
+; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: kmovw (%rsp), %k1
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2
; KNL-NEXT: movl $1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm0
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r15d
-; KNL-NEXT: movq %r15, %rdx
-; KNL-NEXT: shrq $17, %rdx
-; KNL-NEXT: andb $1, %dl
-; KNL-NEXT: je LBB22_2
-; KNL-NEXT: ## BB#1:
-; KNL-NEXT: movb $-1, %dl
-; KNL-NEXT: LBB22_2:
-; KNL-NEXT: movq %r15, %r11
-; KNL-NEXT: shrq $16, %r11
-; KNL-NEXT: andb $1, %r11b
-; KNL-NEXT: je LBB22_4
-; KNL-NEXT: ## BB#3:
-; KNL-NEXT: movb $-1, %r11b
-; KNL-NEXT: LBB22_4:
-; KNL-NEXT: movq %r15, %r10
-; KNL-NEXT: shrq $18, %r10
-; KNL-NEXT: andb $1, %r10b
-; KNL-NEXT: je LBB22_6
-; KNL-NEXT: ## BB#5:
-; KNL-NEXT: movb $-1, %r10b
-; KNL-NEXT: LBB22_6:
-; KNL-NEXT: movq %r15, %r9
-; KNL-NEXT: shrq $19, %r9
-; KNL-NEXT: andb $1, %r9b
-; KNL-NEXT: je LBB22_8
-; KNL-NEXT: ## BB#7:
-; KNL-NEXT: movb $-1, %r9b
-; KNL-NEXT: LBB22_8:
-; KNL-NEXT: movq %r15, %rbx
-; KNL-NEXT: shrq $20, %rbx
-; KNL-NEXT: andb $1, %bl
-; KNL-NEXT: je LBB22_10
-; KNL-NEXT: ## BB#9:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB22_10:
-; KNL-NEXT: movq %r15, %r12
-; KNL-NEXT: shrq $21, %r12
-; KNL-NEXT: andb $1, %r12b
-; KNL-NEXT: je LBB22_12
-; KNL-NEXT: ## BB#11:
-; KNL-NEXT: movb $-1, %r12b
-; KNL-NEXT: LBB22_12:
-; KNL-NEXT: movq %r15, %r14
-; KNL-NEXT: shrq $22, %r14
-; KNL-NEXT: andb $1, %r14b
-; KNL-NEXT: je LBB22_14
-; KNL-NEXT: ## BB#13:
-; KNL-NEXT: movb $-1, %r14b
-; KNL-NEXT: LBB22_14:
-; KNL-NEXT: movq %r15, %r8
-; KNL-NEXT: shrq $23, %r8
-; KNL-NEXT: andb $1, %r8b
-; KNL-NEXT: je LBB22_16
-; KNL-NEXT: ## BB#15:
-; KNL-NEXT: movb $-1, %r8b
-; KNL-NEXT: LBB22_16:
-; KNL-NEXT: movq %r15, %r13
-; KNL-NEXT: shrq $24, %r13
-; KNL-NEXT: andb $1, %r13b
-; KNL-NEXT: je LBB22_18
-; KNL-NEXT: ## BB#17:
-; KNL-NEXT: movb $-1, %r13b
-; KNL-NEXT: LBB22_18:
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $25, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_20
-; KNL-NEXT: ## BB#19:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_20:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $26, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_22
-; KNL-NEXT: ## BB#21:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_22:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movl $272, %esi ## imm = 0x110
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $27, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_24
-; KNL-NEXT: ## BB#23:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_24:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movl $273, %eax ## imm = 0x111
-; KNL-NEXT: bextrl %esi, %edi, %esi
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $28, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB22_26
-; KNL-NEXT: ## BB#25:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB22_26:
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vmovd %esi, %xmm2
-; KNL-NEXT: movl $274, %esi ## imm = 0x112
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $29, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB22_28
-; KNL-NEXT: ## BB#27:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB22_28:
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT: bextrl %esi, %edi, %eax
-; KNL-NEXT: movzbl %r11b, %esi
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $30, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB22_30
-; KNL-NEXT: ## BB#29:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB22_30:
-; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT: movl $275, %eax ## imm = 0x113
-; KNL-NEXT: bextrl %eax, %edi, %r11d
-; KNL-NEXT: movzbl %dl, %edx
-; KNL-NEXT: vmovd %esi, %xmm3
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $31, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_32
-; KNL-NEXT: ## BB#31:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_32:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT: movl $276, %eax ## imm = 0x114
-; KNL-NEXT: bextrl %eax, %edi, %esi
-; KNL-NEXT: movl $277, %r11d ## imm = 0x115
-; KNL-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r10b, %r10d
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_34
-; KNL-NEXT: ## BB#33:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_34:
-; KNL-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r11d, %edi, %edx
-; KNL-NEXT: movl $278, %r11d ## imm = 0x116
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r9b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shlq $63, %rcx
-; KNL-NEXT: sarq $63, %rcx
-; KNL-NEXT: vmovd %ecx, %xmm4
-; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $2, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_36
-; KNL-NEXT: ## BB#35:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_36:
-; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r11d, %edi, %edx
-; KNL-NEXT: movl $279, %r9d ## imm = 0x117
-; KNL-NEXT: vpinsrb $3, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %bl, %ebx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $3, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_38
-; KNL-NEXT: ## BB#37:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_38:
-; KNL-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r9d, %edi, %edx
-; KNL-NEXT: movl $280, %esi ## imm = 0x118
-; KNL-NEXT: vpinsrb $4, %ebx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r12b, %ebx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $4, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_40
-; KNL-NEXT: ## BB#39:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_40:
-; KNL-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %esi, %edi, %ecx
-; KNL-NEXT: movl $281, %edx ## imm = 0x119
-; KNL-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r14b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $5, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_42
-; KNL-NEXT: ## BB#41:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_42:
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $282, %edx ## imm = 0x11A
-; KNL-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r8b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %bl
-; KNL-NEXT: shrb $6, %bl
-; KNL-NEXT: andb $1, %bl
-; KNL-NEXT: je LBB22_44
-; KNL-NEXT: ## BB#43:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB22_44:
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %eax
-; KNL-NEXT: movl $283, %ecx ## imm = 0x11B
-; KNL-NEXT: vpinsrb $7, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r13b, %esi
-; KNL-NEXT: movzbl %bl, %edx
-; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %bl
-; KNL-NEXT: shrb $7, %bl
-; KNL-NEXT: je LBB22_46
-; KNL-NEXT: ## BB#45:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB22_46:
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: movl $284, %edx ## imm = 0x11C
-; KNL-NEXT: vpinsrb $8, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload
-; KNL-NEXT: movzbl %al, %esi
-; KNL-NEXT: movzbl %bl, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $8, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_48
-; KNL-NEXT: ## BB#47:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_48:
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $285, %edx ## imm = 0x11D
-; KNL-NEXT: vpinsrb $9, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT: movzbl %sil, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $9, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_50
-; KNL-NEXT: ## BB#49:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_50:
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $286, %edx ## imm = 0x11E
-; KNL-NEXT: vpinsrb $10, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT: movzbl %sil, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $10, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_52
-; KNL-NEXT: ## BB#51:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_52:
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %edx
-; KNL-NEXT: vpinsrb $11, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $11, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_54
-; KNL-NEXT: ## BB#53:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_54:
-; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2
-; KNL-NEXT: shrl $31, %edi
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $12, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_56
-; KNL-NEXT: ## BB#55:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_56:
-; KNL-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $13, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_58
-; KNL-NEXT: ## BB#57:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_58:
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm2
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $14, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_60
-; KNL-NEXT: ## BB#59:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_60:
-; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm2
-; KNL-NEXT: shrq $15, %r15
-; KNL-NEXT: andb $1, %r15b
-; KNL-NEXT: je LBB22_62
-; KNL-NEXT: ## BB#61:
-; KNL-NEXT: movb $-1, %r15b
-; KNL-NEXT: LBB22_62:
-; KNL-NEXT: movzbl %r15b, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; KNL-NEXT: vpsllw $7, %ymm2, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; KNL-NEXT: leaq -40(%rbp), %rsp
-; KNL-NEXT: popq %rbx
-; KNL-NEXT: popq %r12
-; KNL-NEXT: popq %r13
-; KNL-NEXT: popq %r14
-; KNL-NEXT: popq %r15
+; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
@@ -875,444 +557,47 @@ define <64 x i8> @test16(i64 %x) {
}
define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
+;
; KNL-LABEL: test17:
; KNL: ## BB#0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Ltmp8:
+; KNL-NEXT: Ltmp3:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Ltmp9:
+; KNL-NEXT: Ltmp4:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Ltmp10:
+; KNL-NEXT: Ltmp5:
; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: pushq %r15
-; KNL-NEXT: pushq %r14
-; KNL-NEXT: pushq %r13
-; KNL-NEXT: pushq %r12
-; KNL-NEXT: pushq %rbx
; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: Ltmp11:
-; KNL-NEXT: .cfi_offset %rbx, -56
-; KNL-NEXT: Ltmp12:
-; KNL-NEXT: .cfi_offset %r12, -48
-; KNL-NEXT: Ltmp13:
-; KNL-NEXT: .cfi_offset %r13, -40
-; KNL-NEXT: Ltmp14:
-; KNL-NEXT: .cfi_offset %r14, -32
-; KNL-NEXT: Ltmp15:
-; KNL-NEXT: .cfi_offset %r15, -24
-; KNL-NEXT: movq %rdi, %rax
-; KNL-NEXT: shrq $32, %rax
-; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
-; KNL-NEXT: movl %edi, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: vmovd %eax, %xmm0
-; KNL-NEXT: movl $257, %eax ## imm = 0x101
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $258, %eax ## imm = 0x102
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $259, %eax ## imm = 0x103
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $260, %eax ## imm = 0x104
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $261, %eax ## imm = 0x105
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $262, %eax ## imm = 0x106
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $263, %eax ## imm = 0x107
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $264, %eax ## imm = 0x108
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $265, %eax ## imm = 0x109
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $266, %eax ## imm = 0x10A
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $267, %eax ## imm = 0x10B
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $268, %eax ## imm = 0x10C
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $269, %eax ## imm = 0x10D
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $270, %eax ## imm = 0x10E
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $271, %eax ## imm = 0x10F
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: movl %edi, (%rsp)
+; KNL-NEXT: shrq $32, %rdi
+; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; KNL-NEXT: kmovw (%rsp), %k1
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %edx, %esi
; KNL-NEXT: setg %al
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm0
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r15d
-; KNL-NEXT: movq %r15, %rdx
-; KNL-NEXT: shrq $17, %rdx
-; KNL-NEXT: andb $1, %dl
-; KNL-NEXT: je LBB23_2
-; KNL-NEXT: ## BB#1:
-; KNL-NEXT: movb $-1, %dl
-; KNL-NEXT: LBB23_2:
-; KNL-NEXT: movq %r15, %r11
-; KNL-NEXT: shrq $16, %r11
-; KNL-NEXT: andb $1, %r11b
-; KNL-NEXT: je LBB23_4
-; KNL-NEXT: ## BB#3:
-; KNL-NEXT: movb $-1, %r11b
-; KNL-NEXT: LBB23_4:
-; KNL-NEXT: movq %r15, %r10
-; KNL-NEXT: shrq $18, %r10
-; KNL-NEXT: andb $1, %r10b
-; KNL-NEXT: je LBB23_6
-; KNL-NEXT: ## BB#5:
-; KNL-NEXT: movb $-1, %r10b
-; KNL-NEXT: LBB23_6:
-; KNL-NEXT: movq %r15, %r9
-; KNL-NEXT: shrq $19, %r9
-; KNL-NEXT: andb $1, %r9b
-; KNL-NEXT: je LBB23_8
-; KNL-NEXT: ## BB#7:
-; KNL-NEXT: movb $-1, %r9b
-; KNL-NEXT: LBB23_8:
-; KNL-NEXT: movq %r15, %rbx
-; KNL-NEXT: shrq $20, %rbx
-; KNL-NEXT: andb $1, %bl
-; KNL-NEXT: je LBB23_10
-; KNL-NEXT: ## BB#9:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB23_10:
-; KNL-NEXT: movq %r15, %r12
-; KNL-NEXT: shrq $21, %r12
-; KNL-NEXT: andb $1, %r12b
-; KNL-NEXT: je LBB23_12
-; KNL-NEXT: ## BB#11:
-; KNL-NEXT: movb $-1, %r12b
-; KNL-NEXT: LBB23_12:
-; KNL-NEXT: movq %r15, %r14
-; KNL-NEXT: shrq $22, %r14
-; KNL-NEXT: andb $1, %r14b
-; KNL-NEXT: je LBB23_14
-; KNL-NEXT: ## BB#13:
-; KNL-NEXT: movb $-1, %r14b
-; KNL-NEXT: LBB23_14:
-; KNL-NEXT: movq %r15, %r8
-; KNL-NEXT: shrq $23, %r8
-; KNL-NEXT: andb $1, %r8b
-; KNL-NEXT: je LBB23_16
-; KNL-NEXT: ## BB#15:
-; KNL-NEXT: movb $-1, %r8b
-; KNL-NEXT: LBB23_16:
-; KNL-NEXT: movq %r15, %r13
-; KNL-NEXT: shrq $24, %r13
-; KNL-NEXT: andb $1, %r13b
-; KNL-NEXT: je LBB23_18
-; KNL-NEXT: ## BB#17:
-; KNL-NEXT: movb $-1, %r13b
-; KNL-NEXT: LBB23_18:
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $25, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_20
-; KNL-NEXT: ## BB#19:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_20:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $26, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_22
-; KNL-NEXT: ## BB#21:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_22:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movl $272, %esi ## imm = 0x110
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $27, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_24
-; KNL-NEXT: ## BB#23:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_24:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movl $273, %eax ## imm = 0x111
-; KNL-NEXT: bextrl %esi, %edi, %esi
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $28, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB23_26
-; KNL-NEXT: ## BB#25:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB23_26:
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vmovd %esi, %xmm2
-; KNL-NEXT: movl $274, %esi ## imm = 0x112
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $29, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB23_28
-; KNL-NEXT: ## BB#27:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB23_28:
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT: bextrl %esi, %edi, %eax
-; KNL-NEXT: movzbl %r11b, %esi
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $30, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB23_30
-; KNL-NEXT: ## BB#29:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB23_30:
-; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT: movl $275, %eax ## imm = 0x113
-; KNL-NEXT: bextrl %eax, %edi, %r11d
-; KNL-NEXT: movzbl %dl, %edx
-; KNL-NEXT: vmovd %esi, %xmm3
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $31, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_32
-; KNL-NEXT: ## BB#31:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_32:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT: movl $276, %eax ## imm = 0x114
-; KNL-NEXT: bextrl %eax, %edi, %esi
-; KNL-NEXT: movl $277, %r11d ## imm = 0x115
-; KNL-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r10b, %r10d
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_34
-; KNL-NEXT: ## BB#33:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_34:
-; KNL-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r11d, %edi, %edx
-; KNL-NEXT: movl $278, %r11d ## imm = 0x116
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r9b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shlq $63, %rcx
-; KNL-NEXT: sarq $63, %rcx
-; KNL-NEXT: vmovd %ecx, %xmm4
-; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $2, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_36
-; KNL-NEXT: ## BB#35:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_36:
-; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r11d, %edi, %edx
-; KNL-NEXT: movl $279, %r9d ## imm = 0x117
-; KNL-NEXT: vpinsrb $3, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %bl, %ebx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $3, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_38
-; KNL-NEXT: ## BB#37:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_38:
-; KNL-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r9d, %edi, %edx
-; KNL-NEXT: movl $280, %esi ## imm = 0x118
-; KNL-NEXT: vpinsrb $4, %ebx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r12b, %ebx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $4, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_40
-; KNL-NEXT: ## BB#39:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_40:
-; KNL-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %esi, %edi, %ecx
-; KNL-NEXT: movl $281, %edx ## imm = 0x119
-; KNL-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r14b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $5, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_42
-; KNL-NEXT: ## BB#41:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_42:
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $282, %edx ## imm = 0x11A
-; KNL-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r8b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %bl
-; KNL-NEXT: shrb $6, %bl
-; KNL-NEXT: andb $1, %bl
-; KNL-NEXT: je LBB23_44
-; KNL-NEXT: ## BB#43:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB23_44:
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %eax
-; KNL-NEXT: movl $283, %ecx ## imm = 0x11B
-; KNL-NEXT: vpinsrb $7, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r13b, %esi
-; KNL-NEXT: movzbl %bl, %edx
-; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %bl
-; KNL-NEXT: shrb $7, %bl
-; KNL-NEXT: je LBB23_46
-; KNL-NEXT: ## BB#45:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB23_46:
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: movl $284, %edx ## imm = 0x11C
-; KNL-NEXT: vpinsrb $8, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload
-; KNL-NEXT: movzbl %al, %esi
-; KNL-NEXT: movzbl %bl, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $8, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_48
-; KNL-NEXT: ## BB#47:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_48:
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $285, %edx ## imm = 0x11D
-; KNL-NEXT: vpinsrb $9, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT: movzbl %sil, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $9, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_50
-; KNL-NEXT: ## BB#49:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_50:
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $286, %edx ## imm = 0x11E
-; KNL-NEXT: vpinsrb $10, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT: movzbl %sil, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $10, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_52
-; KNL-NEXT: ## BB#51:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_52:
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %edx
-; KNL-NEXT: vpinsrb $11, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $11, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_54
-; KNL-NEXT: ## BB#53:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_54:
-; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2
-; KNL-NEXT: shrl $31, %edi
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $12, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_56
-; KNL-NEXT: ## BB#55:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_56:
-; KNL-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $13, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_58
-; KNL-NEXT: ## BB#57:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_58:
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm2
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $14, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_60
-; KNL-NEXT: ## BB#59:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_60:
-; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm2
-; KNL-NEXT: shrq $15, %r15
-; KNL-NEXT: andb $1, %r15b
-; KNL-NEXT: je LBB23_62
-; KNL-NEXT: ## BB#61:
-; KNL-NEXT: movb $-1, %r15b
-; KNL-NEXT: LBB23_62:
-; KNL-NEXT: movzbl %r15b, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; KNL-NEXT: leaq -40(%rbp), %rsp
-; KNL-NEXT: popq %rbx
-; KNL-NEXT: popq %r12
-; KNL-NEXT: popq %r13
-; KNL-NEXT: popq %r14
-; KNL-NEXT: popq %r15
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
@@ -1321,7 +606,6 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; SKX-NEXT: kmovq %rdi, %k0
; SKX-NEXT: cmpl %edx, %esi
; SKX-NEXT: setg %al
-; SKX-NEXT: andl $1, %eax
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: kshiftlq $5, %k1, %k1
; SKX-NEXT: korq %k1, %k0, %k0
@@ -1337,8 +621,7 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-LABEL: test18:
; KNL: ## BB#0:
-; KNL-NEXT: movzbl %dil, %eax
-; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kshiftlw $7, %k1, %k2
; KNL-NEXT: kshiftrw $15, %k2, %k2
@@ -1348,7 +631,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kshiftlw $7, %k2, %k1
; KNL-NEXT: korw %k1, %k0, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -1392,9 +676,7 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1
-; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
@@ -1403,24 +685,17 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
; KNL-LABEL: test22:
; KNL: ## BB#0:
-; KNL-NEXT: vpextrd $3, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movb %al, (%rdi)
-; KNL-NEXT: vpextrd $2, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movb %al, (%rdi)
-; KNL-NEXT: vpextrd $1, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movb %al, (%rdi)
-; KNL-NEXT: vmovd %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vpslld $31, %ymm0, %ymm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: test22:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
store <4 x i1> %a, <4 x i1>* %addr
@@ -1430,20 +705,1243 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
; KNL-LABEL: test23:
; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movb %al, (%rdi)
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: test23:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k0
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
store <2 x i1> %a, <2 x i1>* %addr
ret void
}
+
+define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
+; KNL-LABEL: store_v1i1:
+; KNL: ## BB#0:
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: kxnorw %k0, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rsi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v1i1:
+; SKX: ## BB#0:
+; SKX-NEXT: andl $1, %edi
+; SKX-NEXT: kmovw %edi, %k0
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rsi)
+; SKX-NEXT: retq
+ %x = xor <1 x i1> %c, <i1 1>
+ store <1 x i1> %x, <1 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
+; KNL-LABEL: store_v2i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v2i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0
+; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ %x = xor <2 x i1> %c, <i1 1, i1 1>
+ store <2 x i1> %x, <2 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
+; KNL-LABEL: store_v4i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpslld $31, %ymm0, %ymm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v4i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
+ store <4 x i1> %x, <4 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
+; KNL-LABEL: store_v8i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v8i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: knotb %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ %x = xor <8 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+ store <8 x i1> %x, <8 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
+; KNL-LABEL: store_v16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k0
+; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kmovw %k0, (%rdi)
+; SKX-NEXT: retq
+ %x = xor <16 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+ store <16 x i1> %x, <16 x i1>* %ptr, align 4
+ ret void
+}
+
+;void f2(int);
+;void f1(int c)
+;{
+; static int v = 0;
+; if (v == 0)
+; v = 1;
+; else
+; v = 0;
+; f2(v);
+;}
+
+@f1.v = internal unnamed_addr global i1 false, align 4
+
+define void @f1(i32 %c) {
+; KNL-LABEL: f1:
+; KNL: ## BB#0: ## %entry
+; KNL-NEXT: movzbl {{.*}}(%rip), %edi
+; KNL-NEXT: movl %edi, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kxnorw %k0, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, {{.*}}(%rip)
+; KNL-NEXT: xorl $1, %edi
+; KNL-NEXT: jmp _f2 ## TAILCALL
+;
+; SKX-LABEL: f1:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: movzbl {{.*}}(%rip), %edi
+; SKX-NEXT: movl %edi, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: kmovw %eax, %k0
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: kmovb %k0, {{.*}}(%rip)
+; SKX-NEXT: xorl $1, %edi
+; SKX-NEXT: jmp _f2 ## TAILCALL
+entry:
+ %.b1 = load i1, i1* @f1.v, align 4
+ %not..b1 = xor i1 %.b1, true
+ store i1 %not..b1, i1* @f1.v, align 4
+ %0 = zext i1 %not..b1 to i32
+ tail call void @f2(i32 %0) #2
+ ret void
+}
+
+declare void @f2(i32) #1
+
+define void @store_i16_i1(i16 %x, i1 *%y) {
+; CHECK-LABEL: store_i16_i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: movb %dil, (%rsi)
+; CHECK-NEXT: retq
+ %c = trunc i16 %x to i1
+ store i1 %c, i1* %y
+ ret void
+}
+
+define void @store_i8_i1(i8 %x, i1 *%y) {
+; CHECK-LABEL: store_i8_i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: movb %dil, (%rsi)
+; CHECK-NEXT: retq
+ %c = trunc i8 %x to i1
+ store i1 %c, i1* %y
+ ret void
+}
+
+define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
+; KNL-LABEL: test_build_vec_v32i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
+; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
+; KNL-NEXT: vpand %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
+; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
+; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_build_vec_v32i1:
+; SKX: ## BB#0:
+; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
+ ret <32 x i16> %ret
+}
+
+define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
+; KNL-LABEL: test_build_vec_v64i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_build_vec_v64i1:
+; SKX: ## BB#0:
+; SKX-NEXT: movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544
+; SKX-NEXT: kmovq %rax, %k1
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %ret = select <64 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <64 x i8> %x, <64 x i8> zeroinitializer
+ ret <64 x i8> %ret
+}
+
+define void @ktest_1(<8 x double> %in, double * %base) {
+; KNL-LABEL: ktest_1:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovupd (%rdi), %zmm1
+; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; KNL-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
+; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: je LBB41_2
+; KNL-NEXT: ## BB#1: ## %L1
+; KNL-NEXT: vmovapd %zmm0, (%rdi)
+; KNL-NEXT: retq
+; KNL-NEXT: LBB41_2: ## %L2
+; KNL-NEXT: vmovapd %zmm0, 8(%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ktest_1:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovupd (%rdi), %zmm1
+; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
+; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; SKX-NEXT: ktestb %k0, %k0
+; SKX-NEXT: je LBB41_2
+; SKX-NEXT: ## BB#1: ## %L1
+; SKX-NEXT: vmovapd %zmm0, (%rdi)
+; SKX-NEXT: retq
+; SKX-NEXT: LBB41_2: ## %L2
+; SKX-NEXT: vmovapd %zmm0, 8(%rdi)
+; SKX-NEXT: retq
+ %addr1 = getelementptr double, double * %base, i64 0
+ %addr2 = getelementptr double, double * %base, i64 1
+
+ %vaddr1 = bitcast double* %addr1 to <8 x double>*
+ %vaddr2 = bitcast double* %addr2 to <8 x double>*
+
+ %val1 = load <8 x double>, <8 x double> *%vaddr1, align 1
+ %val2 = load <8 x double>, <8 x double> *%vaddr2, align 1
+
+ %sel1 = fcmp ogt <8 x double>%in, %val1
+ %val3 = select <8 x i1> %sel1, <8 x double> %val2, <8 x double> zeroinitializer
+ %sel2 = fcmp olt <8 x double> %in, %val3
+ %sel3 = and <8 x i1> %sel1, %sel2
+
+ %int_sel3 = bitcast <8 x i1> %sel3 to i8
+ %res = icmp eq i8 %int_sel3, zeroinitializer
+ br i1 %res, label %L2, label %L1
+L1:
+ store <8 x double> %in, <8 x double>* %vaddr1
+ br label %End
+L2:
+ store <8 x double> %in, <8 x double>* %vaddr2
+ br label %End
+End:
+ ret void
+}
+
+define void @ktest_2(<32 x float> %in, float * %base) {
+;
+; KNL-LABEL: ktest_2:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Ltmp6:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Ltmp7:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Ltmp8:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $32, %rsp
+; KNL-NEXT: vmovups (%rdi), %zmm2
+; KNL-NEXT: vmovups 64(%rdi), %zmm3
+; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $0, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT: kshiftlw $14, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm2
+; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $13, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $12, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $11, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $10, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $9, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $8, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $7, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $6, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $5, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $4, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $2, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $1, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $0, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL-NEXT: vpsllw $7, %ymm2, %ymm2
+; KNL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; KNL-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z}
+; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z}
+; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm4
+; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vpslld $31, %zmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: cmpl $0, (%rsp)
+; KNL-NEXT: je LBB42_2
+; KNL-NEXT: ## BB#1: ## %L1
+; KNL-NEXT: vmovaps %zmm0, (%rdi)
+; KNL-NEXT: vmovaps %zmm1, 64(%rdi)
+; KNL-NEXT: jmp LBB42_3
+; KNL-NEXT: LBB42_2: ## %L2
+; KNL-NEXT: vmovaps %zmm0, 4(%rdi)
+; KNL-NEXT: vmovaps %zmm1, 68(%rdi)
+; KNL-NEXT: LBB42_3: ## %End
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ktest_2:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovups (%rdi), %zmm2
+; SKX-NEXT: vmovups 64(%rdi), %zmm3
+; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1
+; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2
+; SKX-NEXT: kunpckwd %k1, %k2, %k0
+; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z}
+; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z}
+; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1
+; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2
+; SKX-NEXT: kunpckwd %k1, %k2, %k1
+; SKX-NEXT: kord %k1, %k0, %k0
+; SKX-NEXT: ktestd %k0, %k0
+; SKX-NEXT: je LBB42_2
+; SKX-NEXT: ## BB#1: ## %L1
+; SKX-NEXT: vmovaps %zmm0, (%rdi)
+; SKX-NEXT: vmovaps %zmm1, 64(%rdi)
+; SKX-NEXT: retq
+; SKX-NEXT: LBB42_2: ## %L2
+; SKX-NEXT: vmovaps %zmm0, 4(%rdi)
+; SKX-NEXT: vmovaps %zmm1, 68(%rdi)
+; SKX-NEXT: retq
+ %addr1 = getelementptr float, float * %base, i64 0
+ %addr2 = getelementptr float, float * %base, i64 1
+
+ %vaddr1 = bitcast float* %addr1 to <32 x float>*
+ %vaddr2 = bitcast float* %addr2 to <32 x float>*
+
+ %val1 = load <32 x float>, <32 x float> *%vaddr1, align 1
+ %val2 = load <32 x float>, <32 x float> *%vaddr2, align 1
+
+ %sel1 = fcmp ogt <32 x float>%in, %val1
+ %val3 = select <32 x i1> %sel1, <32 x float> %val2, <32 x float> zeroinitializer
+ %sel2 = fcmp olt <32 x float> %in, %val3
+ %sel3 = or <32 x i1> %sel1, %sel2
+
+ %int_sel3 = bitcast <32 x i1> %sel3 to i32
+ %res = icmp eq i32 %int_sel3, zeroinitializer
+ br i1 %res, label %L2, label %L1
+L1:
+ store <32 x float> %in, <32 x float>* %vaddr1
+ br label %End
+L2:
+ store <32 x float> %in, <32 x float>* %vaddr2
+ br label %End
+End:
+ ret void
+}
+
+define <8 x i64> @load_8i1(<8 x i1>* %a) {
+; KNL-LABEL: load_8i1:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_8i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: vpmovm2q %k0, %zmm0
+; SKX-NEXT: retq
+ %b = load <8 x i1>, <8 x i1>* %a
+ %c = sext <8 x i1> %b to <8 x i64>
+ ret <8 x i64> %c
+}
+
+define <16 x i32> @load_16i1(<16 x i1>* %a) {
+; KNL-LABEL: load_16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw (%rdi), %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovw (%rdi), %k0
+; SKX-NEXT: vpmovm2d %k0, %zmm0
+; SKX-NEXT: retq
+ %b = load <16 x i1>, <16 x i1>* %a
+ %c = sext <16 x i1> %b to <16 x i32>
+ ret <16 x i32> %c
+}
+
+define <2 x i16> @load_2i1(<2 x i1>* %a) {
+; KNL-LABEL: load_2i1:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_2i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: vpmovm2q %k0, %xmm0
+; SKX-NEXT: retq
+ %b = load <2 x i1>, <2 x i1>* %a
+ %c = sext <2 x i1> %b to <2 x i16>
+ ret <2 x i16> %c
+}
+
+define <4 x i16> @load_4i1(<4 x i1>* %a) {
+; KNL-LABEL: load_4i1:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_4i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: retq
+ %b = load <4 x i1>, <4 x i1>* %a
+ %c = sext <4 x i1> %b to <4 x i16>
+ ret <4 x i16> %c
+}
+
+define <32 x i16> @load_32i1(<32 x i1>* %a) {
+; KNL-LABEL: load_32i1:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw (%rdi), %k1
+; KNL-NEXT: kmovw 2(%rdi), %k2
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpmovdw %zmm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_32i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovd (%rdi), %k0
+; SKX-NEXT: vpmovm2w %k0, %zmm0
+; SKX-NEXT: retq
+ %b = load <32 x i1>, <32 x i1>* %a
+ %c = sext <32 x i1> %b to <32 x i16>
+ ret <32 x i16> %c
+}
+
+define <64 x i8> @load_64i1(<64 x i1>* %a) {
+; KNL-LABEL: load_64i1:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw (%rdi), %k1
+; KNL-NEXT: kmovw 2(%rdi), %k2
+; KNL-NEXT: kmovw 4(%rdi), %k3
+; KNL-NEXT: kmovw 6(%rdi), %k4
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k4} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_64i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovq (%rdi), %k0
+; SKX-NEXT: vpmovm2b %k0, %zmm0
+; SKX-NEXT: retq
+ %b = load <64 x i1>, <64 x i1>* %a
+ %c = sext <64 x i1> %b to <64 x i8>
+ ret <64 x i8> %c
+}
+
+define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
+; KNL-LABEL: store_8i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_8i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ store <8 x i1> %v, <8 x i1>* %a
+ ret void
+}
+
+define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
+; KNL-LABEL: store_8i1_1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_8i1_1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ %v1 = trunc <8 x i16> %v to <8 x i1>
+ store <8 x i1> %v1, <8 x i1>* %a
+ ret void
+}
+
+define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
+; KNL-LABEL: store_16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k0
+; SKX-NEXT: kmovw %k0, (%rdi)
+; SKX-NEXT: retq
+ store <16 x i1> %v, <16 x i1>* %a
+ ret void
+}
+
+define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
+; KNL-LABEL: store_32i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, 2(%rdi)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_32i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k0
+; SKX-NEXT: kmovd %k0, (%rdi)
+; SKX-NEXT: retq
+ store <32 x i1> %v, <32 x i1>* %a
+ ret void
+}
+
+define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
+; KNL-LABEL: store_32i1_1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, 2(%rdi)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_32i1_1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %zmm0, %zmm0
+; SKX-NEXT: vpmovw2m %zmm0, %k0
+; SKX-NEXT: kmovd %k0, (%rdi)
+; SKX-NEXT: retq
+ %v1 = trunc <32 x i16> %v to <32 x i1>
+ store <32 x i1> %v1, <32 x i1>* %a
+ ret void
+}
+
+
+define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
+;
+; KNL-LABEL: store_64i1:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Ltmp9:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: pushq %r15
+; KNL-NEXT: Ltmp10:
+; KNL-NEXT: .cfi_def_cfa_offset 24
+; KNL-NEXT: pushq %r14
+; KNL-NEXT: Ltmp11:
+; KNL-NEXT: .cfi_def_cfa_offset 32
+; KNL-NEXT: pushq %r13
+; KNL-NEXT: Ltmp12:
+; KNL-NEXT: .cfi_def_cfa_offset 40
+; KNL-NEXT: pushq %r12
+; KNL-NEXT: Ltmp13:
+; KNL-NEXT: .cfi_def_cfa_offset 48
+; KNL-NEXT: pushq %rbx
+; KNL-NEXT: Ltmp14:
+; KNL-NEXT: .cfi_def_cfa_offset 56
+; KNL-NEXT: Ltmp15:
+; KNL-NEXT: .cfi_offset %rbx, -56
+; KNL-NEXT: Ltmp16:
+; KNL-NEXT: .cfi_offset %r12, -48
+; KNL-NEXT: Ltmp17:
+; KNL-NEXT: .cfi_offset %r13, -40
+; KNL-NEXT: Ltmp18:
+; KNL-NEXT: .cfi_offset %r14, -32
+; KNL-NEXT: Ltmp19:
+; KNL-NEXT: .cfi_offset %r15, -24
+; KNL-NEXT: Ltmp20:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vpslld $31, %zmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vmovd %r9d, %xmm3
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2
+; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, 6(%rdi)
+; KNL-NEXT: kshiftlw $14, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $15, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $13, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kshiftlw $12, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $11, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: kshiftlw $10, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $9, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $8, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $7, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kshiftlw $6, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $5, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $4, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $2, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $1, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %r10d, %xmm2
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: kshiftlw $0, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1
+; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, 4(%rdi)
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %r10d, %xmm1
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $0, %k1, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0
+; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: kmovw %k1, 2(%rdi)
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vmovd %r9d, %xmm0
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: popq %rbx
+; KNL-NEXT: popq %r12
+; KNL-NEXT: popq %r13
+; KNL-NEXT: popq %r14
+; KNL-NEXT: popq %r15
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_64i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: kmovq %k0, (%rdi)
+; SKX-NEXT: retq
+ store <64 x i1> %v, <64 x i1>* %a
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
new file mode 100644
index 000000000000..68d283f0e33f
--- /dev/null
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+
+declare void @f()
+define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_4i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Ltmp0:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2d %k0, %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+
+ %cmp_res = icmp ugt <4 x i32> %a, %b
+ %cmp_res2 = icmp sgt <4 x i32> %a, %b
+ call void @f()
+ %res = or <4 x i1> %cmp_res, %cmp_res2
+ ret <4 x i1> %res
+}
+
+define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_8i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Ltmp1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korb %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2w %k0, %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+
+ %cmp_res = icmp ugt <8 x i32> %a, %b
+ %cmp_res2 = icmp sgt <8 x i32> %a, %b
+ call void @f()
+ %res = or <8 x i1> %cmp_res, %cmp_res2
+ ret <8 x i1> %res
+}
+
+define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_16i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Ltmp2:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2b %k0, %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res2 = icmp sgt <16 x i32> %a, %b
+ call void @f()
+ %res = or <16 x i1> %cmp_res, %cmp_res2
+ ret <16 x i1> %res
+}
+
+define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_32i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Ltmp3:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
+; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
+; CHECK-NEXT: kord %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2b %k0, %ymm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %cmp_res = icmp ugt <32 x i16> %a, %b
+ %cmp_res2 = icmp sgt <32 x i16> %a, %b
+ call void @f()
+ %res = or <32 x i1> %cmp_res, %cmp_res2
+ ret <32 x i1> %res
+}
+
+define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: test_64i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: Ltmp4:
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
+; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
+; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2b %k0, %zmm0
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: retq
+
+ %cmp_res = icmp ugt <64 x i8> %a, %b
+ %cmp_res2 = icmp sgt <64 x i8> %a, %b
+ call void @f()
+ %res = or <64 x i1> %cmp_res, %cmp_res2
+ ret <64 x i1> %res
+}
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 0cd8458f73f5..6b07e9e704db 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -1,279 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
-; CHECK-LABEL: @test1
-; CHECK: vmovd %xmm0, %eax ## encoding: [0x62
-; CHECK: ret
define i32 @test1(float %x) {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %xmm0, %eax ## encoding: [0x62,0xf1,0x7d,0x08,0x7e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = bitcast float %x to i32
ret i32 %res
}
-; CHECK-LABEL: @test2
-; CHECK: vmovd %edi, %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test2(i32 %x) {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <4 x i32>undef, i32 %x, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: @test3
-; CHECK: vmovq %rdi, %xmm0 ## encoding: [0x62
-; CHECK: ret
define <2 x i64> @test3(i64 %x) {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %rdi, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6e,0xc7]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <2 x i64>undef, i64 %x, i32 0
ret <2 x i64>%res
}
-; CHECK-LABEL: @test4
-; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test4(i32* %x) {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load i32, i32* %x
%res = insertelement <4 x i32>undef, i32 %y, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: @test5
-; CHECK: vmovss %xmm0, (%rdi) ## encoding: [0x62
-; CHECK: ret
define void @test5(float %x, float* %y) {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovss %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7e,0x08,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
store float %x, float* %y, align 4
ret void
}
-; CHECK-LABEL: @test6
-; CHECK: vmovsd %xmm0, (%rdi) ## encoding: [0x62
-; CHECK: ret
define void @test6(double %x, double* %y) {
+; CHECK-LABEL: test6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsd %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xff,0x08,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
store double %x, double* %y, align 8
ret void
}
-; CHECK-LABEL: @test7
-; CHECK: vmovss (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define float @test7(i32* %x) {
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovss (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x10,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load i32, i32* %x
%res = bitcast i32 %y to float
ret float %res
}
-; CHECK-LABEL: @test8
-; CHECK: vmovd %xmm0, %eax ## encoding: [0x62
-; CHECK: ret
define i32 @test8(<4 x i32> %x) {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %xmm0, %eax ## encoding: [0x62,0xf1,0x7d,0x08,0x7e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = extractelement <4 x i32> %x, i32 0
ret i32 %res
}
-; CHECK-LABEL: @test9
-; CHECK: vmovq %xmm0, %rax ## encoding: [0x62
-; CHECK: ret
define i64 @test9(<2 x i64> %x) {
+; CHECK-LABEL: test9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %xmm0, %rax ## encoding: [0x62,0xf1,0xfd,0x08,0x7e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = extractelement <2 x i64> %x, i32 0
ret i64 %res
}
-; CHECK-LABEL: @test10
-; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test10(i32* %x) {
+; CHECK-LABEL: test10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load i32, i32* %x, align 4
%res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: @test11
-; CHECK: vmovss (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x float> @test11(float* %x) {
+; CHECK-LABEL: test11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovss (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x10,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load float, float* %x, align 4
%res = insertelement <4 x float>zeroinitializer, float %y, i32 0
ret <4 x float>%res
}
-; CHECK-LABEL: @test12
-; CHECK: vmovsd (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <2 x double> @test12(double* %x) {
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x10,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load double, double* %x, align 8
%res = insertelement <2 x double>zeroinitializer, double %y, i32 0
ret <2 x double>%res
}
-; CHECK-LABEL: @test13
-; CHECK: vmovq %rdi, %xmm0 ## encoding: [0x62
-; CHECK: ret
define <2 x i64> @test13(i64 %x) {
+; CHECK-LABEL: test13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %rdi, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6e,0xc7]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
ret <2 x i64>%res
}
-; CHECK-LABEL: @test14
-; CHECK: vmovd %edi, %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test14(i32 %x) {
+; CHECK-LABEL: test14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: @test15
-; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test15(i32* %x) {
+; CHECK-LABEL: test15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load i32, i32* %x, align 4
%res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: test16
-; CHECK: vmovdqu32
-; CHECK: ret
define <16 x i32> @test16(i8 * %addr) {
+; CHECK-LABEL: test16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7e,0x48,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
%res = load <16 x i32>, <16 x i32>* %vaddr, align 1
ret <16 x i32>%res
}
-; CHECK-LABEL: test17
-; CHECK: vmovdqa32
-; CHECK: ret
define <16 x i32> @test17(i8 * %addr) {
+; CHECK-LABEL: test17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
%res = load <16 x i32>, <16 x i32>* %vaddr, align 64
ret <16 x i32>%res
}
-; CHECK-LABEL: test18
-; CHECK: vmovdqa64
-; CHECK: ret
define void @test18(i8 * %addr, <8 x i64> %data) {
+; CHECK-LABEL: test18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
store <8 x i64>%data, <8 x i64>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: test19
-; CHECK: vmovdqu32
-; CHECK: ret
define void @test19(i8 * %addr, <16 x i32> %data) {
+; CHECK-LABEL: test19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7e,0x48,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
store <16 x i32>%data, <16 x i32>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test20
-; CHECK: vmovdqa32
-; CHECK: ret
define void @test20(i8 * %addr, <16 x i32> %data) {
+; CHECK-LABEL: test20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x48,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
store <16 x i32>%data, <16 x i32>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: test21
-; CHECK: vmovdqa64
-; CHECK: ret
define <8 x i64> @test21(i8 * %addr) {
+; CHECK-LABEL: test21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
%res = load <8 x i64>, <8 x i64>* %vaddr, align 64
ret <8 x i64>%res
}
-; CHECK-LABEL: test22
-; CHECK: vmovdqu64
-; CHECK: ret
define void @test22(i8 * %addr, <8 x i64> %data) {
+; CHECK-LABEL: test22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
store <8 x i64>%data, <8 x i64>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test23
-; CHECK: vmovdqu64
-; CHECK: ret
define <8 x i64> @test23(i8 * %addr) {
+; CHECK-LABEL: test23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
%res = load <8 x i64>, <8 x i64>* %vaddr, align 1
ret <8 x i64>%res
}
-; CHECK-LABEL: test24
-; CHECK: vmovapd
-; CHECK: ret
define void @test24(i8 * %addr, <8 x double> %data) {
+; CHECK-LABEL: test24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
store <8 x double>%data, <8 x double>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: test25
-; CHECK: vmovapd
-; CHECK: ret
define <8 x double> @test25(i8 * %addr) {
+; CHECK-LABEL: test25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
%res = load <8 x double>, <8 x double>* %vaddr, align 64
ret <8 x double>%res
}
-; CHECK-LABEL: test26
-; CHECK: vmovaps
-; CHECK: ret
define void @test26(i8 * %addr, <16 x float> %data) {
+; CHECK-LABEL: test26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
store <16 x float>%data, <16 x float>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: test27
-; CHECK: vmovaps
-; CHECK: ret
define <16 x float> @test27(i8 * %addr) {
+; CHECK-LABEL: test27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
%res = load <16 x float>, <16 x float>* %vaddr, align 64
ret <16 x float>%res
}
-; CHECK-LABEL: test28
-; CHECK: vmovupd
-; CHECK: ret
define void @test28(i8 * %addr, <8 x double> %data) {
+; CHECK-LABEL: test28:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
store <8 x double>%data, <8 x double>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test29
-; CHECK: vmovupd
-; CHECK: ret
define <8 x double> @test29(i8 * %addr) {
+; CHECK-LABEL: test29:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
%res = load <8 x double>, <8 x double>* %vaddr, align 1
ret <8 x double>%res
}
-; CHECK-LABEL: test30
-; CHECK: vmovups
-; CHECK: ret
define void @test30(i8 * %addr, <16 x float> %data) {
+; CHECK-LABEL: test30:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
store <16 x float>%data, <16 x float>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test31
-; CHECK: vmovups
-; CHECK: ret
define <16 x float> @test31(i8 * %addr) {
+; CHECK-LABEL: test31:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
%res = load <16 x float>, <16 x float>* %vaddr, align 1
ret <16 x float>%res
}
-; CHECK-LABEL: test32
-; CHECK: vmovdqa32{{.*{%k[1-7]} }}
-; CHECK: ret
define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; CHECK-LABEL: test32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
%r = load <16 x i32>, <16 x i32>* %vaddr, align 64
@@ -281,10 +322,13 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test33
-; CHECK: vmovdqu32{{.*{%k[1-7]} }}
-; CHECK: ret
define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; CHECK-LABEL: test33:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
%r = load <16 x i32>, <16 x i32>* %vaddr, align 1
@@ -292,10 +336,13 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test34
-; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
+; CHECK-LABEL: test34:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
%r = load <16 x i32>, <16 x i32>* %vaddr, align 64
@@ -303,10 +350,13 @@ define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test35
-; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
+; CHECK-LABEL: test35:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
%r = load <16 x i32>, <16 x i32>* %vaddr, align 1
@@ -314,10 +364,13 @@ define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test36
-; CHECK: vmovdqa64{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+; CHECK-LABEL: test36:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
%r = load <8 x i64>, <8 x i64>* %vaddr, align 64
@@ -325,10 +378,13 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
ret <8 x i64>%res
}
-; CHECK-LABEL: test37
-; CHECK: vmovdqu64{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+; CHECK-LABEL: test37:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
%r = load <8 x i64>, <8 x i64>* %vaddr, align 1
@@ -336,10 +392,13 @@ define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
ret <8 x i64>%res
}
-; CHECK-LABEL: test38
-; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
+; CHECK-LABEL: test38:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
%r = load <8 x i64>, <8 x i64>* %vaddr, align 64
@@ -347,10 +406,13 @@ define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
ret <8 x i64>%res
}
-; CHECK-LABEL: test39
-; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
+; CHECK-LABEL: test39:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
%r = load <8 x i64>, <8 x i64>* %vaddr, align 1
@@ -358,10 +420,14 @@ define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
ret <8 x i64>%res
}
-; CHECK-LABEL: test40
-; CHECK: vmovaps{{.*{%k[1-7]} }}
-; CHECK: ret
define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+; CHECK-LABEL: test40:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
%r = load <16 x float>, <16 x float>* %vaddr, align 64
@@ -369,10 +435,14 @@ define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1)
ret <16 x float>%res
}
-; CHECK-LABEL: test41
-; CHECK: vmovups{{.*{%k[1-7]} }}
-; CHECK: ret
define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+; CHECK-LABEL: test41:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
%r = load <16 x float>, <16 x float>* %vaddr, align 1
@@ -380,10 +450,14 @@ define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1)
ret <16 x float>%res
}
-; CHECK-LABEL: test42
-; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
+; CHECK-LABEL: test42:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vcmpordps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqps %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
%r = load <16 x float>, <16 x float>* %vaddr, align 64
@@ -391,10 +465,14 @@ define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
ret <16 x float>%res
}
-; CHECK-LABEL: test43
-; CHECK: vmovups{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
+; CHECK-LABEL: test43:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vcmpordps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqps %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
%r = load <16 x float>, <16 x float>* %vaddr, align 1
@@ -402,10 +480,14 @@ define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
ret <16 x float>%res
}
-; CHECK-LABEL: test44
-; CHECK: vmovapd{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+; CHECK-LABEL: test44:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
%r = load <8 x double>, <8 x double>* %vaddr, align 64
@@ -413,10 +495,14 @@ define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1)
ret <8 x double>%res
}
-; CHECK-LABEL: test45
-; CHECK: vmovupd{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+; CHECK-LABEL: test45:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
%r = load <8 x double>, <8 x double>* %vaddr, align 1
@@ -424,10 +510,14 @@ define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1)
ret <8 x double>%res
}
-; CHECK-LABEL: test46
-; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
+; CHECK-LABEL: test46:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vcmpordpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
%r = load <8 x double>, <8 x double>* %vaddr, align 64
@@ -435,10 +525,14 @@ define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
ret <8 x double>%res
}
-; CHECK-LABEL: test47
-; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) {
+; CHECK-LABEL: test47:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vcmpordpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
%r = load <8 x double>, <8 x double>* %vaddr, align 1
diff --git a/test/CodeGen/X86/avx512-nontemporal.ll b/test/CodeGen/X86/avx512-nontemporal.ll
index bf57d021acab..adfaef25b7d3 100644
--- a/test/CodeGen/X86/avx512-nontemporal.ll
+++ b/test/CodeGen/X86/avx512-nontemporal.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx512f | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s
-define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, i32 %D, <8 x i64> %E, <8 x i64> %EE) {
+define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, <8 x i64> %E, <8 x i64> %EE, <16 x i32> %F, <16 x i32> %FF, <32 x i16> %G, <32 x i16> %GG, <64 x i8> %H, <64 x i8> %HH) {
; CHECK: vmovntps %z
%cast = bitcast i8* %B to <16 x float>*
%A2 = fadd <16 x float> %A, %AA
@@ -13,6 +13,18 @@ define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x
%cast2 = bitcast i8* %B to <8 x double>*
%C2 = fadd <8 x double> %C, %CC
store <8 x double> %C2, <8 x double>* %cast2, align 64, !nontemporal !0
+; CHECK: vmovntdq %z
+ %cast3 = bitcast i8* %B to <16 x i32>*
+ %F2 = add <16 x i32> %F, %FF
+ store <16 x i32> %F2, <16 x i32>* %cast3, align 64, !nontemporal !0
+; CHECK: vmovntdq %z
+ %cast4 = bitcast i8* %B to <32 x i16>*
+ %G2 = add <32 x i16> %G, %GG
+ store <32 x i16> %G2, <32 x i16>* %cast4, align 64, !nontemporal !0
+; CHECK: vmovntdq %z
+ %cast5 = bitcast i8* %B to <64 x i8>*
+ %H2 = add <64 x i8> %H, %HH
+ store <64 x i8> %H2, <64 x i8>* %cast5, align 64, !nontemporal !0
ret void
}
diff --git a/test/CodeGen/X86/avx512-scalarIntrinsics.ll b/test/CodeGen/X86/avx512-scalarIntrinsics.ll
new file mode 100644
index 000000000000..c26e1fb070fc
--- /dev/null
+++ b/test/CodeGen/X86/avx512-scalarIntrinsics.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+
+define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
+ ; CHECK-LABEL: test_rsqrt14_ss:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vrsqrt14ss %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
+ ; CHECK-LABEL: test_rcp14_ss:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <2 x double> @test_rsqrt14_sd(<2 x double> %a0) {
+ ; CHECK-LABEL: test_rsqrt14_sd:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vrsqrt14sd %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ;
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_rcp14_sd(<2 x double> %a0) {
+ ; CHECK-LABEL: test_rcp14_sd:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vrcp14sd %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ;
+ ret <2 x double> %res
+
+}
+declare <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32)
+define <4 x float>@test_int_x86_avx512_mask_scalef_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
+ ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss:
+ ; CHECK: vscalefss %xmm1, %xmm0, %xmm2 {%k1}
+ ; CHECK: vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 8)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32)
+define <2 x double>@test_int_x86_avx512_mask_scalef_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
+ ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd:
+ ; CHECK: vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
+ ; CHECK: vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
+ %res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 8)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index b92e6f62813c..2ac91cc7482a 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -1,62 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; CHECK-LABEL: select00
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
+; CHECK-LABEL: select00:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: cmpl $255, %edi
+; CHECK-NEXT: je LBB0_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: LBB0_2:
+; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
%res = xor <16 x i32> %b, %selres
ret <16 x i32> %res
}
-; CHECK-LABEL: select01
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
+; CHECK-LABEL: select01:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: cmpl $255, %edi
+; CHECK-NEXT: je LBB1_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: LBB1_2:
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b
%res = xor <8 x i64> %b, %selres
ret <8 x i64> %res
}
-; CHECK-LABEL: @select02
-; CHECK: cmpless %xmm0, %xmm3, %k1
-; CHECK-NEXT: vmovss %xmm2, {{.*}}%xmm1 {%k1}
-; CHECK: ret
define float @select02(float %a, float %b, float %c, float %eps) {
+; CHECK-LABEL: select02:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpless %xmm0, %xmm3, %k1
+; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%cmp = fcmp oge float %a, %eps
%cond = select i1 %cmp, float %c, float %b
ret float %cond
}
-; CHECK-LABEL: @select03
-; CHECK: cmplesd %xmm0, %xmm3, %k1
-; CHECK-NEXT: vmovsd %xmm2, {{.*}}%xmm1 {%k1}
-; CHECK: ret
define double @select03(double %a, double %b, double %c, double %eps) {
+; CHECK-LABEL: select03:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplesd %xmm0, %xmm3, %k1
+; CHECK-NEXT: vmovsd %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%cmp = fcmp oge double %a, %eps
%cond = select i1 %cmp, double %c, double %b
ret double %cond
}
-; CHECK-LABEL: @select04
-; CHECK: vmovaps %zmm3, %zmm1
-; CHECK-NEXT: ret
-; PR20677
define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
+; CHECK-LABEL: select04:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %zmm3, %zmm1
+; CHECK-NEXT: retq
%sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
ret <16 x double> %sel
}
-; CHECK-LABEL: select05
-; CHECK: movzbl %sil, %eax
-; CHECK: kmovw %eax, %k0
-; CHECK: movzbl %dil, %eax
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
define i8 @select05(i8 %a.0, i8 %m) {
+; CHECK-LABEL: select05:
+; CHECK: ## BB#0:
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
@@ -64,14 +81,30 @@ define i8 @select05(i8 %a.0, i8 %m) {
ret i8 %res;
}
-; CHECK-LABEL: select06
-; CHECK: movzbl %sil, %eax
-; CHECK: kmovw %eax, %k0
-; CHECK: movzbl %dil, %eax
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select05_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
+; CHECK-LABEL: select05_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl (%rsi), %eax
+; CHECK-NEXT: kmovw %eax, %k0
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %mask = load <8 x i1> , <8 x i1>* %m
+ %a = load <8 x i1> , <8 x i1>* %a.0
+ %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
+ %res = bitcast <8 x i1> %r to i8
+ ret i8 %res;
+}
+
define i8 @select06(i8 %a.0, i8 %m) {
+; CHECK-LABEL: select06:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
@@ -79,19 +112,36 @@ define i8 @select06(i8 %a.0, i8 %m) {
ret i8 %res;
}
-; CHECK-LABEL: select07
-; CHECK-DAG: movzbl %dl, %eax
-; CHECK-DAG: kmovw %eax, %k0
-; CHECK-DAG: movzbl %dil, %eax
-; CHECK-DAG: kmovw %eax, %k1
-; CHECK-DAG: movzbl %sil, %eax
-; CHECK-DAG: kmovw %eax, %k2
-; CHECK: kandw %k0, %k1, %k1
-; CHECK-NEXT: knotw %k0, %k0
-; CHECK-NEXT: kandw %k0, %k2, %k0
-; CHECK-NEXT: korw %k0, %k1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select06_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
+; CHECK-LABEL: select06_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl (%rsi), %eax
+; CHECK-NEXT: kmovw %eax, %k0
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kandw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %mask = load <8 x i1> , <8 x i1>* %m
+ %a = load <8 x i1> , <8 x i1>* %a.0
+ %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
+ %res = bitcast <8 x i1> %r to i8
+ ret i8 %res;
+}
define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
+; CHECK-LABEL: select07:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kmovw %esi, %k2
+; CHECK-NEXT: kandw %k0, %k1, %k1
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: kandw %k0, %k2, %k0
+; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%b = bitcast i8 %b.0 to <8 x i1>
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index c54010cd91b9..c1d0a9a173e1 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -5,7 +5,7 @@ define <8 x i1> @test(<2 x i1> %a) {
; CHECK-LABEL: test:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $2, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@@ -17,7 +17,7 @@ define <8 x i1> @test1(<2 x i1> %a) {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@@ -29,12 +29,12 @@ define <8 x i1> @test2(<2 x i1> %a) {
; CHECK-LABEL: test2:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: vpmovm2q %k0, %zmm0
; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
-; CHECK-NEXT: vpmovq2m %zmm0, %k0
+; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
%res = shufflevector <2 x i1> %a, <2 x i1> zeroinitializer, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef>
@@ -45,7 +45,7 @@ define <8 x i1> @test3(<4 x i1> %a) {
; CHECK-LABEL: test3:
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
@@ -59,10 +59,13 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test4:
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT: kshiftlb $4, %k1, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k1
-; CHECK-NEXT: korb %k0, %k1, %k0
+; CHECK-NEXT: kshiftrb $4, %k0, %k0
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@@ -74,10 +77,13 @@ define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test5:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
-; CHECK-NEXT: kshiftlw $2, %k0, %k0
-; CHECK-NEXT: kshiftrw $2, %k0, %k1
-; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: kshiftlb $2, %k1, %k1
+; CHECK-NEXT: kshiftlb $2, %k0, %k0
+; CHECK-NEXT: kshiftrb $2, %k0, %k0
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: retq
@@ -89,10 +95,13 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test6:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
-; CHECK-NEXT: kshiftlw $2, %k0, %k0
-; CHECK-NEXT: kshiftrw $2, %k0, %k1
-; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: kshiftlb $2, %k1, %k1
+; CHECK-NEXT: kshiftlb $2, %k0, %k0
+; CHECK-NEXT: kshiftrb $2, %k0, %k0
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kunpckbw %k0, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: retq
@@ -105,10 +114,13 @@ define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test7:
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT: kshiftlb $4, %k1, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k1
-; CHECK-NEXT: korb %k0, %k1, %k0
+; CHECK-NEXT: kshiftrb $4, %k0, %k0
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kunpckbw %k0, %k0, %k0
; CHECK-NEXT: kunpckwd %k0, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
@@ -133,3 +145,26 @@ define <64 x i1> @test8(<8 x i1> %a, <8 x i1>%b) {
ret <64 x i1> %res
}
+define <4 x i1> @test9(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: test9:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
+; CHECK-NEXT: vpmovw2m %xmm0, %k0
+; CHECK-NEXT: kshiftrw $4, %k0, %k0
+; CHECK-NEXT: vpmovm2d %k0, %xmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i1> %a, <8 x i1> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i1> %res
+}
+
+define <2 x i1> @test10(<4 x i1> %a, <4 x i1> %b) {
+; CHECK-LABEL: test10:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: kshiftrw $2, %k0, %k0
+; CHECK-NEXT: vpmovm2q %k0, %xmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i1> %a, <4 x i1> %b, <2 x i32> <i32 2, i32 3>
+ ret <2 x i1> %res
+}
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index e4e5c2b8a1d5..35be44140026 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -53,7 +53,9 @@ define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 {
define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qb_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256:
@@ -67,6 +69,7 @@ define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
; KNL-LABEL: trunc_qb_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovd %xmm0, (%rdi)
@@ -128,7 +131,9 @@ define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 {
define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qw_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256:
@@ -142,6 +147,7 @@ define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
; KNL-LABEL: trunc_qw_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; KNL-NEXT: vmovq %xmm0, (%rdi)
@@ -203,7 +209,9 @@ define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 {
define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qd_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_256:
@@ -217,6 +225,7 @@ define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 {
; KNL-LABEL: trunc_qd_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vmovaps %xmm0, (%rdi)
; KNL-NEXT: retq
@@ -276,7 +285,9 @@ define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 {
define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
; KNL-LABEL: trunc_db_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256:
@@ -290,6 +301,7 @@ define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
; KNL-LABEL: trunc_db_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovq %xmm0, (%rdi)
@@ -350,7 +362,9 @@ define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 {
define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
; KNL-LABEL: trunc_dw_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_256:
@@ -364,6 +378,7 @@ define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 {
; KNL-LABEL: trunc_dw_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vmovaps %xmm0, (%rdi)
; KNL-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-unsafe-fp-math.ll b/test/CodeGen/X86/avx512-unsafe-fp-math.ll
new file mode 100644
index 000000000000..1956b2f7eca9
--- /dev/null
+++ b/test/CodeGen/X86/avx512-unsafe-fp-math.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=CHECK_UNSAFE --check-prefix=AVX512F_UNSAFE
+; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+
+define <16 x float> @test_max_v16f32(<16 x float> * %a_ptr, <16 x float> %b) {
+; CHECK_UNSAFE-LABEL: test_max_v16f32:
+; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE-NEXT: vmaxps (%rdi), %zmm0, %zmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_max_v16f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = load <16 x float>, <16 x float>* %a_ptr
+ %tmp = fcmp fast ogt <16 x float> %a, %b
+ %tmp4 = select <16 x i1> %tmp, <16 x float> %a, <16 x float> %b
+ ret <16 x float> %tmp4;
+}
+
+define <16 x float> @test_min_v16f32(<16 x float>* %a_ptr, <16 x float> %b) {
+; CHECK_UNSAFE-LABEL: test_min_v16f32:
+; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE-NEXT: vminps (%rdi), %zmm0, %zmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_min_v16f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = load <16 x float>, <16 x float>* %a_ptr
+ %tmp = fcmp fast olt <16 x float> %a, %b
+ %tmp4 = select <16 x i1> %tmp, <16 x float> %a, <16 x float> %b
+ ret <16 x float> %tmp4;
+}
+
+define <8 x double> @test_max_v8f64(<8 x double> * %a_ptr, <8 x double> %b) {
+; CHECK_UNSAFE-LABEL: test_max_v8f64:
+; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE-NEXT: vmaxpd (%rdi), %zmm0, %zmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_max_v8f64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = load <8 x double>, <8 x double>* %a_ptr
+ %tmp = fcmp fast ogt <8 x double> %a, %b
+ %tmp4 = select <8 x i1> %tmp, <8 x double> %a, <8 x double> %b
+ ret <8 x double> %tmp4;
+}
+
+define <8 x double> @test_min_v8f64(<8 x double>* %a_ptr, <8 x double> %b) {
+; CHECK_UNSAFE-LABEL: test_min_v8f64:
+; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE-NEXT: vminpd (%rdi), %zmm0, %zmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_min_v8f64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = load <8 x double>, <8 x double>* %a_ptr
+ %tmp = fcmp fast olt <8 x double> %a, %b
+ %tmp4 = select <8 x i1> %tmp, <8 x double> %a, <8 x double> %b
+ ret <8 x double> %tmp4;
+}
+
+define float @test_min_f32(float %a, float* %ptr) {
+; CHECK_UNSAFE-LABEL: test_min_f32:
+; CHECK_UNSAFE: # BB#0: # %entry
+; CHECK_UNSAFE-NEXT: vminss (%rdi), %xmm0, %xmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_min_f32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vminss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = load float, float* %ptr
+ %1 = fcmp fast olt float %0, %a
+ %2 = select i1 %1, float %0, float %a
+ ret float %2
+}
+
+define double @test_max_f64(double %a, double* %ptr) {
+; CHECK_UNSAFE-LABEL: test_max_f64:
+; CHECK_UNSAFE: # BB#0: # %entry
+; CHECK_UNSAFE-NEXT: vmaxsd (%rdi), %xmm0, %xmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_max_f64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = load double, double* %ptr
+ %1 = fcmp fast ogt double %0, %a
+ %2 = select i1 %1, double %0, double %a
+ ret double %2
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 4f679f9aca6f..005dc23ccf7b 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -124,6 +124,7 @@ define <8 x double> @_inreg8xdouble(double %a) {
define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_mask:
; ALL: # BB#0:
+; ALL-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
; ALL-NEXT: vpxor %ymm3, %ymm3, %ymm3
; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
@@ -139,6 +140,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m
define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz:
; ALL: # BB#0:
+; ALL-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
@@ -164,6 +166,7 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) {
define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_mask_load:
; ALL: # BB#0:
+; ALL-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
@@ -179,6 +182,7 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8
define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz_load:
; ALL: # BB#0:
+; ALL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; ALL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
@@ -214,9 +218,10 @@ define <16 x i32> @test_vbroadcast() {
; ALL: # BB#0: # %entry
; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0
; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1
-; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: knotw %k1, %k1
-; ALL-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
entry:
%0 = sext <16 x i1> zeroinitializer to <16 x i32>
@@ -398,3 +403,42 @@ define <8 x i64> @_invec4xi64(<4 x i64>%a) {
ret <8 x i64>%res
}
+declare void @func_f32(float)
+define <16 x float> @broadcast_ss_spill(float %x) {
+; ALL-LABEL: broadcast_ss_spill:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rax
+; ALL-NEXT: .Ltmp0:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; ALL-NEXT: callq func_f32
+; ALL-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %zmm0 # 4-byte Folded Reload
+; ALL-NEXT: popq %rax
+; ALL-NEXT: retq
+ %a = fadd float %x, %x
+ call void @func_f32(float %a)
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %c
+}
+
+declare void @func_f64(double)
+define <8 x double> @broadcast_sd_spill(double %x) {
+; ALL-LABEL: broadcast_sd_spill:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rax
+; ALL-NEXT: .Ltmp1:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill
+; ALL-NEXT: callq func_f64
+; ALL-NEXT: vbroadcastsd (%rsp), %zmm0 # 8-byte Folded Reload
+; ALL-NEXT: popq %rax
+; ALL-NEXT: retq
+ %a = fadd double %x, %x
+ call void @func_f64(double %a)
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ ret <8 x double> %c
+}
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index a8c558df9de8..5bda3bd173da 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1,37 +1,35 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
-; KNL-LABEL: test1:
-; KNL: ## BB#0:
-; KNL-NEXT: vcmpleps %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
}
define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
-; KNL-LABEL: test2:
-; KNL: ## BB#0:
-; KNL-NEXT: vcmplepd %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
}
define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
-; KNL-LABEL: test3:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -39,36 +37,33 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
}
define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
-; KNL-LABEL: test4_unsigned:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test4_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
}
define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
-; KNL-LABEL: test5:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
}
define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) nounwind {
-; KNL-LABEL: test6_unsigned:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test6_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
@@ -81,13 +76,13 @@ define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+;
; SKX-LABEL: test7:
; SKX: ## BB#0:
-; SKX: vxorps %xmm2, %xmm2, %xmm2
-; SKX: vcmpltps %xmm2, %xmm0, %k1
-; SKX: vmovaps %xmm0, %xmm1 {%k1}
-; SKX: vmovaps %zmm1, %zmm0
-; SKX: retq
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%mask = fcmp olt <4 x float> %a, zeroinitializer
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
@@ -101,13 +96,13 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+;
; SKX-LABEL: test8:
; SKX: ## BB#0:
-; SKX: vxorpd %xmm2, %xmm2, %xmm2
-; SKX: vcmpltpd %xmm2, %xmm0, %k1
-; SKX: vmovapd %xmm0, %xmm1 {%k1}
-; SKX: vmovaps %zmm1, %zmm0
-; SKX: retq
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%mask = fcmp olt <2 x double> %a, zeroinitializer
%c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
ret <2 x double>%c
@@ -116,9 +111,18 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; KNL-LABEL: test9:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+;
+; SKX-LABEL: test9:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -127,15 +131,18 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; KNL-LABEL: test10:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+;
; SKX-LABEL: test10:
; SKX: ## BB#0:
-; SKX: vcmpeqps %ymm1, %ymm0, %k1
-; SKX: vmovaps %ymm0, %ymm1 {%k1}
-; SKX: vmovaps %zmm1, %zmm0
-; SKX: retq
+; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -143,29 +150,179 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
}
define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
-; KNL-LABEL: test11_unsigned:
-; KNL: ## BB#0:
-; KNL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test11_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
}
define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
-; KNL-LABEL: test12:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
-; KNL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
-; KNL-NEXT: kunpckbw %k0, %k1, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: retq
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: kunpckbw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
}
define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
+; KNL-LABEL: test12_v32i32:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $32, %rsp
+; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
; SKX-LABEL: test12_v32i32:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
@@ -179,6 +336,308 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
}
define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
+; KNL-LABEL: test12_v64i16:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: movl (%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
; SKX-LABEL: test12_v64i16:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k0
@@ -192,11 +651,11 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
}
define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
-; KNL-LABEL: test13:
-; KNL: ## BB#0:
-; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; CHECK-LABEL: test13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
{
%cmpvector_i = fcmp oeq <16 x float> %a, %b
%conv = zext <16 x i1> %cmpvector_i to <16 x i32>
@@ -204,14 +663,12 @@ define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
}
define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
-; KNL-LABEL: test14:
-; KNL: ## BB#0:
-; KNL-NEXT: vpsubd %zmm1, %zmm0, %zmm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
-; KNL-NEXT: knotw %k0, %k0
-; KNL-NEXT: knotw %k0, %k1
-; KNL-NEXT: vmovdqu32 %zmm1, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; CHECK-LABEL: test14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%sub_r = sub <16 x i32> %a, %b
%cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
%sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
@@ -221,14 +678,12 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
}
define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
-; KNL-LABEL: test15:
-; KNL: ## BB#0:
-; KNL-NEXT: vpsubq %zmm1, %zmm0, %zmm1
-; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
-; KNL-NEXT: knotw %k0, %k0
-; KNL-NEXT: knotw %k0, %k1
-; KNL-NEXT: vmovdqu64 %zmm1, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; CHECK-LABEL: test15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%sub_r = sub <8 x i64> %a, %b
%cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
%sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
@@ -238,24 +693,22 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
}
define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
-; KNL-LABEL: test16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled %zmm0, %zmm1, %k1
-; KNL-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
+; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
}
define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
-; KNL-LABEL: test17:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -263,12 +716,11 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
}
define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
-; KNL-LABEL: test18:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -276,12 +728,11 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
}
define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
-; KNL-LABEL: test19:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpleud (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -289,13 +740,12 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
}
define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind {
-; KNL-LABEL: test20:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -304,13 +754,12 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3
}
define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind {
-; KNL-LABEL: test21:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpleq %zmm1, %zmm0, %k1
-; KNL-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; KNL-NEXT: vmovaps %zmm2, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -319,13 +768,12 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y
}
define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
-; KNL-LABEL: test22:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
-; KNL-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <8 x i64> %x, %y
@@ -335,13 +783,12 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6
}
define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
-; KNL-LABEL: test23:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled %zmm1, %zmm2, %k1
-; KNL-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask0 = icmp ule <16 x i32> %x, %y
@@ -351,12 +798,11 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
}
define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
-; KNL-LABEL: test24:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -366,12 +812,11 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
}
define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
-; KNL-LABEL: test25:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -381,13 +826,12 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
}
define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
-; KNL-LABEL: test26:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled %zmm1, %zmm2, %k1
-; KNL-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -399,13 +843,12 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
}
define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
-; KNL-LABEL: test27:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpleq %zmm1, %zmm2, %k1
-; KNL-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -416,11 +859,24 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
}
-; KNL-LABEL: test28
-; KNL: vpcmpgtq
-; KNL: vpcmpgtq
-; KNL: kxnorw
define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) {
+; KNL-LABEL: test28:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
+; KNL-NEXT: kxnorw %k1, %k0, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test28:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; SKX-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
+; SKX-NEXT: kxnorb %k1, %k0, %k0
+; SKX-NEXT: vpmovm2d %k0, %ymm0
+; SKX-NEXT: retq
%x_gt_y = icmp sgt <8 x i64> %x, %y
%x1_gt_y1 = icmp sgt <8 x i64> %x1, %y1
%res = icmp eq <8 x i1>%x_gt_y, %x1_gt_y1
@@ -428,11 +884,24 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1
ret <8 x i32> %resse
}
-; KNL-LABEL: test29
-; KNL: vpcmpgtd
-; KNL: vpcmpgtd
-; KNL: kxorw
define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) {
+; KNL-LABEL: test29:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
+; KNL-NEXT: kxorw %k1, %k0, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test29:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; SKX-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
+; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: retq
%x_gt_y = icmp sgt <16 x i32> %x, %y
%x1_gt_y1 = icmp sgt <16 x i32> %x1, %y1
%res = icmp ne <16 x i1>%x_gt_y, %x1_gt_y1
@@ -441,9 +910,17 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32>
}
define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
+; KNL-LABEL: test30:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
+; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: test30:
-; SKX: vcmpeqpd %ymm1, %ymm0, %k1
-; SKX: vmovapd %ymm0, %ymm1 {%k1}
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%mask = fcmp oeq <4 x double> %x, %y
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -451,9 +928,17 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
}
define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp) nounwind {
-; SKX-LABEL: test31:
-; SKX: vcmpltpd (%rdi), %xmm0, %k1
-; SKX: vmovapd %xmm0, %xmm1 {%k1}
+; KNL-LABEL: test31:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2
+; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test31:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%y = load <2 x double>, <2 x double>* %yp, align 4
%mask = fcmp olt <2 x double> %x, %y
@@ -462,9 +947,17 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp
}
define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp) nounwind {
+; KNL-LABEL: test32:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2
+; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: test32:
-; SKX: vcmpltpd (%rdi), %ymm0, %k1
-; SKX: vmovapd %ymm0, %ymm1 {%k1}
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%y = load <4 x double>, <4 x double>* %yp, align 4
%mask = fcmp ogt <4 x double> %y, %x
@@ -473,9 +966,11 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp
}
define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp) nounwind {
-; SKX-LABEL: test33:
-; SKX: vcmpltpd (%rdi), %zmm0, %k1
-; SKX: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test33:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -483,9 +978,17 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp
}
define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) nounwind {
-; SKX-LABEL: test34:
-; SKX: vcmpltps (%rdi), %xmm0, %k1
-; SKX: vmovaps %xmm0, %xmm1 {%k1}
+; KNL-LABEL: test34:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2
+; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test34:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -493,9 +996,21 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no
}
define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) nounwind {
+; KNL-LABEL: test35:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vmovups (%rdi), %ymm2
+; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
+; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: test35:
-; SKX: vcmpltps (%rdi), %ymm0, %k1
-; SKX: vmovaps %ymm0, %ymm1 {%k1}
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%y = load <8 x float>, <8 x float>* %yp, align 4
%mask = fcmp ogt <8 x float> %y, %x
@@ -504,9 +1019,11 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no
}
define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp) nounwind {
-; SKX-LABEL: test36:
-; SKX: vcmpltps (%rdi), %zmm0, %k1
-; SKX: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test36:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -514,9 +1031,11 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp
}
define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nounwind {
-; SKX-LABEL: test37:
-; SKX: vcmpltpd (%rdi){1to8}, %zmm0, %k1
-; SKX: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test37:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
@@ -528,28 +1047,46 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou
}
define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nounwind {
-; SKX-LABEL: test38:
-; SKX: vcmpltpd (%rdi){1to4}, %ymm0, %k1
-; SKX: vmovapd %ymm0, %ymm1 {%k1}
+; KNL-LABEL: test38:
+; KNL: ## BB#0:
+; KNL-NEXT: vbroadcastsd (%rdi), %ymm2
+; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2
+; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test38:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> zeroinitializer
-
+
%mask = fcmp ogt <4 x double> %shuffle, %x
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %x1
ret <4 x double> %max
}
define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nounwind {
-; SKX-LABEL: test39:
-; SKX: vcmpltpd (%rdi){1to2}, %xmm0, %k1
-; SKX: vmovapd %xmm0, %xmm1 {%k1}
+; KNL-LABEL: test39:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
+; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test39:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-
+
%mask = fcmp ogt <2 x double> %shuffle, %x
%max = select <2 x i1> %mask, <2 x double> %x, <2 x double> %x1
ret <2 x double> %max
@@ -557,59 +1094,161 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou
define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) nounwind {
-; SKX-LABEL: test40:
-; SKX: vcmpltps (%rdi){1to16}, %zmm0, %k1
-; SKX: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test40:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%a = load float, float* %ptr
%v = insertelement <16 x float> undef, float %a, i32 0
%shuffle = shufflevector <16 x float> %v, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-
+
%mask = fcmp ogt <16 x float> %shuffle, %x
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
ret <16 x float> %max
}
define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) nounwind {
-; SKX-LABEL: test41:
-; SKX: vcmpltps (%rdi){1to8}, %ymm0, %k1
-; SKX: vmovaps %ymm0, %ymm1 {%k1}
+; KNL-LABEL: test41:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vbroadcastss (%rdi), %ymm2
+; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
+; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test41:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%a = load float, float* %ptr
%v = insertelement <8 x float> undef, float %a, i32 0
%shuffle = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-
+
%mask = fcmp ogt <8 x float> %shuffle, %x
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %x1
ret <8 x float> %max
}
define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) nounwind {
-; SKX-LABEL: test42:
-; SKX: vcmpltps (%rdi){1to4}, %xmm0, %k1
-; SKX: vmovaps %xmm0, %xmm1 {%k1}
-
+; KNL-LABEL: test42:
+; KNL: ## BB#0:
+; KNL-NEXT: vbroadcastss (%rdi), %xmm2
+; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
+; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test42:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+
%a = load float, float* %ptr
%v = insertelement <4 x float> undef, float %a, i32 0
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
-
+
%mask = fcmp ogt <4 x float> %shuffle, %x
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
ret <4 x float> %max
}
define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x i1> %mask_in) nounwind {
-; SKX-LABEL: test43:
-; SKX: vpmovw2m %xmm2, %k1
-; SKX: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
-; SKX: vmovapd %zmm0, %zmm1 {%k1}
+; KNL-LABEL: test43:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm2, %zmm2
+; KNL-NEXT: vpsllq $63, %zmm2, %zmm2
+; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
+; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test43:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm2, %xmm2
+; SKX-NEXT: vpmovw2m %xmm2, %k1
+; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
+; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
%shuffle = shufflevector <8 x double> %v, <8 x double> undef, <8 x i32> zeroinitializer
-
+
%mask_cmp = fcmp ogt <8 x double> %shuffle, %x
%mask = and <8 x i1> %mask_cmp, %mask_in
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
ret <8 x double> %max
}
+
+define <4 x i32> @test44(<4 x i16> %x, <4 x i16> %y) #0 {
+; KNL-LABEL: test44:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; KNL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test44:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: retq
+ %mask = icmp eq <4 x i16> %x, %y
+ %1 = sext <4 x i1> %mask to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 {
+; KNL-LABEL: test45:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test45:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <2 x i16> %x, %y
+ %1 = zext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
+; KNL-LABEL: test46:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; KNL-NEXT: vpsllq $32, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm1
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test46:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = fcmp oeq <2 x float> %x, %y
+ %1 = zext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %1
+}
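The regenerated checks above capture the new lowering: a vector compare feeding a select is now expected to produce a masked blend (vblendmpd/vpblendmd and friends) on AVX-512 instead of a masked move plus a register copy. As a minimal standalone sketch of the pattern exercised by test33 (hypothetical function name and llc invocation, not part of the committed test; the file's actual RUN lines are not shown here), the following IR should lower to a vcmpltpd into %k1 followed by a vblendmpd when compiled for an AVX-512 target such as -mcpu=skx:

; Sketch: compare + select, expected to fold into a single masked blend.
; Hypothetical invocation: llc -mtriple=x86_64-unknown-unknown -mcpu=skx min-blend.ll
define <8 x double> @min_blend(<8 x double> %x, <8 x double> %y) nounwind {
  %mask = fcmp olt <8 x double> %x, %y
  %min = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
  ret <8 x double> %min
}
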
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..50a9076163e8
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -0,0 +1,413 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
+
+define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm512_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %zmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <64 x i32> zeroinitializer
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, i64* %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpbroadcastb %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpbroadcastb %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %bc1 = bitcast i64* %a1 to <64 x i1>*
+ %arg1 = load <64 x i1>, <64 x i1>* %bc1
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <64 x i32> zeroinitializer
+ %res1 = select <64 x i1> %arg1, <64 x i8> %res0, <64 x i8> %arg0
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_broadcastb_epi8(i64* %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %bc0 = bitcast i64* %a0 to <64 x i1>*
+ %arg0 = load <64 x i1>, <64 x i1>* %bc0
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <64 x i32> zeroinitializer
+ %res1 = select <64 x i1> %arg0, <64 x i8> %res0, <64 x i8> zeroinitializer
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm512_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %zmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <32 x i32> zeroinitializer
+ %res1 = bitcast <32 x i16> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast i32 %a1 to <32 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <32 x i32> zeroinitializer
+ %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i32 %a0 to <32 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <32 x i32> zeroinitializer
+ %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
+; X32-LABEL: test_mm512_bslli_epi128:
+; X32: # BB#0:
+; X32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_bslli_epi128:
+; X64: # BB#0:
+; X64-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
+; X32-LABEL: test_mm512_bsrli_epi128:
+; X32: # BB#0:
+; X32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_bsrli_epi128:
+; X64: # BB#0:
+; X64-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+; TODO - improve support for i64 -> mmask64 on 32-bit targets
+define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %arg1 = bitcast i64* %a1 to <64 x i1>*
+ %sel1 = load <64 x i1>, <64 x i1>* %arg1
+ %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
+ %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+ %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpackhi_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to <64 x i1>*
+ %sel0 = load <64 x i1>, <64 x i1>* %arg0
+ %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
+ %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+ %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %res1 = bitcast <32 x i16> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast i32 %a1 to <32 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
+ %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X64-NEXT: retq
+ %arg0 = bitcast i32 %a0 to <32 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
+ %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %arg1 = bitcast i64* %a1 to <64 x i1>*
+ %sel1 = load <64 x i1>, <64 x i1>* %arg1
+ %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
+ %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+ %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpacklo_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to <64 x i1>*
+ %sel0 = load <64 x i1>, <64 x i1>* %arg0
+ %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
+ %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+ %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+ %res1 = bitcast <32 x i16> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast i32 %a1 to <32 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
+ %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+ %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X64-NEXT: retq
+ %arg0 = bitcast i32 %a0 to <32 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
+ %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+ %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+!0 = !{i32 1}
+
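Every masked case in the new fast-isel file follows the same IR shape: the scalar mask operand is bitcast to a vector of i1 and consumed by a select over the shuffle or broadcast result, which the backend is expected to fold into a single {%k1}-predicated instruction. A minimal sketch of that shape for a 32-lane word select (hypothetical function name, not part of the committed test):

; Sketch: i32 mask -> <32 x i1> -> select, the IR pattern the masked builtins reduce to.
define <8 x i64> @mask_select_sketch(<8 x i64> %acc, i32 %m, <8 x i64> %v) {
  %dst  = bitcast <8 x i64> %acc to <32 x i16>
  %src  = bitcast <8 x i64> %v to <32 x i16>
  %mask = bitcast i32 %m to <32 x i1>          ; one mask bit per 16-bit lane
  %sel  = select <32 x i1> %mask, <32 x i16> %src, <32 x i16> %dst
  %res  = bitcast <32 x i16> %sel to <8 x i64>
  ret <8 x i64> %res
}
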
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..cb2f23e90f20
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -0,0 +1,538 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+
+declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)
+
+define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdx, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1}
+; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%eax)
+; AVX512F-32-NEXT: retl
+ call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
+ call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32)
+
+define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edx, %k1
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1}
+; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%eax)
+; AVX512F-32-NEXT: retl
+ call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
+ call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
+ ret void
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: kmovd %edx, %k1
+; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1)
+ %res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res0, i32 %mask)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT: kmovq %rdx, %k1
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1)
+ %res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res0, i64 %mask)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
+
+define <8 x i64>@test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_psll_dq_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
+; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
+; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_psll_dq_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
+; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
+; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
+ %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+define <8 x i64>@test_int_x86_avx512_psll_load_dq_512(<8 x i64>* %p0) {
+; AVX512BW-LABEL: test_int_x86_avx512_psll_load_dq_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_psll_load_dq_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
+; AVX512F-32-NEXT: retl
+ %x0 = load <8 x i64>, <8 x i64> *%p0
+ %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
+
+define <8 x i64>@test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_psrl_dq_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_psrl_dq_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
+ %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+define <8 x i64>@test_int_x86_avx512_psrl_load_dq_512(<8 x i64>* %p0) {
+; AVX512BW-LABEL: test_int_x86_avx512_psrl_load_dq_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_psrl_load_dq_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; AVX512F-32-NEXT: retl
+ %x0 = load <8 x i64>, <8 x i64> *%p0
+ %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
+ ret <8 x i64> %res
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i32, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i32, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
+; AVX512BW-LABEL: test_pcmpeq_b:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_pcmpeq_b:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp0:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+ ret i64 %res
+}
+
+define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; AVX512BW-LABEL: test_mask_pcmpeq_b:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_pcmpeq_b:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp1:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+ ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_pcmpeq_w:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_pcmpeq_w:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_pcmpeq_w:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_pcmpeq_w:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
+
+define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
+; AVX512BW-LABEL: test_pcmpgt_b:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_pcmpgt_b:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp2:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+ ret i64 %res
+}
+
+define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; AVX512BW-LABEL: test_mask_pcmpgt_b:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_pcmpgt_b:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp3:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+ ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_pcmpgt_w:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_pcmpgt_w:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_pcmpgt_w:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_pcmpgt_w:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
+
+declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 7cf6edafbcc8..b131befcf0a2 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2,178 +2,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
-define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
-; AVX512BW-LABEL: test_pcmpeq_b:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_pcmpeq_b:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp0:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
- ret i64 %res
-}
-
-define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpeq_b:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpeq_b:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp1:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
- ret i64 %res
-}
-
-declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
-
-define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_pcmpeq_w:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_pcmpeq_w:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
- ret i32 %res
-}
-
-define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpeq_w:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpeq_w:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
-
-define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
-; AVX512BW-LABEL: test_pcmpgt_b:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_pcmpgt_b:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp2:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
- ret i64 %res
-}
-
-define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpgt_b:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpgt_b:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp3:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
- ret i64 %res
-}
-
-declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
-
-define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_pcmpgt_w:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_pcmpgt_w:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
- ret i32 %res
-}
-
-define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpgt_w:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpgt_w:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
-
define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512BW-LABEL: test_cmp_b_512:
; AVX512BW: ## BB#0:
@@ -205,7 +33,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-LABEL: test_cmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $68, %esp
-; AVX512F-32-NEXT: .Ltmp4:
+; AVX512F-32-NEXT: .Ltmp0:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
@@ -214,31 +42,31 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: vpcmpltb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpunordb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpordb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: addl (%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -291,7 +119,7 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-LABEL: test_mask_cmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $68, %esp
-; AVX512F-32-NEXT: .Ltmp5:
+; AVX512F-32-NEXT: .Ltmp1:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
@@ -303,31 +131,31 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpcmpltb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpunordb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpordb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -381,7 +209,7 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-LABEL: test_ucmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $68, %esp
-; AVX512F-32-NEXT: .Ltmp6:
+; AVX512F-32-NEXT: .Ltmp2:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: vpcmpequb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
@@ -390,31 +218,31 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpunordub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnequb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpordub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: addl (%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -467,7 +295,7 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $68, %esp
-; AVX512F-32-NEXT: .Ltmp7:
+; AVX512F-32-NEXT: .Ltmp3:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
@@ -479,31 +307,31 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpunordub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnequb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpordub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -822,43 +650,6 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
-declare <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8>, <64 x i8>, i64) nounwind readonly
-
-define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i16> %a2) {
-; AVX512BW-LABEL: test_x86_mask_blend_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_x86_mask_blend_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1]
- ret <32 x i16> %res
-}
-declare <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16>, <32 x i16>, i32) nounwind readonly
-
-define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a2) {
-; AVX512BW-LABEL: test_x86_mask_blend_b_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_x86_mask_blend_b_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpblendmb %zmm1, %zmm0, %zmm0 {%k1}
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0) ; <<64 x i8>> [#uses=1]
- ret <64 x i8> %res
-}
-
define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
; AVX512BW: ## BB#0:
@@ -2510,138 +2301,6 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i1
ret <16 x i32> %res2
}
-declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63]
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63]
-; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
- %res2 = add <64 x i8> %res, %res1
- ret <64 x i8> %res2
-}
-
-declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55]
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55]
-; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
- %res2 = add <64 x i8> %res, %res1
- ret <64 x i8> %res2
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31]
-; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31]
-; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
- %res2 = add <32 x i16> %res, %res1
- ret <32 x i16> %res2
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27]
-; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
- %res2 = add <32 x i16> %res, %res1
- ret <32 x i16> %res2
-}
-
-declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
-; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
- %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1)
- %res3 = add <64 x i8> %res, %res1
- %res4 = add <64 x i8> %res3, %res2
- ret <64 x i8> %res4
-}
-
declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
@@ -2672,49 +2331,6 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
ret <32 x i16> %res4
}
-declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
-
-define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_dq_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpslldq $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpslldq $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_dq_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpslldq $8, %zmm0, %zmm1
-; AVX512F-32-NEXT: vpslldq $4, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
- %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
- %res2 = add <8 x i64> %res, %res1
- ret <8 x i64> %res2
-}
-
-declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
-
-define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_dq_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpsrldq $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrldq $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_dq_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpsrldq $8, %zmm0, %zmm1
-; AVX512F-32-NEXT: vpsrldq $4, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
- %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
- %res2 = add <8 x i64> %res, %res1
- ret <8 x i64> %res2
-}
declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
@@ -2773,7 +2389,7 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp8:
+; AVX512F-32-NEXT: .Ltmp4:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
@@ -2799,7 +2415,7 @@ define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
; AVX512F-32-LABEL: test_int_x86_avx512_cvtb2mask_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp9:
+; AVX512F-32-NEXT: .Ltmp5:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
@@ -2879,6 +2495,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -2887,9 +2513,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
ret <32 x i16> %res4
}
-declare <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16>, i32, <32 x i16>, i32)
-define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
@@ -2899,9 +2525,19 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i8 %x1, <
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
@@ -2919,6 +2555,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrlv32hi:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -2939,6 +2585,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -2947,9 +2603,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16>
ret <32 x i16> %res4
}
-declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i32, <32 x i16>, i32)
-define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psra_wi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
@@ -2959,49 +2615,19 @@ define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i8 %x1, <
; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
- %res3 = add <32 x i16> %res, %res1
- %res4 = add <32 x i16> %res3, %res2
- ret <32 x i16> %res4
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i8, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm2 {%k1} {z}
-; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
- %res3 = add <32 x i16> %res, %res1
- %res4 = add <32 x i16> %res3, %res2
- ret <32 x i16> %res4
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i8, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm2 {%k1} {z}
-; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_wi_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
@@ -3019,6 +2645,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -3027,6 +2663,24 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16>
ret <32 x i16> %res4
}
+define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; AVX512F-32-NEXT: vpsravw {{\.LCPI.*}}, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> <i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51>,
+ <32 x i16> <i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49>,
+ <32 x i16> zeroinitializer, i32 -1)
+ ret <32 x i16> %res
+}
+
declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
@@ -3039,6 +2693,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -3047,9 +2711,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16>
ret <32 x i16> %res4
}
-declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i32, <32 x i16>, i32)
-define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_wi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
@@ -3059,9 +2723,19 @@ define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i8 %x1, <
; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_wi_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
@@ -3079,6 +2753,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psllv32hi:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -3092,13 +2776,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i3
define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm2 {%k1} {z}
-; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1)
@@ -3107,23 +2801,256 @@ define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i1
ret <32 x i16> %res4
}
-
declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2 {%k1} {z}
-; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
-}
\ No newline at end of file
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rcx
+; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $20, %esp
+; AVX512F-32-NEXT: .Ltmp6:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
+ %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
+ %res2 = add i64 %res, %res1
+ ret i64 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32)
+
+define i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %ecx
+; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovd %k0, %ecx
+; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2)
+
+define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rcx
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $20, %esp
+; AVX512F-32-NEXT: .Ltmp7:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
+ %res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
+ %res2 = add i64 %res, %res1
+ ret i64 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2)
+
+define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %ecx
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovd %k0, %ecx
+; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: vpbroadcastb %dil, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastb %dil, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpbroadcastb %dil, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movb {{[0-9]+}}(%esp), %al
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT: vpbroadcastb %al, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpbroadcastb %al, %zmm0 {%k1}
+; AVX512F-32-NEXT: vpbroadcastb %al, %zmm2
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res2, %res3
+ ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpbroadcastw %di, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastw %di, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpbroadcastw %di, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm0 {%k1}
+; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm2
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res2, %res3
+ ret <32 x i16> %res4
+}
diff --git a/test/CodeGen/X86/avx512bw-mask-op.ll b/test/CodeGen/X86/avx512bw-mask-op.ll
index 0208011cf89d..619c42494e2d 100644
--- a/test/CodeGen/X86/avx512bw-mask-op.ll
+++ b/test/CodeGen/X86/avx512bw-mask-op.ll
@@ -1,6 +1,13 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
define i32 @mask32(i32 %x) {
+; CHECK-LABEL: mask32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k0
+; CHECK-NEXT: knotd %k0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
%m0 = bitcast i32 %x to <32 x i1>
%m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -8,14 +15,15 @@ define i32 @mask32(i32 %x) {
i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <32 x i1> %m1 to i32
ret i32 %ret
-; CHECK-LABEL: mask32
-; CHECK: kmovd
-; CHECK-NEXT: knotd
-; CHECK-NEXT: kmovd
-; CHECK_NEXT: ret
}
define i64 @mask64(i64 %x) {
+; CHECK-LABEL: mask64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k0
+; CHECK-NEXT: knotq %k0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
%m0 = bitcast i64 %x to <64 x i1>
%m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -27,14 +35,15 @@ define i64 @mask64(i64 %x) {
i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <64 x i1> %m1 to i64
ret i64 %ret
-; CHECK-LABEL: mask64
-; CHECK: kmovq
-; CHECK-NEXT: knotq
-; CHECK-NEXT: kmovq
-; CHECK_NEXT: ret
}
define void @mask32_mem(i32* %ptr) {
+; CHECK-LABEL: mask32_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd (%rdi), %k0
+; CHECK-NEXT: knotd %k0, %k0
+; CHECK-NEXT: kmovd %k0, (%rdi)
+; CHECK-NEXT: retq
%x = load i32, i32* %ptr, align 4
%m0 = bitcast i32 %x to <32 x i1>
%m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -44,14 +53,15 @@ define void @mask32_mem(i32* %ptr) {
%ret = bitcast <32 x i1> %m1 to i32
store i32 %ret, i32* %ptr, align 4
ret void
-; CHECK-LABEL: mask32_mem
-; CHECK: kmovd ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
-; CHECK-NEXT: knotd
-; CHECK-NEXT: kmovd %k{{[0-7]}}, ([[ARG1]])
-; CHECK_NEXT: ret
}
define void @mask64_mem(i64* %ptr) {
+; CHECK-LABEL: mask64_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq (%rdi), %k0
+; CHECK-NEXT: knotq %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%rdi)
+; CHECK-NEXT: retq
%x = load i64, i64* %ptr, align 4
%m0 = bitcast i64 %x to <64 x i1>
%m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -65,35 +75,78 @@ define void @mask64_mem(i64* %ptr) {
%ret = bitcast <64 x i1> %m1 to i64
store i64 %ret, i64* %ptr, align 4
ret void
-; CHECK-LABEL: mask64_mem
-; CHECK: kmovq ([[ARG1]]), %k{{[0-7]}}
-; CHECK-NEXT: knotq
-; CHECK-NEXT: kmovq %k{{[0-7]}}, ([[ARG1]])
-; CHECK_NEXT: ret
}
define i32 @mand32(i32 %x, i32 %y) {
+; CHECK-LABEL: mand32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: andl %esi, %eax
+; CHECK-NEXT: xorl %esi, %edi
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%ma = bitcast i32 %x to <32 x i1>
%mb = bitcast i32 %y to <32 x i1>
%mc = and <32 x i1> %ma, %mb
%md = xor <32 x i1> %ma, %mb
%me = or <32 x i1> %mc, %md
%ret = bitcast <32 x i1> %me to i32
-; CHECK: kandd
-; CHECK: kxord
-; CHECK: kord
+ ret i32 %ret
+}
+
+define i32 @mand32_mem(<32 x i1>* %x, <32 x i1>* %y) {
+; CHECK-LABEL: mand32_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd (%rdi), %k0
+; CHECK-NEXT: kmovd (%rsi), %k1
+; CHECK-NEXT: kandd %k1, %k0, %k2
+; CHECK-NEXT: kxord %k1, %k0, %k0
+; CHECK-NEXT: kord %k0, %k2, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+ %ma = load <32 x i1>, <32 x i1>* %x
+ %mb = load <32 x i1>, <32 x i1>* %y
+ %mc = and <32 x i1> %ma, %mb
+ %md = xor <32 x i1> %ma, %mb
+ %me = or <32 x i1> %mc, %md
+ %ret = bitcast <32 x i1> %me to i32
ret i32 %ret
}
define i64 @mand64(i64 %x, i64 %y) {
+; CHECK-LABEL: mand64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: xorq %rsi, %rdi
+; CHECK-NEXT: orq %rax, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%ma = bitcast i64 %x to <64 x i1>
%mb = bitcast i64 %y to <64 x i1>
%mc = and <64 x i1> %ma, %mb
%md = xor <64 x i1> %ma, %mb
%me = or <64 x i1> %mc, %md
%ret = bitcast <64 x i1> %me to i64
-; CHECK: kandq
-; CHECK: kxorq
-; CHECK: korq
+ ret i64 %ret
+}
+
+define i64 @mand64_mem(<64 x i1>* %x, <64 x i1>* %y) {
+; CHECK-LABEL: mand64_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq (%rdi), %k0
+; CHECK-NEXT: kmovq (%rsi), %k1
+; CHECK-NEXT: kandq %k1, %k0, %k2
+; CHECK-NEXT: kxorq %k1, %k0, %k0
+; CHECK-NEXT: korq %k0, %k2, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+ %ma = load <64 x i1>, <64 x i1>* %x
+ %mb = load <64 x i1>, <64 x i1>* %y
+ %mc = and <64 x i1> %ma, %mb
+ %md = xor <64 x i1> %ma, %mb
+ %me = or <64 x i1> %mc, %md
+ %ret = bitcast <64 x i1> %me to i64
ret i64 %ret
}
diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll
index 519b649ff53a..c58b3cc8c3cd 100644
--- a/test/CodeGen/X86/avx512bw-mov.ll
+++ b/test/CodeGen/X86/avx512bw-mov.ll
@@ -1,27 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s
-; CHECK-LABEL: test1
-; CHECK: vmovdqu8
-; CHECK: ret
define <64 x i8> @test1(i8 * %addr) {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0
+; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <64 x i8>*
%res = load <64 x i8>, <64 x i8>* %vaddr, align 1
ret <64 x i8>%res
}
-; CHECK-LABEL: test2
-; CHECK: vmovdqu8
-; CHECK: ret
define void @test2(i8 * %addr, <64 x i8> %data) {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 %zmm0, (%rdi)
+; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <64 x i8>*
store <64 x i8>%data, <64 x i8>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test3
-; CHECK: vmovdqu8{{.*{%k[1-7]}}}
-; CHECK: ret
define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpcmpneqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpblendmb (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ne <64 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <64 x i8>*
%r = load <64 x i8>, <64 x i8>* %vaddr, align 1
@@ -29,10 +35,13 @@ define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
ret <64 x i8>%res
}
-; CHECK-LABEL: test4
-; CHECK: vmovdqu8{{.*{%k[1-7]} {z}}}
-; CHECK: ret
define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%mask = icmp ne <64 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <64 x i8>*
%r = load <64 x i8>, <64 x i8>* %vaddr, align 1
@@ -40,28 +49,33 @@ define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
ret <64 x i8>%res
}
-; CHECK-LABEL: test5
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test5(i8 * %addr) {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0
+; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <32 x i16>*
%res = load <32 x i16>, <32 x i16>* %vaddr, align 1
ret <32 x i16>%res
}
-; CHECK-LABEL: test6
-; CHECK: vmovdqu16
-; CHECK: ret
define void @test6(i8 * %addr, <32 x i16> %data) {
+; CHECK-LABEL: test6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 %zmm0, (%rdi)
+; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <32 x i16>*
store <32 x i16>%data, <32 x i16>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test7
-; CHECK: vmovdqu16{{.*{%k[1-7]}}}
-; CHECK: ret
define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpcmpneqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpblendmw (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ne <32 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i16>*
%r = load <32 x i16>, <32 x i16>* %vaddr, align 1
@@ -69,13 +83,136 @@ define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
ret <32 x i16>%res
}
-; CHECK-LABEL: test8
-; CHECK: vmovdqu16{{.*{%k[1-7]} {z}}}
-; CHECK: ret
define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%mask = icmp ne <32 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i16>*
%r = load <32 x i16>, <32 x i16>* %vaddr, align 1
%res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer
ret <32 x i16>%res
}
+
+define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; CHECK-LABEL: test_mask_load_16xi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftlq $48, %k0, %k0
+; CHECK-NEXT: kshiftrq $48, %k0, %k1
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+
+define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; CHECK-LABEL: test_mask_load_32xi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftlq $32, %k0, %k0
+; CHECK-NEXT: kshiftrq $32, %k0, %k1
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
+
+define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; CHECK-LABEL: test_mask_load_8xi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
+; CHECK-NEXT: vpmovw2m %zmm0, %k0
+; CHECK-NEXT: kshiftld $24, %k0, %k0
+; CHECK-NEXT: kshiftrd $24, %k0, %k1
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+
+define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; CHECK-LABEL: test_mask_load_16xi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftld $16, %k0, %k0
+; CHECK-NEXT: kshiftrd $16, %k0, %k1
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
+
+define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; CHECK-LABEL: test_mask_store_16xi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftlq $48, %k0, %k0
+; CHECK-NEXT: kshiftrq $48, %k0, %k1
+; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.masked.store.v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+
+define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; CHECK-LABEL: test_mask_store_32xi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftlq $32, %k0, %k0
+; CHECK-NEXT: kshiftrq $32, %k0, %k1
+; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
+
+define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; CHECK-LABEL: test_mask_store_8xi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
+; CHECK-NEXT: vpmovw2m %zmm0, %k0
+; CHECK-NEXT: kshiftld $24, %k0, %k0
+; CHECK-NEXT: kshiftrd $24, %k0, %k1
+; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.masked.store.v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+
+define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; CHECK-LABEL: test_mask_store_16xi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftld $16, %k0, %k0
+; CHECK-NEXT: kshiftrd $16, %k0, %k1
+; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.masked.store.v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll
index 141f5cc09219..016837e61307 100644
--- a/test/CodeGen/X86/avx512bw-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll
@@ -1,94 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; CHECK-LABEL: test1
-; CHECK: vpcmpeqb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
ret <64 x i8> %max
}
-; CHECK-LABEL: test2
-; CHECK: vpcmpgtb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
ret <64 x i8> %max
}
-; CHECK-LABEL: @test3
-; CHECK: vpcmplew {{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1
+; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y
ret <32 x i16> %max
}
-; CHECK-LABEL: test4
-; CHECK: vpcmpnleub {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
ret <64 x i8> %max
}
-; CHECK-LABEL: test5
-; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %yp, align 4
%mask = icmp eq <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
ret <32 x i16> %max
}
-; CHECK-LABEL: @test6
-; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp sgt <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
ret <32 x i16> %max
}
-; CHECK-LABEL: @test7
-; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp sle <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
ret <32 x i16> %max
}
-; CHECK-LABEL: @test8
-; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp ule <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
ret <32 x i16> %max
}
-; CHECK-LABEL: @test9
-; CHECK: vpcmpeqw %zmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+; CHECK-LABEL: test9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <32 x i16> %x1, %y1
%mask0 = icmp eq <32 x i16> %x, %y
%mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
@@ -96,11 +107,13 @@ define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16
ret <32 x i16> %max
}
-; CHECK-LABEL: @test10
-; CHECK: vpcmpleb %zmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+; CHECK-LABEL: test10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <64 x i8> %x1, %y1
%mask0 = icmp sle <64 x i8> %x, %y
%mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer
@@ -108,11 +121,13 @@ define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y
ret <64 x i8> %max
}
-; CHECK-LABEL: @test11
-; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+; CHECK-LABEL: test11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <64 x i8> %x1, %y1
%y = load <64 x i8>, <64 x i8>* %y.ptr, align 4
%mask0 = icmp sgt <64 x i8> %x, %y
@@ -121,11 +136,13 @@ define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i
ret <64 x i8> %max
}
-; CHECK-LABEL: @test12
-; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <32 x i16> %x1, %y1
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask0 = icmp ule <32 x i16> %x, %y
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..7cd0da9564ff
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlbw-builtins.c
+
+define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res1 = bitcast <16 x i8> %res0 to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_mask_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg1, <16 x i8> %res0, <16 x i8> %arg0
+ %res2 = bitcast <16 x i8> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg0, <16 x i8> %res0, <16 x i8> zeroinitializer
+ %res2 = bitcast <16 x i8> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_broadcastb_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <32 x i32> zeroinitializer
+ %res1 = bitcast <32 x i8> %res0 to <4 x i64>
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_mask_broadcastb_epi8(<4 x i64> %a0, i32 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpbroadcastb %xmm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastb %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast i32 %a1 to <32 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <32 x i32> zeroinitializer
+ %res1 = select <32 x i1> %arg1, <32 x i8> %res0, <32 x i8> %arg0
+ %res2 = bitcast <32 x i8> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_maskz_broadcastb_epi8(i32 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i32 %a0 to <32 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <32 x i32> zeroinitializer
+ %res1 = select <32 x i1> %arg0, <32 x i8> %res0, <32 x i8> zeroinitializer
+ %res2 = bitcast <32 x i8> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %res1 = bitcast <8 x i16> %res0 to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @test_mm_mask_broadcastw_epi16(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_mask_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x i16> %res0, <8 x i16> %arg0
+ %res2 = bitcast <8 x i16> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_mm_maskz_broadcastw_epi16(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x i16> %res0, <8 x i16> zeroinitializer
+ %res2 = bitcast <8 x i16> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_broadcastw_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <16 x i32> zeroinitializer
+ %res1 = bitcast <16 x i16> %res0 to <4 x i64>
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg1, <16 x i16> %res0, <16 x i16> %arg0
+ %res2 = bitcast <16 x i16> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg0, <16 x i16> %res0, <16 x i16> zeroinitializer
+ %res2 = bitcast <16 x i16> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+!0 = !{i32 1}
+
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..9373561ea3ae
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -0,0 +1,629 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x78,0xd0]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
+; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc9]
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
+ %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
+ %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res2, %res3
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x78,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc9]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
+ %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res2, %res3
+ ret <16 x i8> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x79,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc9]
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
+ %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res2, %res3
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x79,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc9]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res2, %res3
+ ret <8 x i16> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
+; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
+; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
+; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
+ %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
+ %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res2, %res3
+ ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
+; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
+; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
+ %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res2, %res3
+ ret <32 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.storeu.b.128(i8*, <16 x i8>, i16)
+
+define void@test_int_x86_avx512_mask_storeu_b_128(i8* %ptr1, i8* %ptr2, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqu8 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7f,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr1, <16 x i8> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr2, <16 x i8> %x1, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.b.256(i8*, <32 x i8>, i32)
+
+define void@test_int_x86_avx512_mask_storeu_b_256(i8* %ptr1, i8* %ptr2, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
+; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqu8 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7f,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2)
+ call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.128(i8*, <8 x i16>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_w_128(i8* %ptr1, i8* %ptr2, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqu16 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xff,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr1, <8 x i16> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr2, <8 x i16> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.256(i8*, <16 x i16>, i16)
+
+define void@test_int_x86_avx512_mask_storeu_w_256(i8* %ptr1, i8* %ptr2, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqu16 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xff,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8*, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x06]
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> %x1, i8 -1)
+ %res = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr2, <8 x i16> %res0, i8 %mask)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> zeroinitializer, i8 %mask)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8*, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xff,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x06]
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> %x1, i16 -1)
+ %res = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr2, <16 x i16> %res0, i16 %mask)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> zeroinitializer, i16 %mask)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8*, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x06]
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> %x1, i16 -1)
+ %res = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr2, <16 x i8> %res0, i16 %mask)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> zeroinitializer, i16 %mask)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8*, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
+; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x06]
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res0 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> %x1, i32 -1)
+ %res = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr2, <32 x i8> %res0, i32 %mask)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> zeroinitializer, i32 %mask)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0x7d,0x08,0x0f,0xd9,0x02]
+; CHECK-NEXT: ## xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
+; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x0f,0xd9,0x02]
+; CHECK-NEXT: ## ymm3 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x0f,0xd1,0x02]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
+; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16>, i32, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x70,0xd0,0x03]
+; CHECK-NEXT: ## xmm2 = xmm0[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16>, i32, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x70,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16>, i32, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7f,0x08,0x70,0xd0,0x03]
+; CHECK-NEXT: ## xmm2 = xmm0[3,0,0,0,4,5,6,7]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7]
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16>, i32, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7f,0x28,0x70,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
+
+declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x68,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x60,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x68,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x68,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x60,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x60,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x61,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x69,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x61,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x69,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 4cbb9ba6c56a..534d5c85f008 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1,124 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
; 256-bit
-define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_pcmpeq_b_256
-; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
- %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
- ret i32 %res
-}
-
-define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_b_256
-; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
-
-define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_pcmpeq_w_256
-; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_w_256
-; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
-
-define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_pcmpgt_b_256
-; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
- %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
- ret i32 %res
-}
-
-define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_b_256
-; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
-
-define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_pcmpgt_w_256
-; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_w_256
-; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
-
define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
-; CHECK_LABEL: test_cmp_b_256
-; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_cmp_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x00]
+; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
+; CHECK-NEXT: vpcmpltb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
+; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: vpcmpunordb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x03]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpnleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpordb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x07]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x03]
+; CHECK-NEXT: vmovd %r8d, %xmm1 ## encoding: [0x62,0xd1,0x7d,0x08,0x6e,0xc8]
+; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
%res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
%vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
%res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
%vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
%res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
%vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
%res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
%vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
%res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
%vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
%res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
%res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
ret <8 x i32> %vec7
}
define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
-; CHECK_LABEL: test_mask_cmp_b_256
-; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x00]
+; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
+; CHECK-NEXT: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
+; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x03]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x05]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x07]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x03]
+; CHECK-NEXT: vmovd %r8d, %xmm1 ## encoding: [0x62,0xd1,0x7d,0x08,0x6e,0xc8]
+; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
%vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
%vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
%vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
%vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
%vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
ret <8 x i32> %vec7
@@ -127,58 +103,97 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
-; CHECK_LABEL: test_ucmp_b_256
-; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_ucmp_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x00]
+; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
+; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
+; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: vpcmpunordub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x03]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: vpcmpnequb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpordub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x07]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x03]
+; CHECK-NEXT: vmovd %r8d, %xmm1 ## encoding: [0x62,0xd1,0x7d,0x08,0x6e,0xc8]
+; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
%res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
%vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
%res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
%vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
%res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
%vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
%res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
%vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
%res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
%vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
%res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
%res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
ret <8 x i32> %vec7
}
define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
-; CHECK_LABEL: test_mask_ucmp_b_256
-; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x00]
+; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
+; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
+; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x03]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x07]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x03]
+; CHECK-NEXT: vmovd %r8d, %xmm1 ## encoding: [0x62,0xd1,0x7d,0x08,0x6e,0xc8]
+; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
%vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
%vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
%vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
%vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
%vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
ret <8 x i32> %vec7
@@ -187,58 +202,95 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask)
declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK_LABEL: test_cmp_w_256
-; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_cmp_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x01]
+; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordw %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnlew %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordw %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
%res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
%res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
%res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
%res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
%res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
%res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
%res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
}
define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
-; CHECK_LABEL: test_mask_cmp_w_256
-; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltw %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd1,0x01]
+; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordw %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnlew %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
@@ -247,58 +299,95 @@ define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask)
declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK_LABEL: test_ucmp_w_256
-; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_ucmp_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunorduw %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequw %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xf1,0x06]
+; CHECK-NEXT: vpcmporduw %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
%res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
%res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
%res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
%res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
%res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
%res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
%res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
}
define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
-; CHECK_LABEL: test_mask_ucmp_w_256
-; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunorduw %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequw %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xf9,0x06]
+; CHECK-NEXT: vpcmporduw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
@@ -309,15 +398,24 @@ declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) n
; 128-bit
define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_pcmpeq_b_128
-; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_pcmpeq_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
ret i16 %res
}
define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_b_128
-; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_pcmpeq_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
ret i16 %res
}
@@ -325,15 +423,24 @@ define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_pcmpeq_w_128
-; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_pcmpeq_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_w_128
-; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_pcmpeq_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
ret i8 %res
}
@@ -341,15 +448,24 @@ define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_pcmpgt_b_128
-; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_pcmpgt_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
ret i16 %res
}
define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_b_128
-; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_pcmpgt_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
ret i16 %res
}
@@ -357,15 +473,24 @@ define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_pcmpgt_w_128
-; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_pcmpgt_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x65,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_w_128
-; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_pcmpgt_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
ret i8 %res
}
@@ -373,58 +498,95 @@ define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK_LABEL: test_cmp_b_128
-; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_cmp_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x01]
+; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordb %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleb %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordb %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
%res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
%res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
%res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
%res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
%res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
%res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
%res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
}
define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
-; CHECK_LABEL: test_mask_cmp_b_128
-; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltb %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd1,0x01]
+; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordb %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleb %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
@@ -433,58 +595,95 @@ define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK_LABEL: test_ucmp_b_128
-; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_ucmp_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordub %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequb %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xf1,0x06]
+; CHECK-NEXT: vpcmpordub %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
%res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
%res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
%res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
%res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
%res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
%res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
%res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
}
define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
-; CHECK_LABEL: test_mask_ucmp_b_128
-; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordub %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequb %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xf9,0x06]
+; CHECK-NEXT: vpcmpordub %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
@@ -493,58 +692,95 @@ define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask)
declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK_LABEL: test_cmp_w_128
-; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_cmp_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x01]
+; CHECK-NEXT: vpcmplew %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordw %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltw %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnlew %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordw %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_cmp_w_128
-; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltw %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd1,0x01]
+; CHECK-NEXT: vpcmplew %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordw %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltw %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnlew %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -553,58 +789,95 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK_LABEL: test_ucmp_w_128
-; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_ucmp_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleuw %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunorduw %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequw %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltuw %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleuw %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xf1,0x06]
+; CHECK-NEXT: vpcmporduw %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_ucmp_w_128
-; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleuw %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunorduw %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequw %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuw %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuw %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xf9,0x06]
+; CHECK-NEXT: vpcmporduw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -615,8 +888,11 @@ declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounw
declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd256_ps
- ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd256_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
@@ -624,8 +900,11 @@ define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8
declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps
- ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -634,7 +913,10 @@ declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x doub
define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd256_pd:
-; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
ret <4 x double> %res
}
@@ -643,7 +925,10 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x doub
define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd128_pd:
-; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
ret <2 x double> %res
}
@@ -651,13 +936,12 @@ define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2
define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xda]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -669,13 +953,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd9]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -687,13 +970,12 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xda]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -703,13 +985,12 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2
define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xda]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -721,13 +1002,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd9]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -739,13 +1019,12 @@ declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xda]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -755,13 +1034,12 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xda]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -773,13 +1051,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd9]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -791,13 +1068,12 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa8,0xda]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -807,13 +1083,12 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x
define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xda]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -825,13 +1100,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd9]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -843,13 +1117,12 @@ declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xda]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -862,13 +1135,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd9]
+; CHECK-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xaa,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -881,13 +1153,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd9]
+; CHECK-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xaa,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -899,13 +1170,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd9]
+; CHECK-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xaa,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -917,13 +1187,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd9]
+; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xaa,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -933,8 +1202,11 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x
declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd256_ps
- ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
+; CHECK-LABEL: test_mask_vfnmadd256_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
@@ -942,8 +1214,11 @@ define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8
declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd128_ps
- ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
+; CHECK-LABEL: test_mask_vfnmadd128_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -951,8 +1226,11 @@ define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4
declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd256_pd
- ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
+; CHECK-LABEL: test_mask_vfnmadd256_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
@@ -960,8 +1238,11 @@ define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1,
declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd128_pd
- ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
+; CHECK-LABEL: test_mask_vfnmadd128_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
@@ -969,8 +1250,11 @@ define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1,
declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub256_ps
- ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
+; CHECK-LABEL: test_mask_vfnmsub256_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
@@ -978,8 +1262,11 @@ define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8
declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub128_ps
- ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
+; CHECK-LABEL: test_mask_vfnmsub128_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -987,8 +1274,11 @@ define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4
declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub256_pd
- ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
+; CHECK-LABEL: test_mask_vfnmsub256_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
@@ -996,8 +1286,11 @@ define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1,
declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub128_pd
- ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
+; CHECK-LABEL: test_mask_vfnmsub128_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
@@ -1006,13 +1299,12 @@ define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1,
define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xda]
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xae,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1024,13 +1316,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x do
define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd9]
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xae,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1040,13 +1331,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <
define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xda]
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xae,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1058,13 +1348,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x do
define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd9]
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xae,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1074,13 +1363,12 @@ define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <
define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xda]
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xae,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1092,13 +1380,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x floa
define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd9]
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xae,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1108,13 +1395,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4
define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xda]
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xae,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1126,13 +1412,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x floa
define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd9]
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xae,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1142,13 +1427,12 @@ define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8
define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xda]
+; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xac,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1158,13 +1442,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2
define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xda]
+; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xac,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1174,13 +1457,12 @@ define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4
define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xda]
+; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xac,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1190,13 +1472,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x
define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xda]
+; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xac,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1207,7 +1488,10 @@ declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x flo
define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub256_ps:
-; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
ret <8 x float> %res
}
@@ -1216,7 +1500,10 @@ declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x flo
define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub128_ps:
-; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
ret <4 x float> %res
}
@@ -1224,8 +1511,11 @@ define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4
declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmaddsub256_pd
- ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
+; CHECK-LABEL: test_mask_vfmaddsub256_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
@@ -1233,8 +1523,11 @@ define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a
declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmaddsub128_pd
- ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
+; CHECK-LABEL: test_mask_vfmaddsub128_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
@@ -1242,13 +1535,12 @@ define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a
define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1260,13 +1552,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd9]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1278,13 +1569,12 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1294,13 +1584,12 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0,
define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1312,13 +1601,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd9]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1330,13 +1618,12 @@ declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1346,13 +1633,12 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0,
define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1364,13 +1650,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd9]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1382,13 +1667,12 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1398,13 +1682,12 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <
define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1416,13 +1699,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd9]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1434,13 +1716,12 @@ declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1452,13 +1733,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd9]
+; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa7,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2=fadd <2 x double> %res, %res1
@@ -1470,13 +1750,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd9]
+; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa7,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2=fadd <4 x double> %res, %res1
@@ -1488,13 +1767,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd9]
+; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa7,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2=fadd <4 x float> %res, %res1
@@ -1506,13 +1784,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd9]
+; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa7,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2=fadd <8 x float> %res, %res1
@@ -1521,54 +1798,72 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <
define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_r
- ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_ps_r:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rz
- ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
- ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
- ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmka:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
- ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
- ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb
- ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmb:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
@@ -1579,8 +1874,11 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
}
define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba
- ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmba:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
@@ -1591,8 +1889,10 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1
}
define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz
- ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
@@ -1603,8 +1903,10 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1
}
define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza
- ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
@@ -1615,104 +1917,142 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
}
define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_pd_r
- ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_pd_r:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_pd_rz
- ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_pd_rz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
ret <2 x double> %res
}
define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
- ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_pd_rmk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
- ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
ret <2 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd256_pd_r
- ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd256_pd_r:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
- ; CHECK-LABEL: test_mask_vfmadd256_pd_rz
- ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd256_pd_rz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
ret <4 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
- ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd256_pd_rmk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
- ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
ret <4 x double> %res
}
define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_add_epi16_rr_128
- ;CHECK: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrk_128
- ;CHECK: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1]
+; CHECK-LABEL: test_mask_add_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrkz_128
- ;CHECK: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi16_rm_128
- ;CHECK: vpaddw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmk_128
- ;CHECK: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f]
+; CHECK-LABEL: test_mask_add_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmkz_128
- ;CHECK: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -1721,45 +2061,63 @@ define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_add_epi16_rr_256
- ;CHECK: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrk_256
- ;CHECK: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1]
+; CHECK-LABEL: test_mask_add_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrkz_256
- ;CHECK: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi16_rm_256
- ;CHECK: vpaddw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmk_256
- ;CHECK: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f]
+; CHECK-LABEL: test_mask_add_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmkz_256
- ;CHECK: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -1768,45 +2126,63 @@ define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_
declare <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rr_128
- ;CHECK: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrk_128
- ;CHECK: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1]
+; CHECK-LABEL: test_mask_sub_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrkz_128
- ;CHECK: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rm_128
- ;CHECK: vpsubw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmk_128
- ;CHECK: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f]
+; CHECK-LABEL: test_mask_sub_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmkz_128
- ;CHECK: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -1815,45 +2191,63 @@ define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rr_256
- ;CHECK: vpsubw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrk_256
- ;CHECK: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1]
+; CHECK-LABEL: test_mask_sub_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrkz_256
- ;CHECK: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rm_256
- ;CHECK: vpsubw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmk_256
- ;CHECK: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f]
+; CHECK-LABEL: test_mask_sub_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmkz_256
- ;CHECK: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -1862,45 +2256,63 @@ define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_
declare <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <32 x i16> @test_mask_add_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
- ;CHECK-LABEL: test_mask_add_epi16_rr_512
- ;CHECK: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrk_512
- ;CHECK: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1]
+; CHECK-LABEL: test_mask_add_epi16_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrkz_512
- ;CHECK: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi16_rm_512
- ;CHECK: vpaddw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmk_512
- ;CHECK: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f]
+; CHECK-LABEL: test_mask_add_epi16_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmkz_512
- ;CHECK: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
@@ -1909,45 +2321,63 @@ define <32 x i16> @test_mask_add_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_
declare <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16> @test_mask_sub_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rr_512
- ;CHECK: vpsubw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrk_512
- ;CHECK: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1]
+; CHECK-LABEL: test_mask_sub_epi16_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrkz_512
- ;CHECK: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rm_512
- ;CHECK: vpsubw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmk_512
- ;CHECK: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f]
+; CHECK-LABEL: test_mask_sub_epi16_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmkz_512
- ;CHECK: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
@@ -1956,45 +2386,63 @@ define <32 x i16> @test_mask_sub_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_
declare <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16> @test_mask_mullo_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rr_512
- ;CHECK: vpmullw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrk_512
- ;CHECK: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_512
- ;CHECK: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rm_512
- ;CHECK: vpmullw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmk_512
- ;CHECK: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi16_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_512
- ;CHECK: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
@@ -2003,45 +2451,63 @@ define <32 x i16> @test_mask_mullo_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %pt
declare <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rr_128
- ;CHECK: vpmullw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrk_128
- ;CHECK: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_128
- ;CHECK: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rm_128
- ;CHECK: vpmullw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmk_128
- ;CHECK: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_128
- ;CHECK: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2050,45 +2516,63 @@ define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b
declare <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rr_256
- ;CHECK: vpmullw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrk_256
- ;CHECK: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_256
- ;CHECK: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rm_256
- ;CHECK: vpmullw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmk_256
- ;CHECK: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_256
- ;CHECK: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2098,53 +2582,73 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16>, <16 x i16>, <16
define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rr_128
- ;CHECK: vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
+; CHECK-LABEL: test_mask_packs_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rrk_128
- ;CHECK: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
+; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rrkz_128
- ;CHECK: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
+; CHECK-LABEL: test_mask_packs_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rm_128
- ;CHECK: vpackssdw (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmk_128
- ;CHECK: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
+; CHECK-LABEL: test_mask_packs_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmkz_128
- ;CHECK: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmb_128
- ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2153,8 +2657,12 @@ define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
}
define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmbk_128
- ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
+; CHECK-LABEL: test_mask_packs_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2163,8 +2671,11 @@ define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x
}
define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_128
- ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2175,53 +2686,73 @@ define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8
declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rr_256
- ;CHECK: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
+; CHECK-LABEL: test_mask_packs_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rrk_256
- ;CHECK: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
+; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rrkz_256
- ;CHECK: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
+; CHECK-LABEL: test_mask_packs_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rm_256
- ;CHECK: vpackssdw (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmk_256
- ;CHECK: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
+; CHECK-LABEL: test_mask_packs_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmkz_256
- ;CHECK: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmb_256
- ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2230,8 +2761,12 @@ define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
}
define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmbk_256
- ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
+; CHECK-LABEL: test_mask_packs_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2240,8 +2775,11 @@ define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16
}
define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_256
- ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2252,45 +2790,63 @@ define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i1
declare <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_packs_epi16_rr_128
- ;CHECK: vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
+; CHECK-LABEL: test_mask_packs_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x63,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rrk_128
- ;CHECK: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0xd1]
+; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rrkz_128
- ;CHECK: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0xc1]
+; CHECK-LABEL: test_mask_packs_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi16_rm_128
- ;CHECK: vpacksswb (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0x07]
+; CHECK-LABEL: test_mask_packs_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x63,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rmk_128
- ;CHECK: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0x0f]
+; CHECK-LABEL: test_mask_packs_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rmkz_128
- ;CHECK: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0x07]
+; CHECK-LABEL: test_mask_packs_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -2299,45 +2855,63 @@ define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b
declare <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_packs_epi16_rr_256
- ;CHECK: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
+; CHECK-LABEL: test_mask_packs_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x63,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rrk_256
- ;CHECK: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0xd1]
+; CHECK-LABEL: test_mask_packs_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rrkz_256
- ;CHECK: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0xc1]
+; CHECK-LABEL: test_mask_packs_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi16_rm_256
- ;CHECK: vpacksswb (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0x07]
+; CHECK-LABEL: test_mask_packs_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x63,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rmk_256
- ;CHECK: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0x0f]
+; CHECK-LABEL: test_mask_packs_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rmkz_256
- ;CHECK: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0x07]
+; CHECK-LABEL: test_mask_packs_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -2347,53 +2921,73 @@ declare <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16>, <16 x i16>, <32
define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rr_128
- ;CHECK: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x2b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rrk_128
- ;CHECK: vpackusdw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rrkz_128
- ;CHECK: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rm_128
- ;CHECK: vpackusdw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmk_128
- ;CHECK: vpackusdw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmkz_128
- ;CHECK: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmb_128
- ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2402,8 +2996,12 @@ define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
}
define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmbk_128
- ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2412,8 +3010,11 @@ define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8
}
define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_128
- ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2424,53 +3025,73 @@ define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8
declare <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rr_256
- ;CHECK: vpackusdw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x2b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rrk_256
- ;CHECK: vpackusdw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rrkz_256
- ;CHECK: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rm_256
- ;CHECK: vpackusdw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmk_256
- ;CHECK: vpackusdw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmkz_256
- ;CHECK: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmb_256
- ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2479,8 +3100,12 @@ define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
}
define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmbk_256
- ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2489,8 +3114,11 @@ define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <1
}
define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_256
- ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2501,45 +3129,63 @@ define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i
declare <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_packus_epi16_rr_128
- ;CHECK: vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x67,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rrk_128
- ;CHECK: vpackuswb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rrkz_128
- ;CHECK: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi16_rm_128
- ;CHECK: vpackuswb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x67,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rmk_128
- ;CHECK: vpackuswb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rmkz_128
- ;CHECK: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -2548,45 +3194,63 @@ define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_
declare <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_packus_epi16_rr_256
- ;CHECK: vpackuswb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x67,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rrk_256
- ;CHECK: vpackuswb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rrkz_256
- ;CHECK: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi16_rm_256
- ;CHECK: vpackuswb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x67,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rmk_256
- ;CHECK: vpackuswb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rmkz_256
- ;CHECK: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -2595,45 +3259,63 @@ define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %pt
declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_adds_epi16_rr_128
- ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rrk_128
- ;CHECK: vpaddsw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rrkz_128
- ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epi16_rm_128
- ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rmk_128
- ;CHECK: vpaddsw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rmkz_128
- ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2642,45 +3324,63 @@ define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_adds_epi16_rr_256
- ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rrk_256
- ;CHECK: vpaddsw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rrkz_256
- ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epi16_rm_256
- ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rmk_256
- ;CHECK: vpaddsw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rmkz_256
- ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2689,45 +3389,63 @@ define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_subs_epi16_rr_128
- ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rrk_128
- ;CHECK: vpsubsw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rrkz_128
- ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epi16_rm_128
- ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rmk_128
- ;CHECK: vpsubsw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rmkz_128
- ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2736,45 +3454,63 @@ define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_subs_epi16_rr_256
- ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rrk_256
- ;CHECK: vpsubsw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rrkz_256
- ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epi16_rm_256
- ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rmk_256
- ;CHECK: vpsubsw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rmkz_256
- ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2783,45 +3519,63 @@ define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_adds_epu16_rr_128
- ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epu16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rrk_128
- ;CHECK: vpaddusw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rrkz_128
- ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epu16_rm_128
- ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epu16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rmk_128
- ;CHECK: vpaddusw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rmkz_128
- ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2830,45 +3584,63 @@ define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_adds_epu16_rr_256
- ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epu16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rrk_256
- ;CHECK: vpaddusw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rrkz_256
- ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epu16_rm_256
- ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epu16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rmk_256
- ;CHECK: vpaddusw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rmkz_256
- ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2877,45 +3649,63 @@ define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_subs_epu16_rr_128
- ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epu16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rrk_128
- ;CHECK: vpsubusw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rrkz_128
- ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epu16_rm_128
- ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epu16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rmk_128
- ;CHECK: vpsubusw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rmkz_128
- ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2924,45 +3714,63 @@ define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_subs_epu16_rr_256
- ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epu16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rrk_256
- ;CHECK: vpsubusw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rrkz_256
- ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epu16_rm_256
- ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epu16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rmk_256
- ;CHECK: vpsubusw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rmkz_256
- ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2971,45 +3779,63 @@ define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
- ;CHECK-LABEL: test_mask_adds_epi8_rr_128
- ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epi8_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rrk_128
- ;CHECK: vpaddsb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rrkz_128
- ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epi8_rm_128
- ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epi8_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rmk_128
- ;CHECK: vpaddsb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rmkz_128
- ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -3018,45 +3844,63 @@ define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b,
declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
- ;CHECK-LABEL: test_mask_adds_epi8_rr_256
- ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epi8_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rrk_256
- ;CHECK: vpaddsb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epi8_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rrkz_256
- ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epi8_rm_256
- ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epi8_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rmk_256
- ;CHECK: vpaddsb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epi8_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rmkz_256
- ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -3065,45 +3909,63 @@ define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b,
declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
- ;CHECK-LABEL: test_mask_subs_epi8_rr_128
- ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epi8_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rrk_128
- ;CHECK: vpsubsb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rrkz_128
- ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epi8_rm_128
- ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epi8_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rmk_128
- ;CHECK: vpsubsb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rmkz_128
- ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -3112,45 +3974,63 @@ define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b,
declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
- ;CHECK-LABEL: test_mask_subs_epi8_rr_256
- ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epi8_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rrk_256
- ;CHECK: vpsubsb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rrkz_256
- ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epi8_rm_256
- ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epi8_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rmk_256
- ;CHECK: vpsubsb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rmkz_256
- ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -3159,45 +4039,63 @@ define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b,
declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
- ;CHECK-LABEL: test_mask_adds_epu8_rr_128
- ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epu8_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rrk_128
- ;CHECK: vpaddusb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rrkz_128
- ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epu8_rm_128
- ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epu8_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rmk_128
- ;CHECK: vpaddusb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rmkz_128
- ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -3206,45 +4104,63 @@ define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b,
declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
- ;CHECK-LABEL: test_mask_adds_epu8_rr_256
- ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epu8_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rrk_256
- ;CHECK: vpaddusb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rrkz_256
- ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epu8_rm_256
- ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epu8_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rmk_256
- ;CHECK: vpaddusb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rmkz_256
- ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -3253,45 +4169,63 @@ define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b,
declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
- ;CHECK-LABEL: test_mask_subs_epu8_rr_128
- ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epu8_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rrk_128
- ;CHECK: vpsubusb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rrkz_128
- ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epu8_rm_128
- ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epu8_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rmk_128
- ;CHECK: vpsubusb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rmkz_128
- ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -3300,45 +4234,63 @@ define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b,
declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
- ;CHECK-LABEL: test_mask_subs_epu8_rr_256
- ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epu8_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rrk_256
- ;CHECK: vpsubusb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rrkz_256
- ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epu8_rm_256
- ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epu8_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rmk_256
- ;CHECK: vpsubusb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rmkz_256
- ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -3348,11 +4300,14 @@ declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x
declare <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_128
-; CHECK-NOT: call
-; CHECK: vpmaxsb %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1]
+; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2 ,i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
@@ -3361,11 +4316,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_256
-; CHECK-NOT: call
-; CHECK: vpmaxsb %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3c,0xd1]
+; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x3c,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3374,11 +4332,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %
declare <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_128
-; CHECK-NOT: call
-; CHECK: vpmaxsw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1]
+; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xee,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3387,11 +4348,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %
declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_256
-; CHECK-NOT: call
-; CHECK: vpmaxsw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1]
+; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
@@ -3400,11 +4364,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16
declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_128
-; CHECK-NOT: call
-; CHECK: vpmaxub %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2,i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1]
+; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
@@ -3413,11 +4380,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_256
-; CHECK-NOT: call
-; CHECK: vpmaxub %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xde,0xd1]
+; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xde,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3426,11 +4396,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %
declare <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_128
-; CHECK-NOT: call
-; CHECK: vpmaxuw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1]
+; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x3e,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3439,11 +4412,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %
declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_256
-; CHECK-NOT: call
-; CHECK: vpmaxuw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1]
+; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
@@ -3452,11 +4428,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16
declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_128
-; CHECK-NOT: call
-; CHECK: vpminsb %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1]
+; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
@@ -3465,11 +4444,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_256
-; CHECK-NOT: call
-; CHECK: vpminsb %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x38,0xd1]
+; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x38,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3478,11 +4460,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %
declare <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_128
-; CHECK-NOT: call
-; CHECK: vpminsw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1]
+; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xea,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3491,11 +4476,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %
declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_256
-; CHECK-NOT: call
-; CHECK: vpminsw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1]
+; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
@@ -3504,11 +4492,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16
declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_128
-; CHECK-NOT: call
-; CHECK: vpminub %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1]
+; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
@@ -3517,11 +4508,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_256
-; CHECK-NOT: call
-; CHECK: vpminub %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xda,0xd1]
+; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xda,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3530,11 +4524,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %
declare <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_128
-; CHECK-NOT: call
-; CHECK: vpminuw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3a,0xd1]
+; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x3a,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3543,11 +4540,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %
declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_256
-; CHECK-NOT: call
-; CHECK: vpminuw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3a,0xd1]
+; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
@@ -3556,12 +4556,15 @@ define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16
declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2w %xmm{{.*}}{%k1}
-; CHECK-NOT: {z}
define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xda]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xca]
+; CHECK-NEXT: vpaddw %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3570,11 +4573,15 @@ define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x
declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2w %xmm{{.*}}{%k1} {z}
define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xda]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xca]
+; CHECK-NEXT: vpaddw %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3583,11 +4590,15 @@ define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x
declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2w %ymm{{.*}}{%k1}
define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xda]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xca]
+; CHECK-NEXT: vpaddw %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3596,11 +4607,15 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16
declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2w %ymm{{.*}}{%k1} {z}
define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xda]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xca]
+; CHECK-NEXT: vpaddw %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3609,11 +4624,15 @@ define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <1
declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2w %xmm{{.*}}{%k1}
define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x75,0xda]
+; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x75,0xca]
+; CHECK-NEXT: vpaddw %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3622,11 +4641,15 @@ define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x
declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2w %ymm{{.*}}{%k1}
define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x75,0xda]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x75,0xca]
+; CHECK-NEXT: vpaddw %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3635,11 +4658,14 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16
declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_128
-; CHECK-NOT: call
-; CHECK: vpavgb %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
+; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe0,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
%res2 = add <16 x i8> %res, %res1
@@ -3648,11 +4674,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x
declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_256
-; CHECK-NOT: call
-; CHECK: vpavgb %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
+; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe0,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3661,11 +4690,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x
declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_128
-; CHECK-NOT: call
-; CHECK: vpavgw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
+; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe3,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3674,11 +4706,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x
declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_256
-; CHECK-NOT: call
-; CHECK: vpavgw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
+; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe3,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3687,11 +4722,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16>
declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpshufb %xmm{{.*}}{%k1}
define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1]
+; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
%res2 = add <16 x i8> %res, %res1
@@ -3700,11 +4738,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpshufb %ymm{{.*}}{%k1}
define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x00,0xd1]
+; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3713,11 +4754,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %
declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsb{{.*}}{%k1}
define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1c,0xc8]
+; CHECK-NEXT: vpabsb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x1c,0xc0]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
%res2 = add <16 x i8> %res, %res1
@@ -3726,11 +4770,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x
declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsb{{.*}}{%k1}
define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpabsb %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1c,0xc8]
+; CHECK-NEXT: vpabsb %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x1c,0xc0]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3739,11 +4786,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x
declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsw{{.*}}{%k1}
define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1d,0xc8]
+; CHECK-NEXT: vpabsw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x1d,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3752,57 +4802,30 @@ define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x
declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsw{{.*}}{%k1}
define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsw %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1d,0xc8]
+; CHECK-NEXT: vpabsw %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x1d,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
-; CHECK-LABEL: test_x86_mask_blend_b_256
-; CHECK: vpblendmb
-define <32 x i8> @test_x86_mask_blend_b_256(i32 %a0, <32 x i8> %a1, <32 x i8> %a2) {
- %res = call <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8> %a1, <32 x i8> %a2, i32 %a0) ; <<32 x i8>> [#uses=1]
- ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8>, <32 x i8>, i32) nounwind readonly
-
-; CHECK-LABEL: test_x86_mask_blend_w_256
-define <16 x i16> @test_x86_mask_blend_w_256(i16 %mask, <16 x i16> %a1, <16 x i16> %a2) {
- ; CHECK: vpblendmw
- %res = call <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16> %a1, <16 x i16> %a2, i16 %mask) ; <<16 x i16>> [#uses=1]
- ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16>, <16 x i16>, i16) nounwind readonly
-
-; CHECK-LABEL: test_x86_mask_blend_b_128
-; CHECK: vpblendmb
-define <16 x i8> @test_x86_mask_blend_b_128(i16 %a0, <16 x i8> %a1, <16 x i8> %a2) {
- %res = call <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8> %a1, <16 x i8> %a2, i16 %a0) ; <<16 x i8>> [#uses=1]
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8>, <16 x i8>, i16) nounwind readonly
-
-; CHECK-LABEL: test_x86_mask_blend_w_128
-define <8 x i16> @test_x86_mask_blend_w_128(i8 %mask, <8 x i16> %a1, <8 x i16> %a2) {
- ; CHECK: vpblendmw
- %res = call <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16> %a1, <8 x i16> %a2, i8 %mask) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16>, <8 x i16>, i8) nounwind readonly
-
declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhuw {{.*}}encoding: [0x62
define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1]
+; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe4,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3811,12 +4834,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16>
declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhuw {{.*}}encoding: [0x62
define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1]
+; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe4,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3825,12 +4850,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i1
declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhw {{.*}}encoding: [0x62
define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1]
+; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe5,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3838,12 +4865,15 @@ define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %
}
declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhw {{.*}}encoding: [0x62
+
define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1]
+; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe5,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3851,12 +4881,15 @@ define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16
}
declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+
define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1]
+; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x0b,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3864,12 +4897,15 @@ define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16>
}
declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+
define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1]
+; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x0b,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3880,9 +4916,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
-; CHECK: vpmovwb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovwb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovwb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
+; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
+; CHECK-NEXT: vpmovwb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3895,8 +4936,11 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
-; CHECK: vpmovwb %xmm0, (%rdi)
-; CHECK: vpmovwb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovwb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0x07]
+; CHECK-NEXT: vpmovwb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
@@ -3906,9 +4950,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
-; CHECK: vpmovswb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovswb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
+; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
+; CHECK-NEXT: vpmovswb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3921,8 +4970,11 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
-; CHECK: vpmovswb %xmm0, (%rdi)
-; CHECK: vpmovswb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
@@ -3932,9 +4984,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
-; CHECK: vpmovuswb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovuswb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3947,8 +5004,11 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
-; CHECK: vpmovuswb %xmm0, (%rdi)
-; CHECK: vpmovuswb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
@@ -3958,9 +5018,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
-; CHECK: vpmovwb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovwb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovwb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
+; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
+; CHECK-NEXT: vpmovwb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
@@ -3973,8 +5038,11 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
-; CHECK: vpmovwb %ymm0, (%rdi)
-; CHECK: vpmovwb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovwb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0x07]
+; CHECK-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
@@ -3984,9 +5052,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
-; CHECK: vpmovswb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovswb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1]
+; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2]
+; CHECK-NEXT: vpmovswb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
@@ -3999,8 +5072,11 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
-; CHECK: vpmovswb %ymm0, (%rdi)
-; CHECK: vpmovswb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
@@ -4010,9 +5086,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16
define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
-; CHECK: vpmovuswb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovuswb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1]
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2]
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
@@ -4025,8 +5106,11 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
-; CHECK: vpmovuswb %ymm0, (%rdi)
-; CHECK: vpmovuswb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
@@ -4037,12 +5121,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x
define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1]
+; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf5,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4054,12 +5137,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8
define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1]
+; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf5,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4071,12 +5153,11 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8
define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1]
+; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x04,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -4088,182 +5169,29 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <1
define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1]
+; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x04,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
-declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128:
-; CHECK: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[8],k1[8],xmm2[9],k1[9],xmm2[10],k1[10],xmm2[11],k1[11],xmm2[12],k1[12],xmm2[13],k1[13],xmm2[14],k1[14],xmm2[15],k1[15]
-; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x68,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
- %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
- %res2 = add <16 x i8> %res, %res1
- ret <16 x i8> %res2
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128:
-; CHECK: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3],xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7]
-; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x60,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
- %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
- %res2 = add <16 x i8> %res, %res1
- ret <16 x i8> %res2
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256:
-; CHECK: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15],ymm2[24],k1[24],ymm2[25],k1[25],ymm2[26],k1[26],ymm2[27],k1[27],ymm2[28],k1[28],ymm2[29],k1[29],ymm2[30],k1[30],ymm2[31],k1[31]
-; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x68,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
- %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
- %res2 = add <32 x i8> %res, %res1
- ret <32 x i8> %res2
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256:
-; CHECK: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[16],k1[16],ymm2[17],k1[17],ymm2[18],k1[18],ymm2[19],k1[19],ymm2[20],k1[20],ymm2[21],k1[21],ymm2[22],k1[22],ymm2[23],k1[23]
-; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x60,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
- %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
- %res2 = add <32 x i8> %res, %res1
- ret <32 x i8> %res2
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128:
-; CHECK: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3]
-; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x61,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
- %res2 = add <8 x i16> %res, %res1
- ret <8 x i16> %res2
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128:
-; CHECK: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7]
-; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x69,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
- %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
- %res2 = add <8 x i16> %res, %res1
- ret <8 x i16> %res2
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256:
-; CHECK: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11]
-; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x61,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
- %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
- %res2 = add <16 x i16> %res, %res1
- ret <16 x i16> %res2
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256:
-; CHECK: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15]
-; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x69,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
- %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
- %res2 = add <16 x i16> %res, %res1
- ret <16 x i16> %res2
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
- %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1)
- %res3 = add <16 x i8> %res, %res1
- %res4 = add <16 x i8> %res3, %res2
- ret <16 x i8> %res4
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
- %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1)
- %res3 = add <32 x i8> %res, %res1
- %res4 = add <32 x i8> %res3, %res2
- ret <32 x i8> %res4
-}
-
declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
+; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x02]
+; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x42,0xc1,0x02]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> zeroinitializer, i8 %x4)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 -1)
@@ -4277,13 +5205,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32,
define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
+; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x02]
+; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x42,0xc1,0x02]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> zeroinitializer, i16 %x4)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 -1)
@@ -4292,135 +5220,15 @@ define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8>
ret <16 x i16> %res4
}
-declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
- %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
- %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask)
- %res3 = add <32 x i8> %res, %res1
- %res4 = add <32 x i8> %res2, %res3
- ret <32 x i8> %res4
-}
-
-declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
- %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
- %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask)
- %res3 = add <16 x i8> %res, %res1
- %res4 = add <16 x i8> %res2, %res3
- ret <16 x i8> %res4
-}
-
-declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
- %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask)
- %res3 = add <16 x i16> %res, %res1
- %res4 = add <16 x i16> %res2, %res3
- ret <16 x i16> %res4
-}
-
-declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
- %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
- %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask)
- %res3 = add <8 x i16> %res, %res1
- %res4 = add <8 x i16> %res2, %res3
- ret <8 x i16> %res4
-}
-
-declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xd0]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xc0]
-; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
-; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
- %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
- %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask)
- %res3 = add <64 x i8> %res, %res1
- %res4 = add <64 x i8> %res2, %res3
- ret <64 x i8> %res4
-}
-
-declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xd0]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xc0]
-; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
-; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
- %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
- %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask)
- %res3 = add <32 x i16> %res, %res1
- %res4 = add <32 x i16> %res2, %res3
- ret <32 x i16> %res4
-}
-
declare i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8>)
define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovb2m %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovb2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x29,0xc0]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0)
ret i16 %res
}
@@ -4430,9 +5238,9 @@ declare i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8>)
define i32@test_int_x86_avx512_cvtb2mask_256(<32 x i8> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovb2m %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovb2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x29,0xc0]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8> %x0)
ret i32 %res
}
@@ -4442,9 +5250,10 @@ declare i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16>)
define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovw2m %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovw2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc0]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0)
ret i8 %res
}
@@ -4454,9 +5263,10 @@ declare i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16>)
define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovw2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x29,0xc0]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0)
ret i16 %res
}
@@ -4466,9 +5276,9 @@ declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16)
define <16 x i8>@test_int_x86_avx512_cvtmask2b_128(i16 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpmovm2b %k0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT: vpmovm2b %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x28,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16 %x0)
ret <16 x i8> %res
}
@@ -4478,9 +5288,9 @@ declare <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32)
define <32 x i8>@test_int_x86_avx512_cvtmask2b_256(i32 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k0
-; CHECK-NEXT: vpmovm2b %k0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
+; CHECK-NEXT: vpmovm2b %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x28,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32 %x0)
ret <32 x i8> %res
}
@@ -4490,10 +5300,9 @@ declare <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8)
define <8 x i16>@test_int_x86_avx512_cvtmask2w_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: vpmovm2w %k0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT: vpmovm2w %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x28,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8 %x0)
ret <8 x i16> %res
}
@@ -4503,9 +5312,9 @@ declare <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16)
define <16 x i16>@test_int_x86_avx512_cvtmask2w_256(i16 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpmovm2w %k0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT: vpmovm2w %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x28,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16 %x0)
ret <16 x i16> %res
}
@@ -4515,14 +5324,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
+; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xd9]
+; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd1,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -4536,13 +5344,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xd9]
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd1,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -4551,42 +5359,41 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
ret <16 x i16> %res4
}
-declare <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16>, i32, <8 x i16>, i8)
-define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x71,0xd0,0x03]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x71,0xd0,0x03]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res2, %res3
ret <8 x i16> %res4
}
-declare <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16>, i32, <16 x i16>, i16)
-define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xd0,0x03]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x71,0xd0,0x03]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
@@ -4597,13 +5404,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x10,0xd1]
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x10,0xd9]
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x10,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4617,14 +5424,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x10,0xd1]
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x10,0xd9]
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x10,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -4638,14 +5444,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe1,0xd1]
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe1,0xd9]
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe1,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -4654,22 +5459,21 @@ define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x
ret <8 x i16> %res4
}
-declare <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16>, i32, <8 x i16>, i8)
-define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03]
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x71,0xe0,0x03]
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x71,0xe0,0x03]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xca]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
@@ -4680,13 +5484,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe1,0xd1]
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe1,0xd9]
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe1,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4695,151 +5499,21 @@ define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16>
ret <16 x i16> %res4
}
-declare <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16>, i32, <16 x i16>, i16)
-define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
- %res3 = add <16 x i16> %res, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32>, i16, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i16 %x1, <4 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[3,0,0,0]
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> %x2, i8 -1)
- %res3 = add <4 x i32> %res, %res1
- %res4 = add <4 x i32> %res3, %res2
- ret <4 x i32> %res4
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32>, i16, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i16 %x1, <8 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> %x2, i8 -1)
- %res3 = add <8 x i32> %res, %res1
- %res4 = add <8 x i32> %res3, %res2
- ret <8 x i32> %res4
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16>, i8, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,7,4,4,4]
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
- %res3 = add <8 x i16> %res, %res1
- %res4 = add <8 x i16> %res3, %res2
- ret <8 x i16> %res4
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16>, i8, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
- %res3 = add <16 x i16> %res, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16>, i8, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[3,0,0,0,4,5,6,7]
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
- %res3 = add <8 x i16> %res, %res1
- %res4 = add <8 x i16> %res3, %res2
- ret <8 x i16> %res4
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16>, i8, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03]
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xe0,0x03]
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x71,0xe0,0x03]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xca]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
@@ -4850,13 +5524,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psrav16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav16_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x11,0xd1]
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x11,0xd9]
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x11,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4870,14 +5544,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x11,0xd1]
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x11,0xd9]
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x11,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -4892,14 +5565,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psll_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf1,0xd1]
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf1,0xd9]
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf1,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -4913,13 +5585,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psll_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf1,0xd1]
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf1,0xd9]
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf1,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4928,42 +5600,41 @@ define <16 x i16>@test_int_x86_avx512_mask_psll_w_256(<16 x i16> %x0, <8 x i16>
ret <16 x i16> %res4
}
-declare <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16>, i32, <8 x i16>, i8)
-define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03]
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x71,0xf0,0x03]
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x71,0xf0,0x03]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xca]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}
-declare <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16>, i32, <16 x i16>, i16)
-define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsllw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03]
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xf0,0x03]
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x71,0xf0,0x03]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xca]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
@@ -4974,13 +5645,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psllv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv16_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x12,0xd1]
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x12,0xd9]
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x12,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4994,14 +5665,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x12,0xd1]
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x12,0xd9]
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x12,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -5015,14 +5685,16 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x30,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xca]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1)
@@ -5036,13 +5708,16 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8>, <16 x i16>, i1
define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxbw %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x30,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xca]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1)
@@ -5057,14 +5732,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8]
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x20,0xd0]
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x20,0xc0]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xca]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1)
@@ -5078,13 +5752,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8>, <16 x i16>, i1
define <16 x i16>@test_int_x86_avx512_mask_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8]
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xd0]
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x20,0xc0]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xca]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1)
@@ -5098,14 +5772,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x25,0xd0]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x25,0xc0]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1)
@@ -5119,14 +5792,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8]
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xd0]
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x25,0xc0]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1)
@@ -5135,3 +5807,272 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64>
ret <4 x i64> %res4
}
+declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0]
+; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xd8]
+; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xc0]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xd8]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xc0]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i32@test_int_x86_avx512_ptestm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x26,0xc1]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8)
+
+define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i32@test_int_x86_avx512_ptestnm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x26,0xc1]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16 %x2)
+
+define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastb %dil, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
+; CHECK-NEXT: vpbroadcastb %dil, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
+; CHECK-NEXT: vpbroadcastb %dil, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res2, %res3
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastb %dil, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
+; CHECK-NEXT: vpbroadcastb %dil, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
+; CHECK-NEXT: vpbroadcastb %dil, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res2, %res3
+ ret <16 x i8> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastw %di, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; CHECK-NEXT: vpbroadcastw %di, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT: vpbroadcastw %di, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res2, %res3
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastw %di, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
+; CHECK-NEXT: vpbroadcastw %di, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
+; CHECK-NEXT: vpbroadcastw %di, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res2, %res3
+ ret <8 x i16> %res4
+}
diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll
index 8a9a4fa5e5e2..6bd9c9384050 100644
--- a/test/CodeGen/X86/avx512bwvl-mov.ll
+++ b/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -1,27 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
-; CHECK-LABEL: test_256_1
-; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
-; CHECK: ret
define <32 x i8> @test_256_1(i8 * %addr) {
+; CHECK-LABEL: test_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <32 x i8>*
%res = load <32 x i8>, <32 x i8>* %vaddr, align 1
ret <32 x i8>%res
}
-; CHECK-LABEL: test_256_2
-; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_2(i8 * %addr, <32 x i8> %data) {
+; CHECK-LABEL: test_256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7f,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <32 x i8>*
store <32 x i8>%data, <32 x i8>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_3
-; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
-; CHECK: ret
define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
+; CHECK-LABEL: test_256_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04]
+; CHECK-NEXT: vpblendmb (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x66,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <32 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i8>*
%r = load <32 x i8>, <32 x i8>* %vaddr, align 1
@@ -29,10 +35,13 @@ define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
ret <32 x i8>%res
}
-; CHECK-LABEL: test_256_4
-; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
-; CHECK: ret
define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
+; CHECK-LABEL: test_256_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <32 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i8>*
%r = load <32 x i8>, <32 x i8>* %vaddr, align 1
@@ -40,28 +49,33 @@ define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
ret <32 x i8>%res
}
-; CHECK-LABEL: test_256_5
-; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
-; CHECK: ret
define <16 x i16> @test_256_5(i8 * %addr) {
+; CHECK-LABEL: test_256_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xff,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i16>*
%res = load <16 x i16>, <16 x i16>* %vaddr, align 1
ret <16 x i16>%res
}
-; CHECK-LABEL: test_256_6
-; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_6(i8 * %addr, <16 x i16> %data) {
+; CHECK-LABEL: test_256_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xff,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i16>*
store <16 x i16>%data, <16 x i16>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_7
-; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
-; CHECK: ret
define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
+; CHECK-LABEL: test_256_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04]
+; CHECK-NEXT: vpblendmw (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x66,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i16>*
%r = load <16 x i16>, <16 x i16>* %vaddr, align 1
@@ -69,10 +83,13 @@ define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
ret <16 x i16>%res
}
-; CHECK-LABEL: test_256_8
-; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
-; CHECK: ret
define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
+; CHECK-LABEL: test_256_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i16>*
%r = load <16 x i16>, <16 x i16>* %vaddr, align 1
@@ -80,28 +97,33 @@ define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
ret <16 x i16>%res
}
-; CHECK-LABEL: test_128_1
-; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
-; CHECK: ret
define <16 x i8> @test_128_1(i8 * %addr) {
+; CHECK-LABEL: test_128_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i8>*
%res = load <16 x i8>, <16 x i8>* %vaddr, align 1
ret <16 x i8>%res
}
-; CHECK-LABEL: test_128_2
-; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_2(i8 * %addr, <16 x i8> %data) {
+; CHECK-LABEL: test_128_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7f,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i8>*
store <16 x i8>%data, <16 x i8>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_3
-; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
-; CHECK: ret
define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
+; CHECK-LABEL: test_128_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04]
+; CHECK-NEXT: vpblendmb (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x66,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i8>*
%r = load <16 x i8>, <16 x i8>* %vaddr, align 1
@@ -109,10 +131,13 @@ define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
ret <16 x i8>%res
}
-; CHECK-LABEL: test_128_4
-; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
-; CHECK: ret
define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
+; CHECK-LABEL: test_128_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i8>*
%r = load <16 x i8>, <16 x i8>* %vaddr, align 1
@@ -120,28 +145,33 @@ define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
ret <16 x i8>%res
}
-; CHECK-LABEL: test_128_5
-; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
-; CHECK: ret
define <8 x i16> @test_128_5(i8 * %addr) {
+; CHECK-LABEL: test_128_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i16>*
%res = load <8 x i16>, <8 x i16>* %vaddr, align 1
ret <8 x i16>%res
}
-; CHECK-LABEL: test_128_6
-; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_6(i8 * %addr, <8 x i16> %data) {
+; CHECK-LABEL: test_128_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xff,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i16>*
store <8 x i16>%data, <8 x i16>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_7
-; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
-; CHECK: ret
define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
+; CHECK-LABEL: test_128_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04]
+; CHECK-NEXT: vpblendmw (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x66,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i16>*
%r = load <8 x i16>, <8 x i16>* %vaddr, align 1
@@ -149,10 +179,13 @@ define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
ret <8 x i16>%res
}
-; CHECK-LABEL: test_128_8
-; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
-; CHECK: ret
define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
+; CHECK-LABEL: test_128_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i16>*
%r = load <8 x i16>, <8 x i16>* %vaddr, align 1
diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
index 9bf02fa41d9a..17e581bbb501 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
@@ -1,94 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; CHECK-LABEL: test256_1
-; CHECK: vpcmpeqb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
+; CHECK-LABEL: test256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
ret <32 x i8> %max
}
-; CHECK-LABEL: test256_2
-; CHECK: vpcmpgtb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+; CHECK-LABEL: test256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
ret <32 x i8> %max
}
-; CHECK-LABEL: @test256_3
-; CHECK: vpcmplew {{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind {
+; CHECK-LABEL: test256_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1
+; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
ret <16 x i16> %max
}
-; CHECK-LABEL: test256_4
-; CHECK: vpcmpnleub {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+; CHECK-LABEL: test256_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
ret <32 x i8> %max
}
-; CHECK-LABEL: test256_5
-; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind {
+; CHECK-LABEL: test256_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %yp, align 4
%mask = icmp eq <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_6
-; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp sgt <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_7
-; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp sle <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_8
-; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp ule <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_9
-; CHECK: vpcmpeqw %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+; CHECK-LABEL: test256_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <16 x i16> %x1, %y1
%mask0 = icmp eq <16 x i16> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -96,11 +107,13 @@ define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_10
-; CHECK: vpcmpleb %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+; CHECK-LABEL: test256_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <32 x i8> %x1, %y1
%mask0 = icmp sle <32 x i8> %x, %y
%mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
@@ -108,11 +121,13 @@ define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8
ret <32 x i8> %max
}
-; CHECK-LABEL: @test256_11
-; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+; CHECK-LABEL: test256_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <32 x i8> %x1, %y1
%y = load <32 x i8>, <32 x i8>* %y.ptr, align 4
%mask0 = icmp sgt <32 x i8> %x, %y
@@ -121,11 +136,13 @@ define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32
ret <32 x i8> %max
}
-; CHECK-LABEL: @test256_12
-; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+; CHECK-LABEL: test256_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1
+; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i16> %x1, %y1
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask0 = icmp ule <16 x i16> %x, %y
@@ -134,95 +151,105 @@ define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1,
ret <16 x i16> %max
}
-; CHECK-LABEL: test128_1
-; CHECK: vpcmpeqb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
+; CHECK-LABEL: test128_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
ret <16 x i8> %max
}
-; CHECK-LABEL: test128_2
-; CHECK: vpcmpgtb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+; CHECK-LABEL: test128_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
ret <16 x i8> %max
}
-; CHECK-LABEL: @test128_3
-; CHECK: vpcmplew {{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind {
+; CHECK-LABEL: test128_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k1
+; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
ret <8 x i16> %max
}
-; CHECK-LABEL: test128_4
-; CHECK: vpcmpnleub {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+; CHECK-LABEL: test128_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
ret <16 x i8> %max
}
-; CHECK-LABEL: test128_5
-; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind {
+; CHECK-LABEL: test128_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %yp, align 4
%mask = icmp eq <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_6
-; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp sgt <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_7
-; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp sle <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_8
-; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp ule <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_9
-; CHECK: vpcmpeqw %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+; CHECK-LABEL: test128_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i16> %x1, %y1
%mask0 = icmp eq <8 x i16> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -230,11 +257,13 @@ define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16>
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_10
-; CHECK: vpcmpleb %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+; CHECK-LABEL: test128_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i8> %x1, %y1
%mask0 = icmp sle <16 x i8> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -242,11 +271,13 @@ define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8
ret <16 x i8> %max
}
-; CHECK-LABEL: @test128_11
-; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+; CHECK-LABEL: test128_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <16 x i8> %x1, %y1
%y = load <16 x i8>, <16 x i8>* %y.ptr, align 4
%mask0 = icmp sgt <16 x i8> %x, %y
@@ -255,11 +286,13 @@ define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16
ret <16 x i8> %max
}
-; CHECK-LABEL: @test128_12
-; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+; CHECK-LABEL: test128_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1
+; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i16> %x1, %y1
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask0 = icmp ule <8 x i16> %x, %y
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 14e91e1a8768..b27b795b4409 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readonly
@@ -7,8 +8,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vplzcntd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vplzcntd %xmm0, %xmm0
@@ -28,8 +28,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vplzcntd %ymm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -45,8 +44,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vplzcntq %xmm0, %xmm0
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
@@ -62,8 +60,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vplzcntq %ymm0, %ymm0
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -79,8 +76,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_vpconflict_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictd %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vpconflictd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpconflictd %xmm0, %xmm0
@@ -100,8 +96,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_vpconflict_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictd %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vpconflictd %ymm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -117,8 +112,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_vpconflict_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vpconflictq %xmm0, %xmm0
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
@@ -134,8 +128,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vpconflictq %ymm0, %ymm0
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -147,33 +140,45 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i
}
define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
- ; CHECK: test_x86_vbroadcastmw_256
- ; CHECK: vpbroadcastmw2d %k0, %ymm0
- %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
+; CHECK-LABEL: test_x86_vbroadcastmw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
- ; CHECK: test_x86_vbroadcastmw_128
- ; CHECK: vpbroadcastmw2d %k0, %xmm0
- %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
+; CHECK-LABEL: test_x86_vbroadcastmw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
- ; CHECK: test_x86_broadcastmb_256
- ; CHECK: vpbroadcastmb2q %k0, %ymm0
- %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
+; CHECK-LABEL: test_x86_broadcastmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
- ; CHECK: test_x86_broadcastmb_128
- ; CHECK: vpbroadcastmb2q %k0, %xmm0
- %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
+; CHECK-LABEL: test_x86_broadcastmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index a59fe393f556..35db4901135f 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -1,4 +1,4 @@
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s
declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32)
@@ -194,13 +194,15 @@ define <8 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_512(<8 x i64> %x0, <8 x f
}
declare <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_pd_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducepd {{.*}}{%k1}
-; CHECK: vreducepd
-; CHECK: {sae}
+
define <8 x double>@test_int_x86_avx512_mask_reduce_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vreducepd $8, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vreducepd $4, {sae}, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double> %x0, i32 8, <8 x double> %x2, i8 %x3, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double> %x0, i32 4, <8 x double> %x2, i8 -1, i32 8)
%res2 = fadd <8 x double> %res, %res1
@@ -208,14 +210,15 @@ define <8 x double>@test_int_x86_avx512_mask_reduce_pd_512(<8 x double> %x0, <8
}
declare <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ps_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreduceps
-; CHECK: {sae}
-; CKECK: {%k1}
-; CHECK: vreduceps
+
define <16 x float>@test_int_x86_avx512_mask_reduce_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vreduceps $44, {sae}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vreduceps $11, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float> %x0, i32 44, <16 x float> %x2, i16 %x3, i32 8)
%res1 = call <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 4)
%res2 = fadd <16 x float> %res, %res1
@@ -223,14 +226,15 @@ define <16 x float>@test_int_x86_avx512_mask_reduce_ps_512(<16 x float> %x0, <16
}
declare <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_pd_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangepd
-; CKECK: {%k1}
-; CHECK: vrangepd
-; CHECK: {sae}
+
define <8 x double>@test_int_x86_avx512_mask_range_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vrangepd $8, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vrangepd $4, {sae}, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %x0, <8 x double> %x1, i32 8, <8 x double> %x3, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %x0, <8 x double> %x1, i32 4, <8 x double> %x3, i8 -1, i32 8)
%res2 = fadd <8 x double> %res, %res1
@@ -239,14 +243,14 @@ define <8 x double>@test_int_x86_avx512_mask_range_pd_512(<8 x double> %x0, <8 x
declare <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_ps_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangeps
-; CKECK: {%k1}
-; CHECK: vrangeps
-; CHECK: {sae}
define <16 x float>@test_int_x86_avx512_mask_range_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrangeps $88, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vrangeps $4, {sae}, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %x0, <16 x float> %x1, i32 88, <16 x float> %x3, i16 %x4, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %x0, <16 x float> %x1, i32 4, <16 x float> %x3, i16 -1, i32 8)
%res2 = fadd <16 x float> %res, %res1
@@ -255,14 +259,15 @@ define <16 x float>@test_int_x86_avx512_mask_range_ps_512(<16 x float> %x0, <16
declare <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ss
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducess
-; CKECK: {%k1}
-; CHECK: vreducess
-; CHECK: {sae}
define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vreducess $4, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vreducess $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8)
%res2 = fadd <4 x float> %res, %res1
@@ -270,15 +275,16 @@ define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x floa
}
declare <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_ss
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangess
-; CHECK: {sae}
-; CKECK: {%k1}
-; CHECK: vrangess
-; CHECK: {sae}
+
define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 8)
%res1 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8)
%res2 = fadd <4 x float> %res, %res1
@@ -287,14 +293,15 @@ define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float
declare <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_sd
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducesd
-; CKECK: {%k1}
-; CHECK: vreducesd
-; CHECK: {sae}
define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vreducesd $4, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vreducesd $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8)
%res2 = fadd <2 x double> %res, %res1
@@ -302,14 +309,16 @@ define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x do
}
declare <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_sd
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangesd
-; CKECK: {%k1}
-; CHECK: vrangesd
-; CHECK: {sae}
+
define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8)
%res2 = fadd <2 x double> %res, %res1
@@ -439,14 +448,17 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i6
declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_pd_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclasspd
-; CHECK: {%k1}
-; CHECK: vfpclasspd
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vfpclasspd $2, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %ecx
+; CHECK-NEXT: vfpclasspd $4, %zmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -454,14 +466,17 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
}
declare i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float>, i32, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ps_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclassps
-; CHECK: vfpclassps
-; CHECK: {%k1}
-; CHECK: kmov
define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfpclassps $4, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: vfpclassps $4, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 %x1)
%res1 = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 -1)
%res2 = add i16 %res, %res1
@@ -470,14 +485,28 @@ define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_sd
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclasssd
-; CHECK: %k0 {%k1}
-; CHECK: vfpclasssd
-; CHECK: %k0
define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je LBB28_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: movb $-1, %al
+; CHECK-NEXT: LBB28_2:
+; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: je LBB28_4
+; CHECK-NEXT: ## BB#3:
+; CHECK-NEXT: movb $-1, %cl
+; CHECK-NEXT: LBB28_4:
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -486,16 +515,28 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ss
-; CHECK-NOT: call
-; CHECK: kmovw
-; CHECK: vfpclassss
-; CHECK: %k0
-; CHECK: {%k1}
-; CHECK: kmovw
-; CHECK: vfpclassss
-; CHECK: %k0
define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je LBB29_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: movb $-1, %al
+; CHECK-NEXT: LBB29_2:
+; CHECK-NEXT: vfpclassss $4, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: je LBB29_4
+; CHECK-NEXT: ## BB#3:
+; CHECK-NEXT: movb $-1, %cl
+; CHECK-NEXT: LBB29_4:
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -549,6 +590,7 @@ define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovd2m %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0)
ret i16 %res
@@ -561,6 +603,7 @@ define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0)
ret i8 %res
@@ -594,12 +637,15 @@ declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vaddps %zmm1, %zmm0, %zmm0
-; CHECK: vaddps %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1)
%res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask)
@@ -613,12 +659,15 @@ declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512:
-; CHECK: kmovb %edi, %k1
-; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vaddpd %zmm1, %zmm0, %zmm0
-; CHECK: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1)
%res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask)
@@ -632,12 +681,15 @@ declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask)
@@ -651,12 +703,15 @@ declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512:
-; CHECK: kmovb %edi, %k1
-; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask)
diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
index b4d11bc0b77b..27c0b06d5f23 100644
--- a/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -1,38 +1,69 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
define i8 @mask8(i8 %x) {
+; CHECK-LABEL: mask8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k0
+; CHECK-NEXT: knotb %k0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
ret i8 %ret
-; CHECK: mask8
-; CHECK: knotb
-; CHECK: ret
}
define void @mask8_mem(i8* %ptr) {
+; CHECK-LABEL: mask8_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb (%rdi), %k0
+; CHECK-NEXT: knotb %k0, %k0
+; CHECK-NEXT: kmovb %k0, (%rdi)
+; CHECK-NEXT: retq
%x = load i8, i8* %ptr, align 4
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
store i8 %ret, i8* %ptr, align 4
ret void
-; CHECK-LABEL: mask8_mem
-; CHECK: kmovb ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
-; CHECK-NEXT: knotb
-; CHECK-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
-; CHECK: ret
}
define i8 @mand8(i8 %x, i8 %y) {
+; CHECK-LABEL: mand8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: xorl %esi, %eax
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%ma = bitcast i8 %x to <8 x i1>
%mb = bitcast i8 %y to <8 x i1>
%mc = and <8 x i1> %ma, %mb
%md = xor <8 x i1> %ma, %mb
%me = or <8 x i1> %mc, %md
%ret = bitcast <8 x i1> %me to i8
-; CHECK: kandb
-; CHECK: kxorb
-; CHECK: korb
+ ret i8 %ret
+}
+
+define i8 @mand8_mem(<8 x i1>* %x, <8 x i1>* %y) {
+; CHECK-LABEL: mand8_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb (%rdi), %k0
+; CHECK-NEXT: kmovb (%rsi), %k1
+; CHECK-NEXT: kandb %k1, %k0, %k2
+; CHECK-NEXT: kxorb %k1, %k0, %k0
+; CHECK-NEXT: korb %k0, %k2, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %ma = load <8 x i1>, <8 x i1>* %x
+ %mb = load <8 x i1>, <8 x i1>* %y
+ %mc = and <8 x i1> %ma, %mb
+ %md = xor <8 x i1> %ma, %mb
+ %me = or <8 x i1> %mc, %md
+ %ret = bitcast <8 x i1> %me to i8
ret i8 %ret
}
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index 2065322009da..f201082fb1a7 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -1,53 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq -mattr=+avx512vl --show-mc-encoding| FileCheck %s
define <8 x i64> @test_mask_mullo_epi64_rr_512(<8 x i64> %a, <8 x i64> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rr_512
- ;CHECK: vpmullq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rrk_512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrk_512
- ;CHECK: vpmullq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rrkz_512(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrkz_512
- ;CHECK: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rm_512(<8 x i64> %a, <8 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rm_512
- ;CHECK: vpmullq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rmk_512(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmk_512
- ;CHECK: vpmullq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rmkz_512(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmkz_512
- ;CHECK: vpmullq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmb_512
- ;CHECK: vpmullq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x58,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x58,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -56,8 +77,12 @@ define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) {
}
define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbk_512
- ;CHECK: vpmullq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x40,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -66,8 +91,11 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x
}
define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbkz_512
- ;CHECK: vpmullq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -77,53 +105,73 @@ define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8
declare <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
define <4 x i64> @test_mask_mullo_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rr_256
- ;CHECK: vpmullq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrk_256
- ;CHECK: vpmullq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrkz_256
- ;CHECK: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rm_256
- ;CHECK: vpmullq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmk_256
- ;CHECK: vpmullq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmkz_256
- ;CHECK: vpmullq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmb_256
- ;CHECK: vpmullq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -132,8 +180,12 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
}
define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbk_256
- ;CHECK: vpmullq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x40,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -142,8 +194,11 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x
}
define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbkz_256
- ;CHECK: vpmullq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -154,53 +209,73 @@ define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8
declare <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
define <2 x i64> @test_mask_mullo_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rr_128
- ;CHECK: vpmullq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrk_128
- ;CHECK: vpmullq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrkz_128
- ;CHECK: vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rm_128
- ;CHECK: vpmullq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmk_128
- ;CHECK: vpmullq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmkz_128
- ;CHECK: vpmullq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmb_128
- ;CHECK: vpmullq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -209,8 +284,12 @@ define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
}
define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbk_128
- ;CHECK: vpmullq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x40,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -219,8 +298,11 @@ define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x
}
define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbkz_128
- ;CHECK: vpmullq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -231,53 +313,73 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8
declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
define <4 x float> @test_mask_andnot_ps_rr_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rr_128
- ;CHECK: vandnps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrk_128
- ;CHECK: vandnps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1]
+; CHECK-LABEL: test_mask_andnot_ps_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrkz_128
- ;CHECK: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rm_128
- ;CHECK: vandnps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmk_128
- ;CHECK: vandnps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmkz_128
- ;CHECK: vandnps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmb_128
- ;CHECK: vandnps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -286,8 +388,12 @@ define <4 x float> @test_mask_andnot_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
}
define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbk_128
- ;CHECK: vandnps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x55,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -296,8 +402,11 @@ define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, float* %ptr_b,
}
define <4 x float> @test_mask_andnot_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbkz_128
- ;CHECK: vandnps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -308,53 +417,73 @@ define <4 x float> @test_mask_andnot_ps_rmbkz_128(<4 x float> %a, float* %ptr_b,
declare <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mask_andnot_ps_rr_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rr_256
- ;CHECK: vandnps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrk_256
- ;CHECK: vandnps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1]
+; CHECK-LABEL: test_mask_andnot_ps_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrkz_256
- ;CHECK: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rm_256
- ;CHECK: vandnps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmk_256
- ;CHECK: vandnps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmkz_256
- ;CHECK: vandnps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmb_256
- ;CHECK: vandnps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -363,8 +492,12 @@ define <8 x float> @test_mask_andnot_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
}
define <8 x float> @test_mask_andnot_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbk_256
- ;CHECK: vandnps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x55,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -373,8 +506,11 @@ define <8 x float> @test_mask_andnot_ps_rmbk_256(<8 x float> %a, float* %ptr_b,
}
define <8 x float> @test_mask_andnot_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbkz_256
- ;CHECK: vandnps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -385,53 +521,73 @@ define <8 x float> @test_mask_andnot_ps_rmbkz_256(<8 x float> %a, float* %ptr_b,
declare <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <16 x float> @test_mask_andnot_ps_rr_512(<16 x float> %a, <16 x float> %b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rr_512
- ;CHECK: vandnps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrk_512
- ;CHECK: vandnps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1]
+; CHECK-LABEL: test_mask_andnot_ps_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrkz_512
- ;CHECK: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rm_512
- ;CHECK: vandnps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmk_512
- ;CHECK: vandnps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmkz_512
- ;CHECK: vandnps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmb_512
- ;CHECK: vandnps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -440,8 +596,12 @@ define <16 x float> @test_mask_andnot_ps_rmb_512(<16 x float> %a, float* %ptr_b)
}
define <16 x float> @test_mask_andnot_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbk_512
- ;CHECK: vandnps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x55,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -450,8 +610,11 @@ define <16 x float> @test_mask_andnot_ps_rmbk_512(<16 x float> %a, float* %ptr_b
}
define <16 x float> @test_mask_andnot_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbkz_512
- ;CHECK: vandnps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -462,53 +625,73 @@ define <16 x float> @test_mask_andnot_ps_rmbkz_512(<16 x float> %a, float* %ptr_
declare <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
define <4 x float> @test_mask_and_ps_rr_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK-LABEL: test_mask_and_ps_rr_128
- ;CHECK: vandps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrk_128
- ;CHECK: vandps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1]
+; CHECK-LABEL: test_mask_and_ps_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrkz_128
- ;CHECK: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rm_128
- ;CHECK: vandps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmk_128
- ;CHECK: vandps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmkz_128
- ;CHECK: vandps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rmb_128
- ;CHECK: vandps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -517,8 +700,12 @@ define <4 x float> @test_mask_and_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
}
define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbk_128
- ;CHECK: vandps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x54,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -527,8 +714,11 @@ define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4
}
define <4 x float> @test_mask_and_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbkz_128
- ;CHECK: vandps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -539,53 +729,73 @@ define <4 x float> @test_mask_and_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8
declare <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mask_and_ps_rr_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK-LABEL: test_mask_and_ps_rr_256
- ;CHECK: vandps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrk_256
- ;CHECK: vandps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1]
+; CHECK-LABEL: test_mask_and_ps_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrkz_256
- ;CHECK: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rm_256
- ;CHECK: vandps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmk_256
- ;CHECK: vandps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmkz_256
- ;CHECK: vandps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rmb_256
- ;CHECK: vandps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -594,8 +804,12 @@ define <8 x float> @test_mask_and_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
}
define <8 x float> @test_mask_and_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbk_256
- ;CHECK: vandps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x54,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -604,8 +818,11 @@ define <8 x float> @test_mask_and_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8
}
define <8 x float> @test_mask_and_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbkz_256
- ;CHECK: vandps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -616,53 +833,73 @@ define <8 x float> @test_mask_and_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8
declare <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <16 x float> @test_mask_and_ps_rr_512(<16 x float> %a, <16 x float> %b) {
- ;CHECK-LABEL: test_mask_and_ps_rr_512
- ;CHECK: vandps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrk_512
- ;CHECK: vandps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1]
+; CHECK-LABEL: test_mask_and_ps_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrkz_512
- ;CHECK: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rm_512
- ;CHECK: vandps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmk_512
- ;CHECK: vandps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmkz_512
- ;CHECK: vandps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rmb_512
- ;CHECK: vandps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -671,8 +908,12 @@ define <16 x float> @test_mask_and_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
}
define <16 x float> @test_mask_and_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbk_512
- ;CHECK: vandps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x54,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -681,8 +922,11 @@ define <16 x float> @test_mask_and_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <
}
define <16 x float> @test_mask_and_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbkz_512
- ;CHECK: vandps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -693,53 +937,73 @@ define <16 x float> @test_mask_and_ps_rmbkz_512(<16 x float> %a, float* %ptr_b,
declare <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
define <4 x float> @test_mask_or_ps_rr_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK-LABEL: test_mask_or_ps_rr_128
- ;CHECK: vorps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrk_128
- ;CHECK: vorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1]
+; CHECK-LABEL: test_mask_or_ps_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrkz_128
- ;CHECK: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rm_128
- ;CHECK: vorps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmk_128
- ;CHECK: vorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmkz_128
- ;CHECK: vorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rmb_128
- ;CHECK: vorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -748,8 +1012,12 @@ define <4 x float> @test_mask_or_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
}
define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbk_128
- ;CHECK: vorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x56,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -758,8 +1026,11 @@ define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x
}
define <4 x float> @test_mask_or_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbkz_128
- ;CHECK: vorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -770,53 +1041,73 @@ define <4 x float> @test_mask_or_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8
declare <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mask_or_ps_rr_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK-LABEL: test_mask_or_ps_rr_256
- ;CHECK: vorps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrk_256
- ;CHECK: vorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1]
+; CHECK-LABEL: test_mask_or_ps_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrkz_256
- ;CHECK: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rm_256
- ;CHECK: vorps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmk_256
- ;CHECK: vorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmkz_256
- ;CHECK: vorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rmb_256
- ;CHECK: vorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -825,8 +1116,12 @@ define <8 x float> @test_mask_or_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
}
define <8 x float> @test_mask_or_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbk_256
- ;CHECK: vorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x56,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -835,8 +1130,11 @@ define <8 x float> @test_mask_or_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x
}
define <8 x float> @test_mask_or_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbkz_256
- ;CHECK: vorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -847,53 +1145,73 @@ define <8 x float> @test_mask_or_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8
declare <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <16 x float> @test_mask_or_ps_rr_512(<16 x float> %a, <16 x float> %b) {
- ;CHECK-LABEL: test_mask_or_ps_rr_512
- ;CHECK: vorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrk_512
- ;CHECK: vorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1]
+; CHECK-LABEL: test_mask_or_ps_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrkz_512
- ;CHECK: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rm_512
- ;CHECK: vorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmk_512
- ;CHECK: vorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmkz_512
- ;CHECK: vorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rmb_512
- ;CHECK: vorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -902,8 +1220,12 @@ define <16 x float> @test_mask_or_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
}
define <16 x float> @test_mask_or_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbk_512
- ;CHECK: vorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x56,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -912,8 +1234,11 @@ define <16 x float> @test_mask_or_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <1
}
define <16 x float> @test_mask_or_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbkz_512
- ;CHECK: vorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -924,53 +1249,73 @@ define <16 x float> @test_mask_or_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i
declare <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
define <4 x float> @test_mask_xor_ps_rr_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK-LABEL: test_mask_xor_ps_rr_128
- ;CHECK: vxorps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrk_128
- ;CHECK: vxorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1]
+; CHECK-LABEL: test_mask_xor_ps_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrkz_128
- ;CHECK: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rm_128
- ;CHECK: vxorps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmk_128
- ;CHECK: vxorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmkz_128
- ;CHECK: vxorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rmb_128
- ;CHECK: vxorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -979,8 +1324,12 @@ define <4 x float> @test_mask_xor_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
}
define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbk_128
- ;CHECK: vxorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x57,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -989,8 +1338,11 @@ define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4
}
define <4 x float> @test_mask_xor_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbkz_128
- ;CHECK: vxorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -1001,53 +1353,73 @@ define <4 x float> @test_mask_xor_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8
declare <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mask_xor_ps_rr_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK-LABEL: test_mask_xor_ps_rr_256
- ;CHECK: vxorps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrk_256
- ;CHECK: vxorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1]
+; CHECK-LABEL: test_mask_xor_ps_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrkz_256
- ;CHECK: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rm_256
- ;CHECK: vxorps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmk_256
- ;CHECK: vxorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmkz_256
- ;CHECK: vxorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rmb_256
- ;CHECK: vxorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -1056,8 +1428,12 @@ define <8 x float> @test_mask_xor_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
}
define <8 x float> @test_mask_xor_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbk_256
- ;CHECK: vxorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x57,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -1066,8 +1442,11 @@ define <8 x float> @test_mask_xor_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8
}
define <8 x float> @test_mask_xor_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbkz_256
- ;CHECK: vxorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -1078,53 +1457,73 @@ define <8 x float> @test_mask_xor_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8
declare <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <16 x float> @test_mask_xor_ps_rr_512(<16 x float> %a, <16 x float> %b) {
- ;CHECK-LABEL: test_mask_xor_ps_rr_512
- ;CHECK: vxorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrk_512
- ;CHECK: vxorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1]
+; CHECK-LABEL: test_mask_xor_ps_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrkz_512
- ;CHECK: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rm_512
- ;CHECK: vxorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmk_512
- ;CHECK: vxorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmkz_512
- ;CHECK: vxorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rmb_512
- ;CHECK: vxorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -1133,8 +1532,12 @@ define <16 x float> @test_mask_xor_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
}
define <16 x float> @test_mask_xor_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbk_512
- ;CHECK: vxorps (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test_mask_xor_ps_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x57,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -1143,8 +1546,11 @@ define <16 x float> @test_mask_xor_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <
}
define <16 x float> @test_mask_xor_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbkz_512
- ;CHECK: vxorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_xor_ps_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -1159,11 +1565,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8]
+; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1175,11 +1581,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8]
+; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1191,11 +1597,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8]
+; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1207,11 +1613,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8]
+; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1223,11 +1629,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8]
+; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1239,11 +1645,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8]
+; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1255,11 +1661,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8]
+; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1271,11 +1677,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8]
+; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1287,11 +1693,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64>, <2 x double>,
define <2 x double>@test_int_x86_avx512_mask_cvt_qq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8]
+; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1303,11 +1709,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64>, <4 x double>,
define <4 x double>@test_int_x86_avx512_mask_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8]
+; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1319,11 +1725,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8]
+; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1335,11 +1741,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8]
+; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1351,11 +1757,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1367,11 +1773,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1383,11 +1789,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double>, <2 x i64>,
define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8]
+; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1399,11 +1805,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double>, <4 x i64>,
define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8]
+; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1415,11 +1821,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1431,11 +1837,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1447,11 +1853,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1463,11 +1869,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64>, <4 x double>
define <4 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1479,11 +1885,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1495,11 +1901,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1511,11 +1917,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8]
+; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1527,11 +1933,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8]
+; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1540,12 +1946,14 @@ define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x
declare <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double>, i32, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducepd {{.*}}{%k1}
-; CHECK: vreducepd
define <2 x double>@test_int_x86_avx512_mask_reduce_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vreducepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x56,0xc8,0x04]
+; CHECK-NEXT: vreducepd $8, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x56,0xc0,0x08]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double> %x0, i32 8, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1554,12 +1962,14 @@ define <2 x double>@test_int_x86_avx512_mask_reduce_pd_128(<2 x double> %x0, <2
declare <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double>, i32, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducepd {{.*}}{%k1}
-; CHECK: vreducepd
define <4 x double>@test_int_x86_avx512_mask_reduce_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vreducepd $4, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x56,0xc8,0x04]
+; CHECK-NEXT: vreducepd $0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x56,0xc0,0x00]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double> %x0, i32 0, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1567,12 +1977,15 @@ define <4 x double>@test_int_x86_avx512_mask_reduce_pd_256(<4 x double> %x0, <4
}
declare <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float>, i32, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreduceps {{.*}}{%k1}
-; CHECK: vreduceps
+
define <4 x float>@test_int_x86_avx512_mask_reduce_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vreduceps $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x56,0xc8,0x04]
+; CHECK-NEXT: vreduceps $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x56,0xc0,0x58]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1581,12 +1994,14 @@ define <4 x float>@test_int_x86_avx512_mask_reduce_ps_128(<4 x float> %x0, <4 x
declare <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float>, i32, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreduceps {{.*}}{%k1}
-; CHECK: vreduceps
define <8 x float>@test_int_x86_avx512_mask_reduce_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vreduceps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x56,0xc8,0x0b]
+; CHECK-NEXT: vreduceps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x56,0xc0,0x0b]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1595,12 +2010,14 @@ define <8 x float>@test_int_x86_avx512_mask_reduce_ps_256(<8 x float> %x0, <8 x
declare <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangepd {{.*}}{%k1}
-; CHECK: vrangepd
define <2 x double>@test_int_x86_avx512_mask_range_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vrangepd $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x50,0xd1,0x04]
+; CHECK-NEXT: vrangepd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x50,0xc1,0x08]
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %x0, <2 x double> %x1, i32 4, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %x0, <2 x double> %x1, i32 8, <2 x double> %x3, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1609,12 +2026,14 @@ define <2 x double>@test_int_x86_avx512_mask_range_pd_128(<2 x double> %x0, <2 x
declare <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double>, <4 x double>, i32, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangepd {{.*}}{%k1}
-; CHECK: vrangepd
define <4 x double>@test_int_x86_avx512_mask_range_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vrangepd $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x50,0xd1,0x04]
+; CHECK-NEXT: vrangepd $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x50,0xc1,0x58]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %x0, <4 x double> %x1, i32 4, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %x0, <4 x double> %x1, i32 88, <4 x double> %x3, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1623,12 +2042,14 @@ define <4 x double>@test_int_x86_avx512_mask_range_pd_256(<4 x double> %x0, <4 x
declare <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float>, <4 x float>, i32, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangeps {{.*}}{%k1}
-; CHECK: vrangeps
define <4 x float>@test_int_x86_avx512_mask_range_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vrangeps $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x50,0xd1,0x04]
+; CHECK-NEXT: vrangeps $88, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x50,0xc1,0x58]
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %x0, <4 x float> %x1, i32 4, <4 x float> %x3, i8 %x4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %x0, <4 x float> %x1, i32 88, <4 x float> %x3, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1637,12 +2058,14 @@ define <4 x float>@test_int_x86_avx512_mask_range_ps_128(<4 x float> %x0, <4 x f
declare <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float>, <8 x float>, i32, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangeps {{.*}}{%k1}
-; CHECK: vrangeps
define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vrangeps $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x50,0xd1,0x04]
+; CHECK-NEXT: vrangeps $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x50,0xc1,0x58]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %x0, <8 x float> %x1, i32 4, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %x0, <8 x float> %x1, i32 88, <8 x float> %x3, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1654,13 +2077,13 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32,
define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc2,0x01]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc0,0x01]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
@@ -1674,13 +2097,13 @@ declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x do
define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xd9,0x01]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x18,0xc1,0x01]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
%res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4)
@@ -1694,13 +2117,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i3
define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xd9,0x01]
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x38,0xc1,0x01]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4)
@@ -1711,14 +2134,17 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclassps
-; CHECK: {%k1}
-; CHECK: vfpclassps
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vfpclassps $2, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x66,0xc0,0x02]
+; CHECK-NEXT: kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT: vfpclassps $4, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x66,0xc0,0x04]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -1727,14 +2153,17 @@ define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclassps
-; CHECK: {%k1}
-; CHECK: vfpclassps
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_ps_256(<8 x float> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vfpclassps $2, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x66,0xc0,0x02]
+; CHECK-NEXT: kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT: vfpclassps $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x66,0xc0,0x04]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -1743,14 +2172,17 @@ define i8 @test_int_x86_avx512_mask_fpclass_ps_256(<8 x float> %x0, i8 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclasspd
-; CHECK: {%k1}
-; CHECK: vfpclasspd
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_pd_128(<2 x double> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vfpclasspd $4, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x66,0xc0,0x04]
+; CHECK-NEXT: kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT: vfpclasspd $2, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x66,0xc0,0x02]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 4, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 2, i8 -1)
%res2 = add i8 %res, %res1
@@ -1759,14 +2191,17 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_128(<2 x double> %x0, i8 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclasspd
-; CHECK: {%k1}
-; CHECK: vfpclasspd
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vfpclasspd $2, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x66,0xc0,0x02]
+; CHECK-NEXT: kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT: vfpclasspd $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x66,0xc0,0x04]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -1778,13 +2213,13 @@ declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float>, <8 x f
define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm0
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x19,0xc8]
+; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
+; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x19,0xc0]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %x3)
%res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 -1)
@@ -1795,17 +2230,20 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32>, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3, i64 * %y_ptr) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3)
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vbroadcasti32x2 (%rsi), %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x59,0x0e]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x59,0xd0]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x59,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %y_64 = load i64, i64 * %y_ptr
+ %y_v2i64 = insertelement <2 x i64> undef, i64 %y_64, i32 0
+ %y = bitcast <2 x i64> %y_v2i64 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %y, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
@@ -1818,13 +2256,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x59,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 -1)
@@ -1838,9 +2276,10 @@ declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>)
define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovd2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
ret i8 %res
}
@@ -1850,9 +2289,10 @@ declare i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32>)
define i8@test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovd2m %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovd2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x39,0xc0]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0)
ret i8 %res
}
@@ -1862,9 +2302,10 @@ declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovq2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0)
ret i8 %res
}
@@ -1874,9 +2315,10 @@ declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovq2m %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovq2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0)
ret i8 %res
}
@@ -1886,9 +2328,9 @@ declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8)
define <4 x i32>@test_int_x86_avx512_cvtmask2d_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k0
-; CHECK-NEXT: vpmovm2d %k0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
+; CHECK-NEXT: vpmovm2d %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8 %x0)
ret <4 x i32> %res
}
@@ -1898,9 +2340,9 @@ declare <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8)
define <8 x i32>@test_int_x86_avx512_cvtmask2d_256(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k0
-; CHECK-NEXT: vpmovm2d %k0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
+; CHECK-NEXT: vpmovm2d %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x38,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8 %x0)
ret <8 x i32> %res
}
@@ -1910,9 +2352,9 @@ declare <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8)
define <2 x i64>@test_int_x86_avx512_cvtmask2q_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k0
-; CHECK-NEXT: vpmovm2q %k0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
+; CHECK-NEXT: vpmovm2q %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x38,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8 %x0)
ret <2 x i64> %res
}
@@ -1922,9 +2364,9 @@ declare <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8)
define <4 x i64>@test_int_x86_avx512_cvtmask2q_256(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k0
-; CHECK-NEXT: vpmovm2q %k0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
+; CHECK-NEXT: vpmovm2q %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x38,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8 %x0)
ret <4 x i64> %res
}
@@ -1932,12 +2374,18 @@ declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256:
-; CHECK: kmovb %edi, %k1
-; CHECK: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
-; CHECK: vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1}
-; CHECK: vshuff64x2 $0, %ymm0, %ymm0, %ymm0
-; CHECK: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK: vaddpd %ymm0, %ymm2, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd0,0x00]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1]
+; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xc8,0x00]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1]
+; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1)
%res2 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask)
@@ -1951,12 +2399,18 @@ declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>,
define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256:
-; CHECK: kmovb %edi, %k1
-; CHECK: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
-; CHECK: vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1}
-; CHECK: vshufi64x2 $0, %ymm0, %ymm0, %ymm0
-; CHECK: vpaddq %ymm1, %ymm0, %ymm0
-; CHECK: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x43,0xd0,0x00]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1]
+; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xc8,0x00]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1]
+; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1]
+; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask)
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
new file mode 100644
index 000000000000..685817cbe265
--- /dev/null
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
+; CHECK: kmovw %edi, %k1
+; CHECK: vmovaps %zmm0, %zmm3
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK: vmovaps %zmm0, %zmm4
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm4
+; CHECK: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK: vpaddq %zmm2, %zmm4, %zmm1
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
+; CHECK: kmovw %edi, %k1
+; CHECK: vmovaps %zmm0, %zmm3
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK: vmovaps %zmm0, %zmm4
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm4
+; CHECK: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK: vpaddq %zmm2, %zmm4, %zmm1
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
+; CHECK: kmovw %edi, %k1
+; CHECK: vmovaps %zmm0, %zmm3
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK: vmovaps %zmm0, %zmm4
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm4
+; CHECK: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK: vpaddq %zmm2, %zmm4, %zmm1
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
+; CHECK: kmovw %edi, %k1
+; CHECK: vmovaps %zmm0, %zmm3
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK: vmovaps %zmm0, %zmm4
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm4
+; CHECK: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK: vpaddq %zmm2, %zmm4, %zmm1
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
new file mode 100644
index 000000000000..1d5febfec4f3
--- /dev/null
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
diff --git a/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
new file mode 100644
index 000000000000..ce999855d1f1
--- /dev/null
+++ b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512vbmi | FileCheck %s
+declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddb %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512:
+; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm0
+; CHECK: vpaddb %zmm3, %zmm2, %zmm1
+; CHECK: vpaddb %zmm0, %zmm1, %zmm0
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0
+; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0
+; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
new file mode 100644
index 000000000000..b68e71110210
--- /dev/null
+++ b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512vl -mattr=+avx512vbmi --show-mc-encoding| FileCheck %s
+declare <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0]
+; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x8d,0xd8]
+; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0x8d,0xc0]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xcb]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0]
+; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xd8]
+; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0x8d,0xc0]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xcb]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1]
+; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x83,0xd9]
+; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x83,0xc1]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xcb]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1]
+; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xd9]
+; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x83,0xc1]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xcb]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xda]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xca]
+; CHECK-NEXT: vpxord %xmm4, %xmm4, %xmm4 ## encoding: [0x62,0xf1,0x5d,0x08,0xef,0xe4]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2]
+; CHECK-NEXT: vpaddb %xmm4, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfc,0xc4]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xda]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xca]
+; CHECK-NEXT: vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2]
+; CHECK-NEXT: vpaddb %ymm4, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfc,0xc4]
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xda]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xca]
+; CHECK-NEXT: vpxord %xmm4, %xmm4, %xmm4 ## encoding: [0x62,0xf1,0x5d,0x08,0xef,0xe4]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2]
+; CHECK-NEXT: vpaddb %xmm4, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfc,0xc4]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xda]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xca]
+; CHECK-NEXT: vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2]
+; CHECK-NEXT: vpaddb %ymm4, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfc,0xc4]
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xca]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ ret <16 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xca]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..310ed8f50c4e
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -0,0 +1,1391 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
+
+define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res1 = bitcast <4 x i32> %res0 to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_mask_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp0:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg1, <4 x i32> %res0, <4 x i32> %arg0
+ %res2 = bitcast <4 x i32> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp1:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg0, <4 x i32> %res0, <4 x i32> zeroinitializer
+ %res2 = bitcast <4 x i32> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastd %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
+ %res1 = bitcast <8 x i32> %res0 to <4 x i64>
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
+ %res2 = bitcast <8 x i32> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
+ %res2 = bitcast <8 x i32> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_mask_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp2:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i2
+ %arg1 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg1, <2 x i64> %res0, <2 x i64> %a0
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp3:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i2
+ %arg0 = bitcast i2 %trn0 to <2 x i1>
+ %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg0, <2 x i64> %res0, <2 x i64> zeroinitializer
+ ret <2 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp4:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp5:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
+ ret <4 x i64> %res1
+}
+
+define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm_mask_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp6:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i2
+ %arg1 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp7:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i2
+ %arg0 = bitcast i2 %trn0 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
+ ret <2 x double> %res1
+}
+
+define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm256_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp8:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
+ ret <4 x double> %res1
+}
+
+define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp9:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
+ ret <4 x double> %res1
+}
+
+define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_mask_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp10:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
+ ret <4 x float> %res1
+}
+
+define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp11:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
+ ret <4 x float> %res1
+}
+
+define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm256_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
+ ret <8 x float> %res1
+}
+
+define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
+ ret <8 x float> %res1
+}
+
+define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm_mask_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp12:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i2
+ %arg1 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_maskz_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp13:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i2
+ %arg0 = bitcast i2 %trn0 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
+ ret <2 x double> %res1
+}
+
+define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
+; X32-LABEL: test_mm256_mask_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp14:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
+ ret <4 x double> %res1
+}
+
+define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
+; X32-LABEL: test_mm256_maskz_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp15:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
+ ret <4 x double> %res1
+}
+
+define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_mask_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp16:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
+ ret <4 x float> %res1
+}
+
+define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_maskz_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp17:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
+ ret <4 x float> %res1
+}
+
+define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
+; X32-LABEL: test_mm256_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
+; X32-LABEL: test_mm256_mask_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
+ ret <8 x float> %res1
+}
+
+define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
+; X32-LABEL: test_mm256_maskz_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
+ ret <8 x float> %res1
+}
+
+define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_mask_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp18:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
+ ret <4 x float> %res1
+}
+
+define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_maskz_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp19:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
+ ret <4 x float> %res1
+}
+
+define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
+; X32-LABEL: test_mm256_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
+; X32-LABEL: test_mm256_mask_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
+ ret <8 x float> %res1
+}
+
+define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
+; X32-LABEL: test_mm256_maskz_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
+ ret <8 x float> %res1
+}
+
+define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp20:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp21:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
+ ret <4 x i64> %res1
+}
+
+define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
+; X32-LABEL: test_mm256_mask_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp22:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
+ ret <4 x double> %res1
+}
+
+define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
+; X32-LABEL: test_mm256_maskz_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp23:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
+ ret <4 x double> %res1
+}
+
+define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
+; X32-LABEL: test_mm_mask_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp24:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i2
+ %arg1 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> %a3, <2 x i32> <i32 1, i32 3>
+ %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm_maskz_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp25:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i2
+ %arg0 = bitcast i2 %trn0 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 1, i32 3>
+ %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
+ ret <2 x double> %res1
+}
+
+define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
+; X32-LABEL: test_mm256_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
+; X32-LABEL: test_mm256_mask_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp26:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a2, <4 x double> %a3, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
+ ret <4 x double> %res1
+}
+
+define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
+; X32-LABEL: test_mm256_maskz_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp27:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a1, <4 x double> %a2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
+ ret <4 x double> %res1
+}
+
+define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
+; X32-LABEL: test_mm_mask_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp28:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> %a3, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+ %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
+ ret <4 x float> %res1
+}
+
+define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_maskz_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp29:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+ %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
+ ret <4 x float> %res1
+}
+
+define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
+; X32-LABEL: test_mm256_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
+; X32-LABEL: test_mm256_mask_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+ %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
+ ret <8 x float> %res1
+}
+
+define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
+; X32-LABEL: test_mm256_maskz_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+ %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
+ ret <8 x float> %res1
+}
+
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..f9126b4614eb
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -0,0 +1,2536 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding | FileCheck %s
+
+declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8]
+; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0]
+; CHECK-NEXT: vpaddd (%rsi){1to8}, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x38,0xfe,0x0e]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %y_32 = load i32, i32 * %y_ptr
+ %y = insertelement <4 x i32> undef, i32 %y_32, i32 0
+ %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %y, <8 x i32> %x1, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x58,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc9]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x59,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
+; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc9]
+; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1, i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x59,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
+; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc9]
+; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xd4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer, i8 %mask)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x19,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
+; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc9]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly
+
+define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x18,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
+; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc9]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x18,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
+; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc9]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsldup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x12,0xd0]
+; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsldup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x12,0xd0]
+; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovshdup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x16,0xd0]
+; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovshdup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x16,0xd0]
+; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovddup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0xff,0x08,0x12,0xd0]
+; CHECK-NEXT: ## xmm2 = xmm0[0,0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xca]
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res2, %res3
+ ret <2 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovddup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0xff,0x28,0x12,0xd0]
+; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xca]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x05,0xd0,0x06]
+; CHECK-NEXT: ## ymm2 = ymm0[0,1,3,2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,3,2]
+; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,3,2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 ## encoding: [0x62,0xf3,0xfd,0x08,0x05,0xd0,0x01]
+; CHECK-NEXT: ## xmm2 = xmm0[1,0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,0]
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0x7d,0x28,0x04,0xd0,0x16]
+; CHECK-NEXT: ## ymm2 = ymm0[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 ## encoding: [0x62,0xf3,0x7d,0x08,0x04,0xd0,0x16]
+; CHECK-NEXT: ## xmm2 = xmm0[2,1,1,0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[2,1,1,0]
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[2,1,1,0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x01,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0]
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res3, %res2
+ ret <4 x double> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64>, i32, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x00,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0]
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8)
+
+define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovapd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x29,0x07]
+; CHECK-NEXT: vmovapd %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x08,0x29,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.pd.256(i8*, <4 x double>, i8)
+
+define void@test_int_x86_avx512_mask_store_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovapd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x29,0x07]
+; CHECK-NEXT: vmovapd %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x28,0x29,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.128(i8*, <2 x double>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovupd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x11,0x07]
+; CHECK-NEXT: vmovupd %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x08,0x11,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.256(i8*, <4 x double>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovupd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x11,0x07]
+; CHECK-NEXT: vmovupd %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x28,0x11,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.128(i8*, <4 x float>, i8)
+
+define void@test_int_x86_avx512_mask_store_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovaps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x29,0x07]
+; CHECK-NEXT: vmovaps %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x08,0x29,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.256(i8*, <8 x float>, i8)
+
+define void@test_int_x86_avx512_mask_store_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovaps %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x29,0x07]
+; CHECK-NEXT: vmovaps %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x28,0x29,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.128(i8*, <4 x float>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovups %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x11,0x07]
+; CHECK-NEXT: vmovups %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x08,0x11,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.256(i8*, <8 x float>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovups %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x11,0x07]
+; CHECK-NEXT: vmovups %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x28,0x11,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.q.128(i8*, <2 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu64 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqu64 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xfe,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.q.256(i8*, <4 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu64 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqu64 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xfe,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.128(i8*, <4 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqu32 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7e,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.256(i8*, <8 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqu32 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7e,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.128(i8*, <2 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqa64 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqa64 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.256(i8*, <4 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqa64 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqa64 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.128(i8*, <4 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqa32 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7d,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.256(i8*, <8 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqa32 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7d,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1)
+ ret void
+}
+
+define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07]
+; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x0f]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x float> %res2, %res1
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8*, <8 x float>, i8)
+
+define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07]
+; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x0f]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x float> %res2, %res1
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8*, <8 x float>, i8)
+
+define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07]
+; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x0f]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x double> %res2, %res1
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8*, <4 x double>, i8)
+
+define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07]
+; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x0f]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x double> %res2, %res1
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8*, <4 x double>, i8)
+
+define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07]
+; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x0f]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x float> %res2, %res1
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8*, <4 x float>, i8)
+
+define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07]
+; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x0f]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x float> %res2, %res1
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8*, <4 x float>, i8)
+
+define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07]
+; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x0f]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <2 x double> %res2, %res1
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8*, <2 x double>, i8)
+
+define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07]
+; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x0f]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <2 x double> %res2, %res1
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8*, <2 x double>, i8)
+
+declare <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8*, <4 x i32>, i8)
+
+define <4 x i32> @test_mask_load_unaligned_d_128(i8* %ptr, i8* %ptr2, <4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu32 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x06]
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr2, <4 x i32> %res, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i32> %res2, %res1
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8*, <8 x i32>, i8)
+
+define <8 x i32> @test_mask_load_unaligned_d_256(i8* %ptr, i8* %ptr2, <8 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu32 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x06]
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr2, <8 x i32> %res, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i32> %res2, %res1
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8*, <2 x i64>, i8)
+
+define <2 x i64> @test_mask_load_unaligned_q_128(i8* %ptr, i8* %ptr2, <2 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu64 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x06]
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr2, <2 x i64> %res, i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <2 x i64> %res2, %res1
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8*, <4 x i64>, i8)
+
+define <4 x i64> @test_mask_load_unaligned_q_256(i8* %ptr, i8* %ptr2, <4 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu64 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x06]
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr2, <4 x i64> %res, i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i64> %res2, %res1
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8*, <4 x i32>, i8)
+
+define <4 x i32> @test_mask_load_aligned_d_128(<4 x i32> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07]
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> %res, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i32> %res2, %res1
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8*, <8 x i32>, i8)
+
+define <8 x i32> @test_mask_load_aligned_d_256(<8 x i32> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07]
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> %res, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i32> %res2, %res1
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8*, <2 x i64>, i8)
+
+define <2 x i64> @test_mask_load_aligned_q_128(<2 x i64> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07]
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> %res, i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <2 x i64> %res2, %res1
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8*, <4 x i64>, i8)
+
+define <4 x i64> @test_mask_load_aligned_q_256(<4 x i64> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07]
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> %res, i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i64> %res2, %res1
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32>, i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7d,0x08,0x70,0xd0,0x03]
+; CHECK-NEXT: ## xmm2 = xmm0[3,0,0,0]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x70,0xc8,0x03]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0]
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32>, i32, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7d,0x28,0x70,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x70,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x37,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
+; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1]
+; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)
+
+define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x66,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1]
+; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x37,0xc1]
+; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
+
+declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x15,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x15,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x15,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x15,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x15,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x15,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x15,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x15,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x14,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x14,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x14,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x14,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x14,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x14,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x14,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x14,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x6a,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6a,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x62,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x62,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x6a,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6a,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x62,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x62,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x6d,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6d,0xd1]
+; CHECK-NEXT: ## xmm2 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res2 = add <2 x i64> %res, %res1
+ ret <2 x i64> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x6c,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6c,0xd1]
+; CHECK-NEXT: ## xmm2 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res2 = add <2 x i64> %res, %res1
+ ret <2 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x6c,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6c,0xd1]
+; CHECK-NEXT: ## ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x6d,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6d,0xd1]
+; CHECK-NEXT: ## ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+
+define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_and_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_and_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_and_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_and_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_and_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_and_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_or_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_or_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_or_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_or_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_or_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_or_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_xor_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_xor_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_xor_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_xor_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_xor_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_xor_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <2 x i64>, <2 x i64>* %ptr_b
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <2 x i64>, <2 x i64>* %ptr_b
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <2 x i64>, <2 x i64>* %ptr_b
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
+ ret <2 x i64> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i64>, <4 x i64>* %ptr_b
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i64>, <4 x i64>* %ptr_b
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i64>, <4 x i64>* %ptr_b
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
+ ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index a4f3e666833a..41376cf602c4 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -1,124 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
; 256-bit
-define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: test_pcmpeq_d_256
-; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_d_256
-; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
-
-define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: test_pcmpeq_q_256
-; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_q_256
-; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
-
-define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: test_pcmpgt_d_256
-; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_d_256
-; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)
-
-define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: test_pcmpgt_q_256
-; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_q_256
-; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
-
define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_cmp_d_256
-; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_cmp_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x01]
+; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordd %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnled %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_d_256
-; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltd %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd1,0x01]
+; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordd %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnled %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -127,58 +101,95 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_ucmp_d_256
-; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_ucmp_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordud %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf1,0x06]
+; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_d_256
-; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordud %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf9,0x06]
+; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -187,58 +198,95 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_cmp_q_256
-; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_cmp_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x01]
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_q_256
-; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd1,0x01]
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -247,58 +295,95 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_ucmp_q_256
-; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_ucmp_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunorduq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf1,0x06]
+; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_q_256
-; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunorduq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf9,0x06]
+; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -308,123 +393,96 @@ declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounw
; 128-bit
-define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test_pcmpeq_d_128
-; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_d_128
-; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
-
-define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_pcmpeq_q_128
-; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_q_128
-; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)
-
-define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test_pcmpgt_d_128
-; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_d_128
-; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)
-
-define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_pcmpgt_q_128
-; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_q_128
-; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
-
define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_cmp_d_128
-; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_cmp_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x01]
+; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordd %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnled %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_d_128
-; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltd %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd1,0x01]
+; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordd %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnled %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -433,58 +491,95 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_ucmp_d_128
-; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_ucmp_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordud %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf1,0x06]
+; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_d_128
-; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordud %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf9,0x06]
+; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -493,58 +588,95 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_cmp_q_128
-; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_cmp_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x01]
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_q_128
-; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd1,0x01]
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -553,58 +685,95 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_ucmp_q_128
-; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_ucmp_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunorduq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf1,0x06]
+; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_q_128
-; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunorduq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf9,0x06]
+; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -612,87 +781,114 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
-; CHECK-LABEL: compr1
-; CHECK: vcompresspd %zmm0
define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: compr1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8a,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
-; CHECK-LABEL: compr2
-; CHECK: vcompresspd %ymm0
define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
+; CHECK-LABEL: compr2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vcompresspd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
-; CHECK-LABEL: compr3
-; CHECK: vcompressps %xmm0
define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
+; CHECK-LABEL: compr3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vcompressps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x8a,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
-; CHECK-LABEL: compr4
-; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: compr4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
-; CHECK-LABEL: compr5
-; CHECK: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
+; CHECK-LABEL: compr5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
-; CHECK-LABEL: compr6
-; CHECK: vcompressps %xmm0
define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
+; CHECK-LABEL: compr6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcompressps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8a,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
-; CHECK-LABEL: compr7
-; CHECK-NOT: vcompress
-; CHECK: vmovupd
define void @compr7(i8* %addr, <8 x double> %data) {
+; CHECK-LABEL: compr7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
ret void
}
-; CHECK-LABEL: compr8
-; CHECK-NOT: vcompressps %xmm0
define <4 x float> @compr8(<4 x float> %data) {
+; CHECK-LABEL: compr8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
-; CHECK-LABEL: compr9
-; CHECK: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: compr9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
-; CHECK-LABEL: compr10
-; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: compr10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
@@ -701,217 +897,188 @@ declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32
; Expand
-; CHECK-LABEL: expand1
-; CHECK: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: expand1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
-; CHECK-LABEL: expand2
-; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
+; CHECK-LABEL: expand2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
-; CHECK-LABEL: expand3
-; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
+; CHECK-LABEL: expand3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
-; CHECK-LABEL: expand4
-; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: expand4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
-; CHECK-LABEL: expand5
-; CHECK: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
+; CHECK-LABEL: expand5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
-; CHECK-LABEL: expand6
-; CHECK: vexpandps %xmm0
define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
+; CHECK-LABEL: expand6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vexpandps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x88,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
-; CHECK-LABEL: expand7
-; CHECK-NOT: vexpand
-; CHECK: vmovupd
define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
+; CHECK-LABEL: expand7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
ret <8 x double> %res
}
-; CHECK-LABEL: expand8
-; CHECK-NOT: vexpandps %xmm0
define <4 x float> @expand8(<4 x float> %data) {
+; CHECK-LABEL: expand8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
-; CHECK-LABEL: expand9
-; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: expand9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
-; CHECK-LABEL: expand10
-; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: expand10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
-define <8 x float> @test_x86_mask_blend_ps_256(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
- ; CHECK: vblendmps %ymm1, %ymm0
- %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a1, <8 x float> %a2, i8 %a0) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly
-
-define <4 x double> @test_x86_mask_blend_pd_256(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
- ; CHECK: vblendmpd %ymm1, %ymm0
- %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a1, <4 x double> %a2, i8 %a0) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-
-define <4 x double> @test_x86_mask_blend_pd_256_memop(<4 x double> %a, <4 x double>* %ptr, i8 %mask) {
- ; CHECK-LABEL: test_x86_mask_blend_pd_256_memop
- ; CHECK: vblendmpd (%
- %b = load <4 x double>, <4 x double>* %ptr
- %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a, <4 x double> %b, i8 %mask) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readonly
-
-; CHECK-LABEL: test_x86_mask_blend_d_256
-; CHECK: vpblendmd
-define <8 x i32> @test_x86_mask_blend_d_256(i8 %a0, <8 x i32> %a1, <8 x i32> %a2) {
- %res = call <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32> %a1, <8 x i32> %a2, i8 %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly
-
-define <4 x i64> @test_x86_mask_blend_q_256(i8 %a0, <4 x i64> %a1, <4 x i64> %a2) {
- ; CHECK: vpblendmq
- %res = call <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64> %a1, <4 x i64> %a2, i8 %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64>, <4 x i64>, i8) nounwind readonly
-
-define <4 x float> @test_x86_mask_blend_ps_128(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: vblendmps %xmm1, %xmm0
- %res = call <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float> %a1, <4 x float> %a2, i8 %a0) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly
-
-define <2 x double> @test_x86_mask_blend_pd_128(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: vblendmpd %xmm1, %xmm0
- %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a1, <2 x double> %a2, i8 %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-
-define <2 x double> @test_x86_mask_blend_pd_128_memop(<2 x double> %a, <2 x double>* %ptr, i8 %mask) {
- ; CHECK-LABEL: test_x86_mask_blend_pd_128_memop
- ; CHECK: vblendmpd (%
- %b = load <2 x double>, <2 x double>* %ptr
- %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a, <2 x double> %b, i8 %mask) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double>, <2 x double>, i8) nounwind readonly
-
-define <4 x i32> @test_x86_mask_blend_d_128(i8 %a0, <4 x i32> %a1, <4 x i32> %a2) {
- ; CHECK: vpblendmd
- %res = call <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32> %a1, <4 x i32> %a2, i8 %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly
-
-define <2 x i64> @test_x86_mask_blend_q_128(i8 %a0, <2 x i64> %a1, <2 x i64> %a2) {
- ; CHECK: vpblendmq
- %res = call <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64> %a1, <2 x i64> %a2, i8 %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64>, <2 x i64>, i8) nounwind readonly
-
-
define < 2 x i64> @test_mask_mul_epi32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rr_128
- ;CHECK: vpmuldq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0xc1]
+; CHECK-LABEL: test_mask_mul_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rrk_128
- ;CHECK: vpmuldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0xd1]
+; CHECK-LABEL: test_mask_mul_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rrkz_128
- ;CHECK: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1]
+; CHECK-LABEL: test_mask_mul_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rm_128
- ;CHECK: vpmuldq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmk_128
- ;CHECK: vpmuldq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0x0f]
+; CHECK-LABEL: test_mask_mul_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmkz_128
- ;CHECK: vpmuldq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmb_128
- ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
@@ -921,8 +1088,12 @@ define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
}
define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmbk_128
- ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f]
+; CHECK-LABEL: test_mask_mul_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
@@ -932,8 +1103,11 @@ define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2
}
define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmbkz_128
- ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, < 2 x i32> zeroinitializer
@@ -945,53 +1119,73 @@ define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8
declare < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32>, < 4 x i32>, < 2 x i64>, i8)
define < 4 x i64> @test_mask_mul_epi32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rr_256
- ;CHECK: vpmuldq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0xc1]
+; CHECK-LABEL: test_mask_mul_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rrk_256
- ;CHECK: vpmuldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0xd1]
+; CHECK-LABEL: test_mask_mul_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rrkz_256
- ;CHECK: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0xc1]
+; CHECK-LABEL: test_mask_mul_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rm_256
- ;CHECK: vpmuldq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmk_256
- ;CHECK: vpmuldq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0x0f]
+; CHECK-LABEL: test_mask_mul_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmkz_256
- ;CHECK: vpmuldq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmb_256
- ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1001,8 +1195,12 @@ define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
}
define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmbk_256
- ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x28,0x0f]
+; CHECK-LABEL: test_mask_mul_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x28,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1012,8 +1210,11 @@ define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4
}
define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmbkz_256
- ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1025,53 +1226,73 @@ define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8
declare < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32>, < 8 x i32>, < 4 x i64>, i8)
define < 2 x i64> @test_mask_mul_epu32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rr_128
- ;CHECK: vpmuludq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf4,0xc1]
+; CHECK-LABEL: test_mask_mul_epu32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xf4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rrk_128
- ;CHECK: vpmuludq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xd1]
+; CHECK-LABEL: test_mask_mul_epu32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rrkz_128
- ;CHECK: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1]
+; CHECK-LABEL: test_mask_mul_epu32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rm_128
- ;CHECK: vpmuludq (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmk_128
- ;CHECK: vpmuludq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0x0f]
+; CHECK-LABEL: test_mask_mul_epu32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmkz_128
- ;CHECK: vpmuludq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmb_128
- ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
@@ -1081,8 +1302,12 @@ define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
}
define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmbk_128
- ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x0f]
+; CHECK-LABEL: test_mask_mul_epu32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
@@ -1092,8 +1317,11 @@ define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2
}
define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmbkz_128
- ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, < 2 x i32> zeroinitializer
@@ -1105,53 +1333,73 @@ define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8
declare < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32>, < 4 x i32>, < 2 x i64>, i8)
define < 4 x i64> @test_mask_mul_epu32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rr_256
- ;CHECK: vpmuludq %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf4,0xc1]
+; CHECK-LABEL: test_mask_mul_epu32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xf4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rrk_256
- ;CHECK: vpmuludq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xd1]
+; CHECK-LABEL: test_mask_mul_epu32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rrkz_256
- ;CHECK: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1]
+; CHECK-LABEL: test_mask_mul_epu32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rm_256
- ;CHECK: vpmuludq (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmk_256
- ;CHECK: vpmuludq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0x0f]
+; CHECK-LABEL: test_mask_mul_epu32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmkz_256
- ;CHECK: vpmuludq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmb_256
- ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1161,8 +1409,12 @@ define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
}
define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmbk_256
- ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x0f]
+; CHECK-LABEL: test_mask_mul_epu32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1172,8 +1424,11 @@ define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4
}
define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmbkz_256
- ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1185,53 +1440,73 @@ define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8
declare < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32>, < 8 x i32>, < 4 x i64>, i8)
define <4 x i32> @test_mask_add_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_add_epi32_rr_128
- ;CHECK: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-LABEL: test_mask_add_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rrk_128
- ;CHECK: vpaddd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xd1]
+; CHECK-LABEL: test_mask_add_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rrkz_128
- ;CHECK: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
+; CHECK-LABEL: test_mask_add_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi32_rm_128
- ;CHECK: vpaddd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmk_128
- ;CHECK: vpaddd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x0f]
+; CHECK-LABEL: test_mask_add_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmkz_128
- ;CHECK: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi32_rmb_128
- ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1240,8 +1515,12 @@ define <4 x i32> @test_mask_add_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
}
define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmbk_128
- ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x0f]
+; CHECK-LABEL: test_mask_add_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1250,8 +1529,11 @@ define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i
}
define <4 x i32> @test_mask_add_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmbkz_128
- ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1262,53 +1544,73 @@ define <4 x i32> @test_mask_add_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %m
declare <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
define <4 x i32> @test_mask_sub_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rr_128
- ;CHECK: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0xc1]
+; CHECK-LABEL: test_mask_sub_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rrk_128
- ;CHECK: vpsubd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0xd1]
+; CHECK-LABEL: test_mask_sub_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rrkz_128
- ;CHECK: vpsubd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0xc1]
+; CHECK-LABEL: test_mask_sub_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rm_128
- ;CHECK: (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmk_128
- ;CHECK: vpsubd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0x0f]
+; CHECK-LABEL: test_mask_sub_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmkz_128
- ;CHECK: vpsubd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmb_128
- ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1317,8 +1619,12 @@ define <4 x i32> @test_mask_sub_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
}
define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmbk_128
- ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfa,0x0f]
+; CHECK-LABEL: test_mask_sub_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfa,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1327,8 +1633,11 @@ define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i
}
define <4 x i32> @test_mask_sub_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmbkz_128
- ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1339,53 +1648,73 @@ define <4 x i32> @test_mask_sub_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %m
declare <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
define <8 x i32> @test_mask_sub_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rr_256
- ;CHECK: vpsubd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0xc1]
+; CHECK-LABEL: test_mask_sub_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rrk_256
- ;CHECK: vpsubd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0xd1]
+; CHECK-LABEL: test_mask_sub_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rrkz_256
- ;CHECK: vpsubd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0xc1]
+; CHECK-LABEL: test_mask_sub_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rm_256
- ;CHECK: vpsubd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmk_256
- ;CHECK: vpsubd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0x0f]
+; CHECK-LABEL: test_mask_sub_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmkz_256
- ;CHECK: vpsubd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmb_256
- ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1394,8 +1723,12 @@ define <8 x i32> @test_mask_sub_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
}
define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmbk_256
- ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfa,0x0f]
+; CHECK-LABEL: test_mask_sub_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfa,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1404,8 +1737,11 @@ define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i
}
define <8 x i32> @test_mask_sub_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmbkz_256
- ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1416,53 +1752,73 @@ define <8 x i32> @test_mask_sub_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %m
declare <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
define <8 x i32> @test_mask_add_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_add_epi32_rr_256
- ;CHECK: vpaddd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
+; CHECK-LABEL: test_mask_add_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rrk_256
- ;CHECK: vpaddd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xd1]
+; CHECK-LABEL: test_mask_add_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rrkz_256
- ;CHECK: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
+; CHECK-LABEL: test_mask_add_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi32_rm_256
- ;CHECK: vpaddd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmk_256
- ;CHECK: vpaddd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x0f]
+; CHECK-LABEL: test_mask_add_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmkz_256
- ;CHECK: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi32_rmb_256
- ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1471,8 +1827,12 @@ define <8 x i32> @test_mask_add_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
}
define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmbk_256
- ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x0f]
+; CHECK-LABEL: test_mask_add_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1481,8 +1841,11 @@ define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i
}
define <8 x i32> @test_mask_add_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmbkz_256
- ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1492,1079 +1855,443 @@ define <8 x i32> @test_mask_add_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %m
declare <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_and_epi32_rr_128
- ;CHECK: vpandd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rrk_128
- ;CHECK: vpandd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rrkz_128
- ;CHECK: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_epi32_rm_128
- ;CHECK: vpandd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmk_128
- ;CHECK: vpandd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmkz_128
- ;CHECK: vpandd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_epi32_rmb_128
- ;CHECK: vpandd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmbk_128
- ;CHECK: vpandd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmbkz_128
- ;CHECK: vpandd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_and_epi32_rr_256
- ;CHECK: vpandd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rrk_256
- ;CHECK: vpandd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rrkz_256
- ;CHECK: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_epi32_rm_256
- ;CHECK: vpandd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmk_256
- ;CHECK: vpandd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmkz_256
- ;CHECK: vpandd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_epi32_rmb_256
- ;CHECK: vpandd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmbk_256
- ;CHECK: vpandd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmbkz_256
- ;CHECK: vpandd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_or_epi32_rr_128
- ;CHECK: vpord %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rrk_128
- ;CHECK: vpord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rrkz_128
- ;CHECK: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_epi32_rm_128
- ;CHECK: vpord (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmk_128
- ;CHECK: vpord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmkz_128
- ;CHECK: vpord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_epi32_rmb_128
- ;CHECK: vpord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmbk_128
- ;CHECK: vpord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmbkz_128
- ;CHECK: vpord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_or_epi32_rr_256
- ;CHECK: vpord %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rrk_256
- ;CHECK: vpord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rrkz_256
- ;CHECK: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_epi32_rm_256
- ;CHECK: vpord (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmk_256
- ;CHECK: vpord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmkz_256
- ;CHECK: vpord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_epi32_rmb_256
- ;CHECK: vpord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmbk_256
- ;CHECK: vpord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmbkz_256
- ;CHECK: vpord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rr_128
- ;CHECK: vpxord %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rrk_128
- ;CHECK: vpxord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rrkz_128
- ;CHECK: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rm_128
- ;CHECK: vpxord (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmk_128
- ;CHECK: vpxord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmkz_128
- ;CHECK: vpxord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmb_128
- ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmbk_128
- ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmbkz_128
- ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rr_256
- ;CHECK: vpxord %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rrk_256
- ;CHECK: vpxord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rrkz_256
- ;CHECK: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rm_256
- ;CHECK: vpxord (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmk_256
- ;CHECK: vpxord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmkz_256
- ;CHECK: vpxord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmb_256
- ;CHECK: vpxord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmbk_256
- ;CHECK: vpxord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmbkz_256
- ;CHECK: vpxord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rr_128
- ;CHECK: vpandnd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rrk_128
- ;CHECK: vpandnd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rrkz_128
- ;CHECK: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rm_128
- ;CHECK: vpandnd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmk_128
- ;CHECK: vpandnd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmkz_128
- ;CHECK: vpandnd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmb_128
- ;CHECK: vpandnd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmbk_128
- ;CHECK: vpandnd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmbkz_128
- ;CHECK: vpandnd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rr_256
- ;CHECK: vpandnd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rrk_256
- ;CHECK: vpandnd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rrkz_256
- ;CHECK: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rm_256
- ;CHECK: vpandnd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmk_256
- ;CHECK: vpandnd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmkz_256
- ;CHECK: vpandnd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmb_256
- ;CHECK: vpandnd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmbk_256
- ;CHECK: vpandnd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmbkz_256
- ;CHECK: vpandnd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rr_128
- ;CHECK: vpandnq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0xc1]
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rrk_128
- ;CHECK: vpandnq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xd1]
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rrkz_128
- ;CHECK: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1]
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rm_128
- ;CHECK: vpandnq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0x07]
- %b = load <2 x i64>, <2 x i64>* %ptr_b
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmk_128
- ;CHECK: vpandnq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x0f]
- %b = load <2 x i64>, <2 x i64>* %ptr_b
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmkz_128
- ;CHECK: vpandnq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x07]
- %b = load <2 x i64>, <2 x i64>* %ptr_b
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmb_128
- ;CHECK: vpandnq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xdf,0x07]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
- %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmbk_128
- ;CHECK: vpandnq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x0f]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
- %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmbkz_128
- ;CHECK: vpandnq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x07]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
- %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
- ret <2 x i64> %res
-}
-
-declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
-define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rr_256
- ;CHECK: vpandnq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0xc1]
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rrk_256
- ;CHECK: vpandnq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xd1]
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rrkz_256
- ;CHECK: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1]
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rm_256
- ;CHECK: vpandnq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0x07]
- %b = load <4 x i64>, <4 x i64>* %ptr_b
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmk_256
- ;CHECK: vpandnq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x0f]
- %b = load <4 x i64>, <4 x i64>* %ptr_b
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmkz_256
- ;CHECK: vpandnq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x07]
- %b = load <4 x i64>, <4 x i64>* %ptr_b
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmb_256
- ;CHECK: vpandnq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xdf,0x07]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
- %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmbk_256
- ;CHECK: vpandnq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x0f]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
- %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmbkz_256
- ;CHECK: vpandnq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x07]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
- %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
- ret <4 x i64> %res
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
+; CHECK-LABEL: test_cmpps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> , <8 x float> , i32, i8)
define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
+; CHECK-LABEL: test_cmpps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> , <4 x float> , i32, i8)
define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
- ;CHECK: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
+; CHECK-LABEL: test_cmppd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> , <4 x double> , i32, i8)
define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
- ;CHECK: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
+; CHECK-LABEL: test_cmppd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8)
define <8 x float> @test_mm512_maskz_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_add_ps_256
- ;CHECK: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_add_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_add_ps_256
- ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_add_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x58,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_add_ps_256
- ;CHECK: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_add_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_add_ps_128
- ;CHECK: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_add_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_add_ps_128
- ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_add_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x58,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_add_ps_128
- ;CHECK: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_add_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_sub_ps_256
- ;CHECK: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_sub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5c,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_sub_ps_256
- ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_sub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5c,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_sub_ps_256
- ;CHECK: vsubps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_sub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5c,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_sub_ps_128
- ;CHECK: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_sub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5c,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_sub_ps_128
- ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_sub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5c,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_sub_ps_128
- ;CHECK: vsubps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_sub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5c,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_mul_ps_256
- ;CHECK: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_mul_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x59,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_mul_ps_256
- ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_mul_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x59,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mul_ps_256
- ;CHECK: vmulps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_mul_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x59,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_mul_ps_128
- ;CHECK: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_mul_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x59,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_mul_ps_128
- ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_mul_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x59,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mul_ps_128
- ;CHECK: vmulps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_mul_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x59,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_div_ps_256
- ;CHECK: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_div_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5e,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_div_ps_256
- ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_div_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5e,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_div_ps_256
- ;CHECK: vdivps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_div_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5e,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_div_ps_128
- ;CHECK: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_div_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5e,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_div_ps_128
- ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_div_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5e,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_div_ps_128
- ;CHECK: vdivps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_div_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5e,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_max_ps_256
- ;CHECK: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_max_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_max_ps_256
- ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_max_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5f,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_max_ps_256
- ;CHECK: vmaxps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_max_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_max_ps_128
- ;CHECK: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_max_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_max_ps_128
- ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_max_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5f,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_max_ps_128
- ;CHECK: vmaxps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_max_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_min_ps_256
- ;CHECK: vminps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_min_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5d,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_min_ps_256
- ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_min_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5d,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_min_ps_256
- ;CHECK: vminps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_min_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5d,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_min_ps_128
- ;CHECK: vminps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_min_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5d,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_min_ps_128
- ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_min_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5d,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_min_ps_128
- ;CHECK: vminps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_min_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5d,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
- ; CHECK-LABEL: test_sqrt_pd_256
- ; CHECK: vsqrtpd
+; CHECK-LABEL: test_sqrt_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
- ; CHECK-LABEL: test_sqrt_ps_256
- ; CHECK: vsqrtps
+; CHECK-LABEL: test_sqrt_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
@@ -2572,8 +2299,10 @@ define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
- ; CHECK-LABEL: test_getexp_pd_256
- ; CHECK: vgetexppd
+; CHECK-LABEL: test_getexp_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vgetexppd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x42,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
ret <4 x double> %res
}
@@ -2581,8 +2310,10 @@ define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
declare <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
define <8 x float> @test_getexp_ps_256(<8 x float> %a0) {
- ; CHECK-LABEL: test_getexp_ps_256
- ; CHECK: vgetexpps
+; CHECK-LABEL: test_getexp_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vgetexpps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x42,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
@@ -2590,11 +2321,14 @@ declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float>
declare <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_128
-; CHECK-NOT: call
-; CHECK: vpmaxsd %xmm
-; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3d,0xd1]
+; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2 ,i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
%res2 = add <4 x i32> %res, %res1
@@ -2603,11 +2337,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %
declare <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_256
-; CHECK-NOT: call
-; CHECK: vpmaxsd %ymm
-; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3d,0xd1]
+; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x3d,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2616,11 +2353,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %
declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_128
-; CHECK-NOT: call
-; CHECK: vpmaxsq %xmm
-; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3d,0xd1]
+; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2629,11 +2369,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %
declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_256
-; CHECK-NOT: call
-; CHECK: vpmaxsq %ymm
-; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3d,0xd1]
+; CHECK-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
@@ -2642,11 +2385,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %
declare <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_128
-; CHECK-NOT: call
-; CHECK: vpmaxud %xmm
-; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2,i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3f,0xd1]
+; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
%res2 = add <4 x i32> %res, %res1
@@ -2655,11 +2401,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %
declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_256
-; CHECK-NOT: call
-; CHECK: vpmaxud %ymm
-; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1]
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x3f,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2668,11 +2417,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %
declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_128
-; CHECK-NOT: call
-; CHECK: vpmaxuq %xmm
-; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3f,0xd1]
+; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2681,11 +2433,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %
declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_256
-; CHECK-NOT: call
-; CHECK: vpmaxuq %ymm
-; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxuq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3f,0xd1]
+; CHECK-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
@@ -2694,11 +2449,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %
declare <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_128
-; CHECK-NOT: call
-; CHECK: vpminsd %xmm
-; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x39,0xd1]
+; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
%res2 = add <4 x i32> %res, %res1
@@ -2707,11 +2465,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> %
declare <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_256
-; CHECK-NOT: call
-; CHECK: vpminsd %ymm
-; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x39,0xd1]
+; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x39,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2720,11 +2481,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %
declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_128
-; CHECK-NOT: call
-; CHECK: vpminsq %xmm
-; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x39,0xd1]
+; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x39,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2733,11 +2497,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %
declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_256
-; CHECK-NOT: call
-; CHECK: vpminsq %ymm
-; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x39,0xd1]
+; CHECK-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
@@ -2746,11 +2513,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> %
declare <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_128
-; CHECK-NOT: call
-; CHECK: vpminud %xmm
-; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3b,0xd1]
+; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
%res2 = add <4 x i32> %res, %res1
@@ -2759,11 +2529,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> %
declare <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_256
-; CHECK-NOT: call
-; CHECK: vpminud %ymm
-; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3b,0xd1]
+; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x3b,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2772,11 +2545,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %
declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_128
-; CHECK-NOT: call
-; CHECK: vpminuq %xmm
-; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3b,0xd1]
+; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2785,11 +2561,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %
declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_256
-; CHECK-NOT: call
-; CHECK: vpminuq %ymm
-; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminuq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3b,0xd1]
+; CHECK-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
@@ -2798,12 +2577,15 @@ define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %
declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2d %xmm{{.*}}{%k1}
-; CHECK-NOT: {z}
define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xda]
+; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xca]
+; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -2812,11 +2594,15 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i
declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2d %xmm{{.*}}{%k1} {z}
define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xda]
+; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xca]
+; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -2825,12 +2611,15 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2d %ymm{{.*}}{%k1}
-; CHECK-NOT: {z}
define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xda]
+; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xca]
+; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2839,11 +2628,15 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i
declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2d {{.*}}{%k1} {z}
define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xda]
+; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xca]
+; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2852,11 +2645,15 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x
declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2pd %xmm{{.*}}{%k1}
define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x77,0xda]
+; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x77,0xca]
+; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -2865,11 +2662,15 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0,
declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2pd %ymm{{.*}}{%k1}
define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x77,0xda]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x77,0xca]
+; CHECK-NEXT: vaddpd %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -2878,11 +2679,15 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0,
declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2ps %xmm{{.*}}{%k1}
define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xda]
+; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x77,0xca]
+; CHECK-NEXT: vaddps %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -2891,11 +2696,15 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <
declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2ps %ymm{{.*}}{%k1}
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x77,0xda]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x77,0xca]
+; CHECK-NEXT: vaddps %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -2904,11 +2713,14 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsq{{.*}}{%k1}
define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x1f,0xc8]
+; CHECK-NEXT: vpabsq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x1f,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2917,11 +2729,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x
declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsq{{.*}}{%k1}
define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x1f,0xc8]
+; CHECK-NEXT: vpabsq %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x1f,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -2930,11 +2745,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x
declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsd{{.*}}{%k1}
define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1e,0xc8]
+; CHECK-NEXT: vpabsd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x1e,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -2943,25 +2761,30 @@ define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x
declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsd{{.*}}{%k1}
define <8 x i32>@test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1e,0xc8]
+; CHECK-NEXT: vpabsd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x1e,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}
-
declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefpd{{.*}}{%k1}
define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vscalefpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x2c,0xd1]
+; CHECK-NEXT: vscalefpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x2c,0xc1]
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -2970,11 +2793,14 @@ define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2
declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefpd{{.*}}{%k1}
define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vscalefpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x2c,0xd1]
+; CHECK-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x2c,0xc1]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -2982,11 +2808,15 @@ define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4
}
declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefps{{.*}}{%k1}
+
define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vscalefps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2c,0xd1]
+; CHECK-NEXT: vscalefps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x2c,0xc1]
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -2994,250 +2824,33 @@ define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x
}
declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefps{{.*}}{%k1}
+
define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vscalefps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2c,0xd1]
+; CHECK-NEXT: vscalefps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x2c,0xc1]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
ret <8 x float> %res2
}
-declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128:
-; CHECK: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[1],k1[1]
-; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x15,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[1],xmm1[1]
- %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
- %res2 = fadd <2 x double> %res, %res1
- ret <2 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_256:
-; CHECK: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[1],k1[1],ymm2[3],k1[3]
-; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x15,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
- %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
- %res2 = fadd <4 x double> %res, %res1
- ret <4 x double> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_128:
-; CHECK: vunpckhps %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[2],k1[2],xmm2[3],k1[3]
-; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x15,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256:
-; CHECK: ## BB#0:
-; CHECK: vunpckhps %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[2],k1[2],ymm2[3],k1[3],ymm2[6],k1[6],ymm2[7],k1[7]
-; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x15,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
- %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
- %res2 = fadd <8 x float> %res, %res1
- ret <8 x float> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_128:
-; CHECK: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0]
-; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x14,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0]
- %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
- %res2 = fadd <2 x double> %res, %res1
- ret <2 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_256:
-; CHECK: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[2],k1[2]
-; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x14,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
- %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
- %res2 = fadd <4 x double> %res, %res1
- ret <4 x double> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_128:
-; CHECK: vunpcklps %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1]
-; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x14,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
- %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_256:
-; CHECK: vunpcklps %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[4],k1[4],ymm2[5],k1[5]
-; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x14,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
- %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
- %res2 = fadd <8 x float> %res, %res1
- ret <8 x float> %res2
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_128:
-; CHECK: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[2],k1[2],xmm2[3],k1[3]
-; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6a,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
- %res2 = add <4 x i32> %res, %res1
- ret <4 x i32> %res2
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_128:
-; CHECK: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1]
-; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x62,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
- %res2 = add <4 x i32> %res, %res1
- ret <4 x i32> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_256:
-; CHECK: ## BB#0:
-; CHECK: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[2],k1[2],ymm2[3],k1[3],ymm2[6],k1[6],ymm2[7],k1[7]
-; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6a,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
- %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_256:
-; CHECK: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[4],k1[4],ymm2[5],k1[5]
-; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x62,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
- %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128:
-; CHECK: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[1],k1[1]
-; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6d,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[1],xmm1[1]
- %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res2 = add <2 x i64> %res, %res1
- ret <2 x i64> %res2
-}
-
-declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128:
-; CHECK: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0]
-; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6c,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0]
- %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res2 = add <2 x i64> %res, %res1
- ret <2 x i64> %res2
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256:
-; CHECK: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[2],k1[2]
-; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6c,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
- %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res2 = add <4 x i64> %res, %res1
- ret <4 x i64> %res2
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256:
-; CHECK: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[1],k1[1],ymm2[3],k1[3]
-; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6d,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
- %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res2 = add <4 x i64> %res, %res1
- ret <4 x i64> %res2
-}
-
declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_128:
-; CHECK: vpmovqb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
+; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2]
+; CHECK-NEXT: vpmovqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3250,8 +2863,11 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_128:
-; CHECK: vpmovqb %xmm0, (%rdi)
-; CHECK: vpmovqb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x32,0x07]
+; CHECK-NEXT: vpmovqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3261,9 +2877,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_128:
-; CHECK: vpmovsqb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
+; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2]
+; CHECK-NEXT: vpmovsqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3276,8 +2897,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_128:
-; CHECK: vpmovsqb %xmm0, (%rdi)
-; CHECK: vpmovsqb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x22,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3287,9 +2911,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_128:
-; CHECK: vpmovusqb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
+; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2]
+; CHECK-NEXT: vpmovusqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3302,8 +2931,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_128:
-; CHECK: vpmovusqb %xmm0, (%rdi)
-; CHECK: vpmovusqb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x12,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3313,9 +2945,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_256:
-; CHECK: vpmovqb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
+; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2]
+; CHECK-NEXT: vpmovqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3328,8 +2965,11 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_256:
-; CHECK: vpmovqb %ymm0, (%rdi)
-; CHECK: vpmovqb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x32,0x07]
+; CHECK-NEXT: vpmovqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3339,9 +2979,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_256:
-; CHECK: vpmovsqb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
+; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2]
+; CHECK-NEXT: vpmovsqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3354,8 +2999,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_256:
-; CHECK: vpmovsqb %ymm0, (%rdi)
-; CHECK: vpmovsqb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x22,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3365,9 +3013,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_256:
-; CHECK: vpmovusqb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
+; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2]
+; CHECK-NEXT: vpmovusqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3380,8 +3033,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_256:
-; CHECK: vpmovusqb %ymm0, (%rdi)
-; CHECK: vpmovusqb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x12,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3391,9 +3047,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_128:
-; CHECK: vpmovqw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3406,8 +3067,11 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_128:
-; CHECK: vpmovqw %xmm0, (%rdi)
-; CHECK: vpmovqw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x34,0x07]
+; CHECK-NEXT: vpmovqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3417,9 +3081,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_128:
-; CHECK: vpmovsqw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
+; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2]
+; CHECK-NEXT: vpmovsqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3432,8 +3101,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_128:
-; CHECK: vpmovsqw %xmm0, (%rdi)
-; CHECK: vpmovsqw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x24,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3443,9 +3115,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_128:
-; CHECK: vpmovusqw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
+; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2]
+; CHECK-NEXT: vpmovusqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3458,8 +3135,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_128:
-; CHECK: vpmovusqw %xmm0, (%rdi)
-; CHECK: vpmovusqw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x14,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3469,9 +3149,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_256:
-; CHECK: vpmovqw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
+; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2]
+; CHECK-NEXT: vpmovqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3484,8 +3169,11 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_256:
-; CHECK: vpmovqw %ymm0, (%rdi)
-; CHECK: vpmovqw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x34,0x07]
+; CHECK-NEXT: vpmovqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3495,9 +3183,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_256:
-; CHECK: vpmovsqw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
+; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2]
+; CHECK-NEXT: vpmovsqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3510,8 +3203,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_256:
-; CHECK: vpmovsqw %ymm0, (%rdi)
-; CHECK: vpmovsqw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x24,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3521,9 +3217,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_256:
-; CHECK: vpmovusqw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
+; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2]
+; CHECK-NEXT: vpmovusqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3536,8 +3237,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_256:
-; CHECK: vpmovusqw %ymm0, (%rdi)
-; CHECK: vpmovusqw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x14,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3547,9 +3251,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_128:
-; CHECK: vpmovqd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
+; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc2]
+; CHECK-NEXT: vpmovqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3562,8 +3271,11 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_128:
-; CHECK: vpmovqd %xmm0, (%rdi)
-; CHECK: vpmovqd %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x35,0x07]
+; CHECK-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3573,9 +3285,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_128:
-; CHECK: vpmovsqd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
+; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc2]
+; CHECK-NEXT: vpmovsqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3588,8 +3305,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_128:
-; CHECK: vpmovsqd %xmm0, (%rdi)
-; CHECK: vpmovsqd %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x25,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3599,9 +3319,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_128:
-; CHECK: vpmovusqd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
+; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc2]
+; CHECK-NEXT: vpmovusqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3614,8 +3339,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_128:
-; CHECK: vpmovusqd %xmm0, (%rdi)
-; CHECK: vpmovusqd %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x15,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3625,9 +3353,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
-; CHECK: vpmovqd %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqd %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
+; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc2]
+; CHECK-NEXT: vpmovqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3640,8 +3373,11 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_256:
-; CHECK: vpmovqd %ymm0, (%rdi)
-; CHECK: vpmovqd %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x35,0x07]
+; CHECK-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3651,9 +3387,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_256:
-; CHECK: vpmovsqd %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqd %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1]
+; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2]
+; CHECK-NEXT: vpmovsqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3666,8 +3407,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_256:
-; CHECK: vpmovsqd %ymm0, (%rdi)
-; CHECK: vpmovsqd %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x25,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3677,9 +3421,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_256:
-; CHECK: vpmovusqd %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqd %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1]
+; CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc2]
+; CHECK-NEXT: vpmovusqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3692,8 +3441,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_256:
-; CHECK: vpmovusqd %ymm0, (%rdi)
-; CHECK: vpmovusqd %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x15,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3703,9 +3455,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_128:
-; CHECK: vpmovdb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovdb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
+; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2]
+; CHECK-NEXT: vpmovdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3718,8 +3475,11 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_128:
-; CHECK: vpmovdb %xmm0, (%rdi)
-; CHECK: vpmovdb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x31,0x07]
+; CHECK-NEXT: vpmovdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3729,9 +3489,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_128:
-; CHECK: vpmovsdb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsdb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
+; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2]
+; CHECK-NEXT: vpmovsdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3744,8 +3509,11 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_128:
-; CHECK: vpmovsdb %xmm0, (%rdi)
-; CHECK: vpmovsdb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x21,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3755,9 +3523,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_128:
-; CHECK: vpmovusdb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusdb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
+; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2]
+; CHECK-NEXT: vpmovusdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3770,8 +3543,11 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_128:
-; CHECK: vpmovusdb %xmm0, (%rdi)
-; CHECK: vpmovusdb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x11,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3781,9 +3557,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_256:
-; CHECK: vpmovdb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovdb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
+; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2]
+; CHECK-NEXT: vpmovdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3796,8 +3577,11 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_256:
-; CHECK: vpmovdb %ymm0, (%rdi)
-; CHECK: vpmovdb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x31,0x07]
+; CHECK-NEXT: vpmovdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3807,9 +3591,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_256:
-; CHECK: vpmovsdb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsdb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
+; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2]
+; CHECK-NEXT: vpmovsdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3822,8 +3611,11 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_256:
-; CHECK: vpmovsdb %ymm0, (%rdi)
-; CHECK: vpmovsdb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x21,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3833,9 +3625,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_256:
-; CHECK: vpmovusdb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusdb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
+; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2]
+; CHECK-NEXT: vpmovusdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3848,8 +3645,11 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_256:
-; CHECK: vpmovusdb %ymm0, (%rdi)
-; CHECK: vpmovusdb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x11,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3859,9 +3659,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_128:
-; CHECK: vpmovdw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovdw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
+; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2]
+; CHECK-NEXT: vpmovdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3874,8 +3679,11 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_128:
-; CHECK: vpmovdw %xmm0, (%rdi)
-; CHECK: vpmovdw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07]
+; CHECK-NEXT: vpmovdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3885,9 +3693,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_128:
-; CHECK: vpmovsdw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsdw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
+; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2]
+; CHECK-NEXT: vpmovsdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3900,8 +3713,11 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_128:
-; CHECK: vpmovsdw %xmm0, (%rdi)
-; CHECK: vpmovsdw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x23,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3911,9 +3727,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_128:
-; CHECK: vpmovusdw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusdw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
+; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2]
+; CHECK-NEXT: vpmovusdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3926,8 +3747,11 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_128:
-; CHECK: vpmovusdw %xmm0, (%rdi)
-; CHECK: vpmovusdw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x13,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3937,9 +3761,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_256:
-; CHECK: vpmovdw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovdw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
+; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2]
+; CHECK-NEXT: vpmovdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3952,8 +3781,11 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_256:
-; CHECK: vpmovdw %ymm0, (%rdi)
-; CHECK: vpmovdw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x33,0x07]
+; CHECK-NEXT: vpmovdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3963,9 +3795,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_256:
-; CHECK: vpmovsdw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsdw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
+; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2]
+; CHECK-NEXT: vpmovsdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3978,8 +3815,11 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_256:
-; CHECK: vpmovsdw %ymm0, (%rdi)
-; CHECK: vpmovsdw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3989,9 +3829,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_256:
-; CHECK: vpmovusdw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusdw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
+; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2]
+; CHECK-NEXT: vpmovusdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -4004,8 +3849,11 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_256:
-; CHECK: vpmovusdw %ymm0, (%rdi)
-; CHECK: vpmovusdw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x13,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -4016,12 +3864,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32>, <2 x double>,
define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0xe6,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -4033,12 +3880,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32>, <4 x double>,
define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 ## encoding: [0xc5,0xfe,0xe6,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4050,12 +3896,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8]
+; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5b,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4067,12 +3912,11 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32>, <8 x float>, i
define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8]
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5b,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2)
%res1 = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4084,12 +3928,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8]
+; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0xe6,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4101,12 +3944,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8]
+; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x28,0xe6,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4118,12 +3960,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double>, <4 x float>
define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8]
+; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x5a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4135,12 +3976,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8
define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8]
+; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x5a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4152,12 +3992,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8]
+; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4169,12 +4008,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8]
+; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4186,12 +4024,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8]
+; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x5b,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4203,12 +4040,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8]
+; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x5b,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4220,12 +4056,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float>, <2 x double
define <2 x double>@test_int_x86_avx512_mask_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8]
+; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5a,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -4237,12 +4072,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float>, <4 x double
define <4 x double>@test_int_x86_avx512_mask_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8]
+; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5a,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4254,12 +4088,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8]
+; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4271,12 +4104,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8]
+; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4288,12 +4120,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8]
+; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4305,12 +4136,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8]
+; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xe6,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4322,12 +4152,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8]
+; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4339,12 +4168,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8]
+; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4356,12 +4184,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8]
+; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5b,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4373,12 +4200,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8]
+; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x5b,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4390,12 +4216,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8]
+; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4407,12 +4232,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8]
+; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4424,12 +4248,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -4441,12 +4264,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32>, <4 x double>
define <4 x double>@test_int_x86_avx512_mask_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4458,12 +4280,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4475,12 +4296,11 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2)
%res1 = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4488,12 +4308,15 @@ define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x f
}
declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscalepd {{.*}}{%k1}
-; CHECK: vrndscalepd
+
define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrndscalepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x09,0xc8,0x04]
+; CHECK-NEXT: vrndscalepd $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x09,0xc0,0x58]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 88, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -4501,12 +4324,15 @@ define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <
}
declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscalepd {{.*}}{%k1}
-; CHECK: vrndscalepd
+
define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrndscalepd $4, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x09,0xc8,0x04]
+; CHECK-NEXT: vrndscalepd $88, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x09,0xc0,0x58]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 88, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4514,12 +4340,15 @@ define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <
}
declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscaleps {{.*}}{%k1}
-; CHECK: vrndscaleps
+
define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrndscaleps $88, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x08,0xc8,0x58]
+; CHECK-NEXT: vrndscaleps $4, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x08,0xc0,0x04]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4528,12 +4357,14 @@ define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4
declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscaleps {{.*}}{%k1}
-; CHECK: vrndscaleps
define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrndscaleps $5, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x08,0xc8,0x05]
+; CHECK-NEXT: vrndscaleps $66, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x08,0xc0,0x42]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 66, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4545,17 +4376,16 @@ declare <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: ## ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd9,0x16]
+; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
%res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4)
@@ -4569,17 +4399,16 @@ declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: ## ymm3 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd9,0x16]
+; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
%res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4)
@@ -4593,14 +4422,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32
define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4612,14 +4440,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64>, <4 x i64>, i32
define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -4631,13 +4458,13 @@ declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4
define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01]
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc2,0x01]
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x19,0xc0,0x01]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3)
%res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1)
@@ -4651,14 +4478,13 @@ declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2
define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x26,0xc8,0x0b]
+; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x26,0xd0,0x0b]
+; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x26,0xc0,0x0b]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> zeroinitializer, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 -1)
@@ -4672,12 +4498,11 @@ declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4
define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x26,0xc8,0x0b]
+; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x26,0xc0,0x0b]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4689,12 +4514,11 @@ declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x
define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x26,0xc8,0x0b]
+; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x26,0xc0,0x0b]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4706,12 +4530,11 @@ declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x
define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x26,0xc8,0x0b]
+; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x26,0xc0,0x0b]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4723,17 +4546,16 @@ declare <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double>, <2 x double
define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[1]
-; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: ## xmm3 = k1[0],xmm0[1]
-; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x16]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[1]
+; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xd9,0x16]
+; CHECK-NEXT: ## xmm3 {%k1} {z} = xmm0[0],xmm1[1]
+; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc6,0xc1,0x16]
; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[1]
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> %x3, i8 -1)
%res2 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> zeroinitializer, i8 %x4)
@@ -4747,14 +4569,13 @@ declare <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double>, <4 x double
define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[1],ymm2[3],k1[2]
-; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
+; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc6,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4766,14 +4587,13 @@ declare <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[2,1],k1[1,0]
-; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xd1,0x16]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2,1],xmm1[1,0]
+; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc6,0xc1,0x16]
; CHECK-NEXT: ## xmm0 = xmm0[2,1],xmm1[1,0]
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 %x4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4785,14 +4605,13 @@ declare <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[2,1],k1[1,0],ymm2[6,5],k1[5,4]
-; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4]
+; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc6,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4]
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4804,14 +4623,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32>, <4 x i32>, i32,
define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x16]
+; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xd9,0x16]
+; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x03,0xc1,0x16]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4)
@@ -4825,12 +4643,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32,
define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x16]
+; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xc1,0x16]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4842,12 +4659,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64>, <2 x i64>, i32,
define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x16]
+; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x03,0xc1,0x16]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -4859,127 +4675,29 @@ declare <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64>, <4 x i64>, i32,
define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x03,0xd1,0x16]
+; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xc1,0x16]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
%res2 = add <4 x i64> %res, %res1
ret <4 x i64> %res2
}
-declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm1[0,1,3,2]
-; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = k1[0,1,3,2]
-; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,3,2]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1)
- %res3 = fadd <4 x double> %res, %res1
- %res4 = fadd <4 x double> %res2, %res3
- ret <4 x double> %res4
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm1[1,0]
-; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = k1[1,0]
-; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[1,0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res3, %res2
- ret <2 x double> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm1[2,1,1,0,6,5,5,4]
-; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = k1[2,1,1,0,6,5,5,4]
-; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[2,1,1,0,6,5,5,4]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res3, %res2
- ret <8 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm1[2,1,1,0]
-; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = k1[2,1,1,0]
-; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[2,1,1,0]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
declare <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x0d,0xd1]
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xd9]
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x0d,0xc1]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xcb]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
%res2 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
@@ -4993,14 +4711,13 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x0d,0xd1]
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xd9]
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x0d,0xc1]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xcb]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
@@ -5014,14 +4731,13 @@ declare <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float>, <8 x i3
define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0c,0xd1]
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xd9]
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x0c,0xc1]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xcb]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
%res2 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
@@ -5035,14 +4751,13 @@ declare <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float>, <4 x i3
define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0c,0xd1]
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xd9]
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x0c,0xc1]
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xcb]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3)
%res2 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
@@ -5056,14 +4771,13 @@ declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x floa
define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xd9,0x01]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x18,0xc1,0x01]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
%res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4)
@@ -5077,14 +4791,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i3
define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xd9,0x01]
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x38,0xc1,0x01]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1)
@@ -5099,13 +4812,12 @@ declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4
define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -5117,13 +4829,12 @@ declare <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32>, <4 x i32>, <4
define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -5135,13 +4846,12 @@ declare <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -5153,13 +4863,12 @@ declare <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -5171,13 +4880,12 @@ declare <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64>, <2 x i64>, <2
define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -5189,13 +4897,12 @@ declare <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64>, <2 x i64>, <2
define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4)
%res1 = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -5207,13 +4914,12 @@ declare <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -5225,121 +4931,45 @@ declare <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1)
%res2 = add <4 x i64> %res, %res1
ret <4 x i64> %res2
}
-declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 -1)
- %res1 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask)
- %res2 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
- %res3 = add <8 x i32> %res, %res1
- %res4 = add <8 x i32> %res2, %res3
- ret <8 x i32> %res4
-}
-
-declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
- %res1 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask)
- %res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask)
- %res3 = add <4 x i32> %res, %res1
- %res4 = add <4 x i32> %res2, %res3
- ret <4 x i32> %res4
-}
-
-declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
- %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask)
- %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask)
- %res3 = add <4 x i64> %res, %res1
- %res4 = add <4 x i64> %res2, %res3
- ret <4 x i64> %res4
-}
-
-declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
- %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask)
- %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask)
- %res3 = add <2 x i64> %res, %res1
- %res4 = add <2 x i64> %res2, %res3
- ret <2 x i64> %res4
-}
-
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
- ; CHECK: test_x86_vcvtph2ps_128
- ; CHECK: vcvtph2ps %xmm0, %xmm0
+; CHECK-LABEL: test_x86_vcvtph2ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x13,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8 %mask) {
- ; CHECK: test_x86_vcvtph2ps_128_rrk
- ; CHECK: vcvtph2ps %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_x86_vcvtph2ps_128_rrk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) {
- ; CHECK: test_x86_vcvtph2ps_128_rrkz
- ; CHECK: vcvtph2ps %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_x86_vcvtph2ps_128_rrkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
@@ -5347,228 +4977,133 @@ define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) {
declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>, i8) nounwind readonly
define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
- ; CHECK: test_x86_vcvtph2ps_256
- ; CHECK: vcvtph2ps %xmm0, %ymm0
+; CHECK-LABEL: test_x86_vcvtph2ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x13,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) {
- ; CHECK: test_x86_vcvtph2ps_256_rrk
- ; CHECK: vcvtph2ps %xmm0, %ymm1 {%k1}
+; CHECK-LABEL: test_x86_vcvtph2ps_256_rrk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) {
- ; CHECK: test_x86_vcvtph2ps_256_rrkz
- ; CHECK: vcvtph2ps %xmm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_x86_vcvtph2ps_256_rrkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly
-define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
- ; CHECK: test_x86_vcvtps2ph_128
- ; CHECK: vcvtps2ph $2, %xmm0, %xmm0
- %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) {
+; CHECK-LABEL: test_x86_vcvtps2ph_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
+; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1d,0xc0,0x02]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+ %res0 = add <8 x i16> %res1, %res2
+ %res = add <8 x i16> %res3, %res0
ret <8 x i16> %res
}
-
declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8) nounwind readonly
-define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
- ; CHECK: test_x86_vcvtps2ph_256
- ; CHECK: vcvtps2ph $2, %ymm0, %xmm0
- %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %src) {
+; CHECK-LABEL: test_x86_vcvtps2ph_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
+; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1d,0xc0,0x02]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+ %res0 = add <8 x i16> %res1, %res2
+ %res = add <8 x i16> %res3, %res0
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly
-declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm0[0,0,2,2]
-; CHECK-NEXT: vmovsldup %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2]
-; CHECK-NEXT: vmovsldup %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[0,0,2,2]
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovsldup %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovsldup %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT: vmovshdup %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: vmovshdup %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vmovshdup %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vmovshdup %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm0[0,0]
-; CHECK-NEXT: vmovddup %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = xmm0[0,0]
-; CHECK-NEXT: vmovddup %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[0,0]
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2]
-; CHECK-NEXT: vmovddup %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2]
-; CHECK-NEXT: vmovddup %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
- %res3 = fadd <4 x double> %res, %res1
- %res4 = fadd <4 x double> %res2, %res3
- ret <4 x double> %res4
-}
-
define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_256_rr:
-; CHECK: vrsqrt14ps %ymm0, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_256_rrkz:
-; CHECK: vrsqrt14ps %ymm0, %ymm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_256_rrk:
-; CHECK: vrsqrt14ps %ymm0, %ymm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x4e,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask)
ret <8 x float> %res
}
define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_128_rr:
-; CHECK: vrsqrt14ps %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_128_rrkz:
-; CHECK: vrsqrt14ps %xmm0, %xmm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_128_rrk:
-; CHECK: vrsqrt14ps %xmm0, %xmm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x4e,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
ret <4 x float> %res
}
@@ -5578,42 +5113,60 @@ declare <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float>, <4 x float>, i8
define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_256_rr:
-; CHECK: vrcp14ps %ymm0, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrcp14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_256_rrkz:
-; CHECK: vrcp14ps %ymm0, %ymm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14ps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_256_rrk:
-; CHECK: vrcp14ps %ymm0, %ymm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x4c,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask)
ret <8 x float> %res
}
define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_128_rr:
-; CHECK: vrcp14ps %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_128_rrkz:
-; CHECK: vrcp14ps %xmm0, %xmm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_128_rrk:
-; CHECK: vrcp14ps %xmm0, %xmm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x4c,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
ret <4 x float> %res
}
@@ -5621,45 +5174,62 @@ define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %ma
declare <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
declare <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone
-
define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) {
; CHECK-LABEL: test_rsqrt_pd_256_rr:
-; CHECK: vrsqrt14pd %ymm0, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
ret <4 x double> %res
}
define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_256_rrkz:
-; CHECK: vrsqrt14pd %ymm0, %ymm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
ret <4 x double> %res
}
define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_256_rrk:
-; CHECK: vrsqrt14pd %ymm0, %ymm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4e,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
ret <4 x double> %res
}
define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) {
; CHECK-LABEL: test_rsqrt_pd_128_rr:
-; CHECK: vrsqrt14pd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
ret <2 x double> %res
}
define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_128_rrkz:
-; CHECK: vrsqrt14pd %xmm0, %xmm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask)
ret <2 x double> %res
}
define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_128_rrk:
-; CHECK: vrsqrt14pd %xmm0, %xmm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4e,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
ret <2 x double> %res
}
@@ -5669,42 +5239,60 @@ declare <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double>, <2 x double>,
define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_256_rr:
-; CHECK: vrcp14pd %ymm0, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrcp14pd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
ret <4 x double> %res
}
define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_256_rrkz:
-; CHECK: vrcp14pd %ymm0, %ymm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14pd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
ret <4 x double> %res
}
define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_256_rrk:
-; CHECK: vrcp14pd %ymm0, %ymm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4c,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
ret <4 x double> %res
}
define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_128_rr:
-; CHECK: vrcp14pd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrcp14pd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
ret <2 x double> %res
}
define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_128_rrkz:
-; CHECK: vrcp14pd %xmm0, %xmm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14pd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask)
ret <2 x double> %res
}
define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_128_rrk:
-; CHECK: vrcp14pd %xmm0, %xmm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4c,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
ret <2 x double> %res
}
@@ -5712,69 +5300,22 @@ define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8
declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone
-define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-
- %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
- %res3 = fadd <4 x double> %res, %res1
- %res4 = fadd <4 x double> %res2, %res3
- ret <4 x double> %res4
-}
-declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly
-
-define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
-
- %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly
-
-define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
-
- %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly
-
-
declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8)
define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
-; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1}
-; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm0
-; CHECK: vaddps %ymm1, %ymm0, %ymm0
-; CHECK: vaddps %ymm0, %ymm2, %ymm0
-
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd0,0x00]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xc8,0x00]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1)
%res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask)
%res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask)
@@ -5787,13 +5328,18 @@ declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>,
define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
-; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1}
-; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm0
-; CHECK: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK: vpaddd %ymm0, %ymm2, %ymm0
-
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x43,0xd0,0x00]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xc8,0x00]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask)
%res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
@@ -5807,14 +5353,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64>, <2 x i64>, <2 x i6
define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xd9]
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xd3,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
@@ -5828,14 +5373,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xd9]
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd3,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
@@ -5844,43 +5388,41 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x
ret <4 x i64> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 -1)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0xff]
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x73,0xd0,0xff]
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x73,0xd0,0xff]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 255, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 255, <2 x i64> %x2, i8 -1)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 255, <2 x i64> zeroinitializer, i8 %x3)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res2, %res3
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 -1)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0xff]
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x73,0xd0,0xff]
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x73,0xd0,0xff]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 255, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 255, <4 x i64> %x2, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 255, <4 x i64> zeroinitializer, i8 %x3)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res2, %res3
ret <4 x i64> %res4
@@ -5889,14 +5431,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xd9]
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd2,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
@@ -5910,14 +5451,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xd9]
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd2,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
@@ -5926,63 +5466,61 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x
ret <8 x i32> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32>, i8, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32>, i32, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrld $255, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsrld $255, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsrld $255, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 -1)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xd0,0xff]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 255, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 255, <4 x i32> %x2, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 255, <4 x i32> zeroinitializer, i8 %x3)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res2, %res3
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32>, i32, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrld $255, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsrld $255, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsrld $255, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 -1)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xd0,0xff]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 255, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 255, <8 x i32> %x2, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 255, <8 x i32> zeroinitializer, i8 %x3)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res2, %res3
ret <8 x i32> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xc9,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x72,0xd0,0xff]
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> zeroinitializer, i16 %x3)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res2, %res3
ret <16 x i32> %res4
@@ -5993,14 +5531,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64>, <2 x i64>, <2 x i64
define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x45,0xd1]
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x45,0xd9]
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x45,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6014,14 +5551,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64>, <4 x i64>, <4 x i64
define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x45,0xd1]
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x45,0xd9]
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x45,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6035,14 +5571,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32>, <4 x i32>, <4 x i32
define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x45,0xd1]
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x45,0xd9]
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x45,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6056,14 +5591,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x45,0xd1]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x45,0xd9]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x45,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6077,14 +5611,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe2,0xd1]
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe2,0xd9]
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe2,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6098,14 +5631,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe2,0xd1]
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe2,0xd9]
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe2,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6114,43 +5646,41 @@ define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x
ret <8 x i32> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32>, i8, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32>, i32, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xe0,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res3, %res2
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32>, i32, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrad $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsrad $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xe0,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res3, %res2
ret <8 x i32> %res4
@@ -6161,14 +5691,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64>, <2 x i64>, <2 x i6
define <2 x i64>@test_int_x86_avx512_mask_psra_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe2,0xd1]
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xe2,0xd9]
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe2,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6182,14 +5711,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe2,0xd1]
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xe2,0xd9]
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xe2,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6198,43 +5726,41 @@ define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x
ret <4 x i64> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xe0,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res3, %res2
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xe0,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
@@ -6246,14 +5772,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf2,0xd1]
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf2,0xd9]
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf2,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6267,14 +5792,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf2,0xd1]
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf2,0xd9]
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf2,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6283,43 +5807,41 @@ define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x
ret <8 x i32> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32>, i8, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32>, i32, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xf0,0x03]
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xf0,0x03]
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xf0,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res3, %res2
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32>, i32, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xf0,0x03]
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xf0,0x03]
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xf0,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res3, %res2
ret <8 x i32> %res4
@@ -6330,14 +5852,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf3,0xd1]
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf3,0xd9]
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xf3,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6346,213 +5867,58 @@ define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x
ret <4 x i64> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsllq $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsllq $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xf0,0x03]
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x73,0xf0,0x03]
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x73,0xf0,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res3, %res2
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsllq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xf0,0x03]
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x73,0xf0,0x03]
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x73,0xf0,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
-define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1}
-; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z}
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x float> %res2, %res1
- ret <8 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8*, <8 x float>, i8)
-
-define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovups (%rdi), %ymm0
-; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1}
-; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z}
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x float> %res2, %res1
- ret <8 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8*, <8 x float>, i8)
-
-define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
-; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1}
-; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x double> %res2, %res1
- ret <4 x double> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8*, <4 x double>, i8)
-
-define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovupd (%rdi), %ymm0
-; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1}
-; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x double> %res2, %res1
- ret <4 x double> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8*, <4 x double>, i8)
-
-define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_ps_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1}
-; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x float> %res2, %res1
- ret <4 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8*, <4 x float>, i8)
-
-define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_ps_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovups (%rdi), %xmm0
-; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1}
-; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x float> %res2, %res1
- ret <4 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8*, <4 x float>, i8)
-
-define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_pd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovapd (%rdi), %xmm0
-; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1}
-; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <2 x double> %res2, %res1
- ret <2 x double> %res4
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8*, <2 x double>, i8)
-
-define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_pd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovupd (%rdi), %xmm0
-; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1}
-; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <2 x double> %res2, %res1
- ret <2 x double> %res4
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8*, <2 x double>, i8)
-
declare <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav4_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x46,0xd1]
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x46,0xd9]
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x46,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6566,14 +5932,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x46,0xd1]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x46,0xd9]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x46,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6582,19 +5947,31 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1
ret <8 x i32> %res4
}
+define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si_const:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; CHECK-NEXT: ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x05,A,A,A,A]
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI371_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x46,0x05,A,A,A,A]
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI371_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
declare <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x46,0xd1]
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x46,0xd9]
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6603,19 +5980,31 @@ define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %
ret <2 x i64> %res4
}
+define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128_const:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} xmm0 = [2,18446744073709551607]
+; CHECK-NEXT: ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x05,A,A,A,A]
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI373_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A]
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI373_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> <i64 2, i64 -9>, <2 x i64> <i64 1, i64 90>, <2 x i64> zeroinitializer, i8 -1)
+ ret <2 x i64> %res
+}
+
declare <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x46,0xd1]
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x46,0xd9]
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x46,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6629,14 +6018,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64>, <2 x i64>, <2 x i64
define <2 x i64>@test_int_x86_avx512_mask_psllv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv2_di:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x47,0xd1]
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x47,0xd9]
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x47,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6650,14 +6038,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64>, <4 x i64>, <4 x i64
define <4 x i64>@test_int_x86_avx512_mask_psllv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_di:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x47,0xd1]
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x47,0xd9]
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x47,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6671,14 +6058,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32>, <4 x i32>, <4 x i32
define <4 x i32>@test_int_x86_avx512_mask_psllv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x47,0xd1]
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x47,0xd9]
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x47,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6692,14 +6078,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x47,0xd1]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x47,0xd9]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x47,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6713,14 +6098,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x14,0xd1]
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x14,0xd9]
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6734,14 +6118,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x14,0xd1]
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x14,0xd9]
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6755,14 +6138,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x14,0xd1]
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x14,0xd9]
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6776,14 +6158,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x14,0xd1]
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x14,0xd9]
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6791,85 +6172,82 @@ define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i8, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res3, %res2
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32>, i32, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res3, %res2
ret <8 x i32> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res3, %res2
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
@@ -6880,14 +6258,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x15,0xd1]
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x15,0xd9]
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6901,14 +6278,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x15,0xd1]
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x15,0xd9]
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6922,14 +6298,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x15,0xd1]
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x15,0xd9]
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6943,14 +6318,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x15,0xd1]
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x15,0xd9]
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6959,85 +6333,81 @@ define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %
ret <4 x i64> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32>, i8, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32>, i32, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vprord $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res3, %res2
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32>, i32, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vprord $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res3, %res2
ret <8 x i32> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res3, %res2
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
@@ -7048,14 +6418,16 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x31,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x31,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x31,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
@@ -7069,14 +6441,16 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x31,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x31,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
@@ -7090,14 +6464,16 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x32,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x32,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
@@ -7111,14 +6487,16 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x32,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x32,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
@@ -7132,14 +6510,16 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x35,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x35,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x35,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1)
@@ -7153,14 +6533,16 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x35,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x35,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1)
@@ -7174,14 +6556,16 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x33,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x33,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x33,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
@@ -7195,14 +6579,16 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x33,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x33,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
@@ -7216,14 +6602,16 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x34,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x34,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x34,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
@@ -7237,14 +6625,16 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x34,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x34,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
@@ -7258,14 +6648,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x21,0xc8]
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x21,0xd0]
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x21,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
@@ -7279,14 +6668,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x21,0xc8]
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xd0]
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x21,0xc0]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
@@ -7300,14 +6688,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x22,0xc8]
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x22,0xd0]
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x22,0xc0]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
@@ -7321,14 +6708,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x22,0xc8]
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xd0]
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x22,0xc0]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
@@ -7342,14 +6728,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x23,0xc8]
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x23,0xd0]
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x23,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
@@ -7363,14 +6748,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x23,0xc8]
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xd0]
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x23,0xc0]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
@@ -7384,14 +6768,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8]
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x24,0xd0]
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x24,0xc0]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
@@ -7405,14 +6788,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8]
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x24,0xd0]
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x24,0xc0]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
@@ -7420,3 +6802,494 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64>
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
+
+declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0]
+; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xd8]
+; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xcb]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res3, %res2
+ ret <4 x double> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0]
+; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xd8]
+; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x36,0xc0]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x16,0xd8]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0x16,0xc0]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xcb]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x36,0xd8]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0x36,0xc0]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
+; CHECK-NEXT: vpxord %xmm4, %xmm4, %xmm4 ## encoding: [0x62,0xf1,0x5d,0x08,0xef,0xe4]
+; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
+; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xcc]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 3, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8)
+
+define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xda,0x05]
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xc2,0x03]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 3, i8 %x4)
+ ;%res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ ;%res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res3
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
+; CHECK-NEXT: vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
+; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
+; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
+; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xcc]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res3, %res2
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8)
+
+define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xda,0x05]
+; CHECK-NEXT: vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
+; CHECK-NEXT: vmovaps %ymm0, %ymm5 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xe8]
+; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
+; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
+; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xcd]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
+ %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
+ %res2 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res3, %res2
+ ret <4 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xda,0x05]
+; CHECK-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xe0]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xe2,0x05]
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xc2,0x05]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc4]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8)
+
+define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xda,0x05]
+; CHECK-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xe0]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xe2,0x05]
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xc2,0x05]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc4]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4)
+ %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xda,0x05]
+; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xe0]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm4 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xe2,0x05]
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xc2,0x05]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm4, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc4]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 5, i8 %x4)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xda,0x05]
+; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xe0]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm4 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xe2,0x05]
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xc2,0x05]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm4, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc4]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 5, i8 %x4)
+ %res2 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8)
+
+define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8@test_int_x86_avx512_ptestm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8)
+
+define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32>, <8 x i32>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xd7]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xd7]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xd7]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd7]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
diff --git a/test/CodeGen/X86/avx512vl-logic.ll b/test/CodeGen/X86/avx512vl-logic.ll
index 02cb8f978656..d6e1a7dd5391 100644
--- a/test/CodeGen/X86/avx512vl-logic.ll
+++ b/test/CodeGen/X86/avx512vl-logic.ll
@@ -13,6 +13,18 @@ entry:
ret <8 x i32> %x
}
+; CHECK-LABEL: vpandnd256
+; CHECK: vpandnd %ymm
+; CHECK: ret
+define <8 x i32> @vpandnd256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %b2 = xor <8 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %x = and <8 x i32> %a2, %b2
+ ret <8 x i32> %x
+}
+
; CHECK-LABEL: vpord256
; CHECK: vpord %ymm
; CHECK: ret
@@ -46,6 +58,18 @@ entry:
ret <4 x i64> %x
}
+; CHECK-LABEL: vpandnq256
+; CHECK: vpandnq %ymm
+; CHECK: ret
+define <4 x i64> @vpandnq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
+ %b2 = xor <4 x i64> %b, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %x = and <4 x i64> %a2, %b2
+ ret <4 x i64> %x
+}
+
; CHECK-LABEL: vporq256
; CHECK: vporq %ymm
; CHECK: ret
@@ -81,6 +105,18 @@ entry:
ret <4 x i32> %x
}
+; CHECK-LABEL: vpandnd128
+; CHECK: vpandnd %xmm
+; CHECK: ret
+define <4 x i32> @vpandnd128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>
+ %b2 = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %x = and <4 x i32> %a2, %b2
+ ret <4 x i32> %x
+}
+
; CHECK-LABEL: vpord128
; CHECK: vpord %xmm
; CHECK: ret
@@ -114,6 +150,18 @@ entry:
ret <2 x i64> %x
}
+; CHECK-LABEL: vpandnq128
+; CHECK: vpandnq %xmm
+; CHECK: ret
+define <2 x i64> @vpandnq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <2 x i64> %a, <i64 1, i64 1>
+ %b2 = xor <2 x i64> %b, <i64 -1, i64 -1>
+ %x = and <2 x i64> %a2, %b2
+ ret <2 x i64> %x
+}
+
; CHECK-LABEL: vporq128
; CHECK: vporq %xmm
; CHECK: ret
diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll
index 18fa0a142a2d..0838fb5c0439 100644
--- a/test/CodeGen/X86/avx512vl-mov.ll
+++ b/test/CodeGen/X86/avx512vl-mov.ll
@@ -1,153 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
-; CHECK-LABEL: test_256_1
-; CHECK: vmovdqu32
-; CHECK: ret
define <8 x i32> @test_256_1(i8 * %addr) {
+; CHECK-LABEL: test_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
%res = load <8 x i32>, <8 x i32>* %vaddr, align 1
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_2
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test_256_2(i8 * %addr) {
+; CHECK-LABEL: test_256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
%res = load <8 x i32>, <8 x i32>* %vaddr, align 32
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_3
-; CHECK: vmovdqa64
-; CHECK: ret
define void @test_256_3(i8 * %addr, <4 x i64> %data) {
+; CHECK-LABEL: test_256_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
store <4 x i64>%data, <4 x i64>* %vaddr, align 32
ret void
}
-; CHECK-LABEL: test_256_4
-; CHECK: vmovdqu32
-; CHECK: ret
define void @test_256_4(i8 * %addr, <8 x i32> %data) {
+; CHECK-LABEL: test_256_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7e,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
store <8 x i32>%data, <8 x i32>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_5
-; CHECK: vmovdqa32
-; CHECK: ret
define void @test_256_5(i8 * %addr, <8 x i32> %data) {
+; CHECK-LABEL: test_256_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
store <8 x i32>%data, <8 x i32>* %vaddr, align 32
ret void
}
-; CHECK-LABEL: test_256_6
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test_256_6(i8 * %addr) {
+; CHECK-LABEL: test_256_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
%res = load <4 x i64>, <4 x i64>* %vaddr, align 32
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_7
-; CHECK: vmovdqu64
-; CHECK: ret
define void @test_256_7(i8 * %addr, <4 x i64> %data) {
+; CHECK-LABEL: test_256_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
store <4 x i64>%data, <4 x i64>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_8
-; CHECK: vmovdqu64
-; CHECK: ret
define <4 x i64> @test_256_8(i8 * %addr) {
+; CHECK-LABEL: test_256_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
%res = load <4 x i64>, <4 x i64>* %vaddr, align 1
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_9
-; CHECK: vmovapd {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_9(i8 * %addr, <4 x double> %data) {
+; CHECK-LABEL: test_256_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x28,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
store <4 x double>%data, <4 x double>* %vaddr, align 32
ret void
}
-; CHECK-LABEL: test_256_10
-; CHECK: vmovapd {{.*}} ## encoding: [0x62
-; CHECK: ret
define <4 x double> @test_256_10(i8 * %addr) {
+; CHECK-LABEL: test_256_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
%res = load <4 x double>, <4 x double>* %vaddr, align 32
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_11
-; CHECK: vmovaps {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_11(i8 * %addr, <8 x float> %data) {
+; CHECK-LABEL: test_256_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x28,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
store <8 x float>%data, <8 x float>* %vaddr, align 32
ret void
}
-; CHECK-LABEL: test_256_12
-; CHECK: vmovaps {{.*}} ## encoding: [0x62
-; CHECK: ret
define <8 x float> @test_256_12(i8 * %addr) {
+; CHECK-LABEL: test_256_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
%res = load <8 x float>, <8 x float>* %vaddr, align 32
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_13
-; CHECK: vmovupd {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_13(i8 * %addr, <4 x double> %data) {
+; CHECK-LABEL: test_256_13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x28,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
store <4 x double>%data, <4 x double>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_14
-; CHECK: vmovupd {{.*}} ## encoding: [0x62
-; CHECK: ret
define <4 x double> @test_256_14(i8 * %addr) {
+; CHECK-LABEL: test_256_14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
%res = load <4 x double>, <4 x double>* %vaddr, align 1
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_15
-; CHECK: vmovups {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_15(i8 * %addr, <8 x float> %data) {
+; CHECK-LABEL: test_256_15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x28,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
store <8 x float>%data, <8 x float>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_16
-; CHECK: vmovups {{.*}} ## encoding: [0x62
-; CHECK: ret
define <8 x float> @test_256_16(i8 * %addr) {
+; CHECK-LABEL: test_256_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
%res = load <8 x float>, <8 x float>* %vaddr, align 1
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_17
-; CHECK: vmovdqa32{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
%r = load <8 x i32>, <8 x i32>* %vaddr, align 32
@@ -155,10 +175,13 @@ define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_18
-; CHECK: vmovdqu32{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
%r = load <8 x i32>, <8 x i32>* %vaddr, align 1
@@ -166,10 +189,13 @@ define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_19
-; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
%r = load <8 x i32>, <8 x i32>* %vaddr, align 32
@@ -177,10 +203,13 @@ define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_20
-; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
%r = load <8 x i32>, <8 x i32>* %vaddr, align 1
@@ -188,10 +217,13 @@ define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_21
-; CHECK: vmovdqa64{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
%r = load <4 x i64>, <4 x i64>* %vaddr, align 32
@@ -199,10 +231,13 @@ define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_22
-; CHECK: vmovdqu64{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
%r = load <4 x i64>, <4 x i64>* %vaddr, align 1
@@ -210,10 +245,13 @@ define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_23
-; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
%r = load <4 x i64>, <4 x i64>* %vaddr, align 32
@@ -221,10 +259,13 @@ define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_24
-; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
%r = load <4 x i64>, <4 x i64>* %vaddr, align 1
@@ -232,10 +273,14 @@ define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_25
-; CHECK: vmovaps{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+; CHECK-LABEL: test_256_25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
%r = load <8 x float>, <8 x float>* %vaddr, align 32
@@ -243,10 +288,14 @@ define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_26
-; CHECK: vmovups{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+; CHECK-LABEL: test_256_26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
%r = load <8 x float>, <8 x float>* %vaddr, align 1
@@ -254,10 +303,14 @@ define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_27
-; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
+; CHECK-LABEL: test_256_27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
%r = load <8 x float>, <8 x float>* %vaddr, align 32
@@ -265,10 +318,14 @@ define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_28
-; CHECK: vmovups{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
+; CHECK-LABEL: test_256_28:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
%r = load <8 x float>, <8 x float>* %vaddr, align 1
@@ -276,10 +333,13 @@ define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_29
-; CHECK: vmovapd{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_29:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
%r = load <4 x double>, <4 x double>* %vaddr, align 32
@@ -287,10 +347,13 @@ define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_30
-; CHECK: vmovupd{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_30:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
%r = load <4 x double>, <4 x double>* %vaddr, align 1
@@ -298,10 +361,13 @@ define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_31
-; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_31:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
%r = load <4 x double>, <4 x double>* %vaddr, align 32
@@ -309,10 +375,13 @@ define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_32
-; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
%r = load <4 x double>, <4 x double>* %vaddr, align 1
@@ -320,154 +389,173 @@ define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
ret <4 x double>%res
}
-; CHECK-LABEL: test_128_1
-; CHECK: vmovdqu32
-; CHECK: ret
define <4 x i32> @test_128_1(i8 * %addr) {
+; CHECK-LABEL: test_128_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
%res = load <4 x i32>, <4 x i32>* %vaddr, align 1
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_2
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test_128_2(i8 * %addr) {
+; CHECK-LABEL: test_128_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
%res = load <4 x i32>, <4 x i32>* %vaddr, align 16
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_3
-; CHECK: vmovdqa64
-; CHECK: ret
define void @test_128_3(i8 * %addr, <2 x i64> %data) {
+; CHECK-LABEL: test_128_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
store <2 x i64>%data, <2 x i64>* %vaddr, align 16
ret void
}
-; CHECK-LABEL: test_128_4
-; CHECK: vmovdqu32
-; CHECK: ret
define void @test_128_4(i8 * %addr, <4 x i32> %data) {
+; CHECK-LABEL: test_128_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7e,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
store <4 x i32>%data, <4 x i32>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_5
-; CHECK: vmovdqa32
-; CHECK: ret
define void @test_128_5(i8 * %addr, <4 x i32> %data) {
+; CHECK-LABEL: test_128_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
store <4 x i32>%data, <4 x i32>* %vaddr, align 16
ret void
}
-; CHECK-LABEL: test_128_6
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test_128_6(i8 * %addr) {
+; CHECK-LABEL: test_128_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
%res = load <2 x i64>, <2 x i64>* %vaddr, align 16
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_7
-; CHECK: vmovdqu64
-; CHECK: ret
define void @test_128_7(i8 * %addr, <2 x i64> %data) {
+; CHECK-LABEL: test_128_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
store <2 x i64>%data, <2 x i64>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_8
-; CHECK: vmovdqu64
-; CHECK: ret
define <2 x i64> @test_128_8(i8 * %addr) {
+; CHECK-LABEL: test_128_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
%res = load <2 x i64>, <2 x i64>* %vaddr, align 1
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_9
-; CHECK: vmovapd {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_9(i8 * %addr, <2 x double> %data) {
+; CHECK-LABEL: test_128_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x08,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
store <2 x double>%data, <2 x double>* %vaddr, align 16
ret void
}
-; CHECK-LABEL: test_128_10
-; CHECK: vmovapd {{.*}} ## encoding: [0x62
-; CHECK: ret
define <2 x double> @test_128_10(i8 * %addr) {
+; CHECK-LABEL: test_128_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
%res = load <2 x double>, <2 x double>* %vaddr, align 16
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_11
-; CHECK: vmovaps {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_11(i8 * %addr, <4 x float> %data) {
+; CHECK-LABEL: test_128_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x08,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
store <4 x float>%data, <4 x float>* %vaddr, align 16
ret void
}
-; CHECK-LABEL: test_128_12
-; CHECK: vmovaps {{.*}} ## encoding: [0x62
-; CHECK: ret
define <4 x float> @test_128_12(i8 * %addr) {
+; CHECK-LABEL: test_128_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
%res = load <4 x float>, <4 x float>* %vaddr, align 16
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_13
-; CHECK: vmovupd {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_13(i8 * %addr, <2 x double> %data) {
+; CHECK-LABEL: test_128_13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x08,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
store <2 x double>%data, <2 x double>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_14
-; CHECK: vmovupd {{.*}} ## encoding: [0x62
-; CHECK: ret
define <2 x double> @test_128_14(i8 * %addr) {
+; CHECK-LABEL: test_128_14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
%res = load <2 x double>, <2 x double>* %vaddr, align 1
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_15
-; CHECK: vmovups {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_15(i8 * %addr, <4 x float> %data) {
+; CHECK-LABEL: test_128_15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x08,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
store <4 x float>%data, <4 x float>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_16
-; CHECK: vmovups {{.*}} ## encoding: [0x62
-; CHECK: ret
define <4 x float> @test_128_16(i8 * %addr) {
+; CHECK-LABEL: test_128_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
%res = load <4 x float>, <4 x float>* %vaddr, align 1
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_17
-; CHECK: vmovdqa32{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
%r = load <4 x i32>, <4 x i32>* %vaddr, align 16
@@ -475,10 +563,13 @@ define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_18
-; CHECK: vmovdqu32{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
%r = load <4 x i32>, <4 x i32>* %vaddr, align 1
@@ -486,10 +577,13 @@ define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_19
-; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
%r = load <4 x i32>, <4 x i32>* %vaddr, align 16
@@ -497,10 +591,13 @@ define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_20
-; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
%r = load <4 x i32>, <4 x i32>* %vaddr, align 1
@@ -508,10 +605,13 @@ define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_21
-; CHECK: vmovdqa64{{.*{%k[1-7]} }}
-; CHECK: ret
define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
%r = load <2 x i64>, <2 x i64>* %vaddr, align 16
@@ -519,10 +619,13 @@ define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_22
-; CHECK: vmovdqu64{{.*{%k[1-7]} }}
-; CHECK: ret
define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
%r = load <2 x i64>, <2 x i64>* %vaddr, align 1
@@ -530,10 +633,13 @@ define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_23
-; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
%r = load <2 x i64>, <2 x i64>* %vaddr, align 16
@@ -541,10 +647,13 @@ define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_24
-; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
%r = load <2 x i64>, <2 x i64>* %vaddr, align 1
@@ -552,10 +661,13 @@ define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_25
-; CHECK: vmovaps{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
%r = load <4 x float>, <4 x float>* %vaddr, align 16
@@ -563,10 +675,13 @@ define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_26
-; CHECK: vmovups{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
%r = load <4 x float>, <4 x float>* %vaddr, align 1
@@ -574,10 +689,13 @@ define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_27
-; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
%r = load <4 x float>, <4 x float>* %vaddr, align 16
@@ -585,10 +703,13 @@ define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_28
-; CHECK: vmovups{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_28:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
%r = load <4 x float>, <4 x float>* %vaddr, align 1
@@ -596,10 +717,13 @@ define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_29
-; CHECK: vmovapd{{.*{%k[1-7]} }}
-; CHECK: ret
define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_29:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
%r = load <2 x double>, <2 x double>* %vaddr, align 16
@@ -607,10 +731,13 @@ define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_30
-; CHECK: vmovupd{{.*{%k[1-7]} }}
-; CHECK: ret
define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_30:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
%r = load <2 x double>, <2 x double>* %vaddr, align 1
@@ -618,10 +745,13 @@ define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_31
-; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_31:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
%r = load <2 x double>, <2 x double>* %vaddr, align 16
@@ -629,10 +759,13 @@ define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_32
-; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
%r = load <2 x double>, <2 x double>* %vaddr, align 1
diff --git a/test/CodeGen/X86/avx512vl-vbroadcast.ll b/test/CodeGen/X86/avx512vl-vbroadcast.ll
new file mode 100644
index 000000000000..dec6239fafc6
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+avx512vl| FileCheck %s
+
+declare void @func_f32(float)
+define <8 x float> @_256_broadcast_ss_spill(float %x) {
+; CHECK-LABEL: _256_broadcast_ss_spill:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: callq func_f32
+; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %ymm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %a = fadd float %x, %x
+ call void @func_f32(float %a)
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %c
+}
+
+define <4 x float> @_128_broadcast_ss_spill(float %x) {
+; CHECK-LABEL: _128_broadcast_ss_spill:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: callq func_f32
+; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %a = fadd float %x, %x
+ call void @func_f32(float %a)
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %c
+}
+
+declare void @func_f64(double)
+define <4 x double> @_256_broadcast_sd_spill(double %x) {
+; CHECK-LABEL: _256_broadcast_sd_spill:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT: callq func_f64
+; CHECK-NEXT: vbroadcastsd (%rsp), %ymm0 # 8-byte Folded Reload
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %a = fadd double %x, %x
+ call void @func_f64(double %a)
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %c
+}
+
+define <8 x float> @_inreg8xfloat(float %a) {
+; CHECK-LABEL: _inreg8xfloat:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %c
+}
+
+define <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1) {
+; CHECK-LABEL: _ss8xfloat_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %ymm3, %ymm3, %ymm3
+; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ %r = select <8 x i1> %mask, <8 x float> %c, <8 x float> %i
+ ret <8 x float> %r
+}
+
+define <8 x float> @_ss8xfloat_maskz(float %a, <8 x i32> %mask1) {
+; CHECK-LABEL: _ss8xfloat_maskz:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ %r = select <8 x i1> %mask, <8 x float> %c, <8 x float> zeroinitializer
+ ret <8 x float> %r
+}
+
+define <4 x float> @_inreg4xfloat(float %a) {
+; CHECK-LABEL: _inreg4xfloat:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %c
+}
+
+define <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1) {
+; CHECK-LABEL: _ss4xfloat_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ %r = select <4 x i1> %mask, <4 x float> %c, <4 x float> %i
+ ret <4 x float> %r
+}
+
+define <4 x float> @_ss4xfloat_maskz(float %a, <4 x i32> %mask1) {
+; CHECK-LABEL: _ss4xfloat_maskz:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ %r = select <4 x i1> %mask, <4 x float> %c, <4 x float> zeroinitializer
+ ret <4 x float> %r
+}
+
+define <4 x double> @_inreg4xdouble(double %a) {
+; CHECK-LABEL: _inreg4xdouble:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %c
+}
+
+define <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %mask1) {
+; CHECK-LABEL: _ss4xdouble_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ %r = select <4 x i1> %mask, <4 x double> %c, <4 x double> %i
+ ret <4 x double> %r
+}
+
+define <4 x double> @_ss4xdouble_maskz(double %a, <4 x i32> %mask1) {
+; CHECK-LABEL: _ss4xdouble_maskz:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ %r = select <4 x i1> %mask, <4 x double> %c, <4 x double> zeroinitializer
+ ret <4 x double> %r
+}
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
index aed8cb1cf559..62c8a26d1e60 100644
--- a/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -1,94 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; CHECK-LABEL: test256_1
-; CHECK: vpcmpeqq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
+; CHECK-LABEL: test256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
ret <4 x i64> %max
}
-; CHECK-LABEL: test256_2
-; CHECK: vpcmpgtq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
+; CHECK-LABEL: test256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
ret <4 x i64> %max
}
-; CHECK-LABEL: @test256_3
-; CHECK: vpcmpled {{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind {
+; CHECK-LABEL: test256_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1
+; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
ret <8 x i32> %max
}
-; CHECK-LABEL: test256_4
-; CHECK: vpcmpnleuq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
+; CHECK-LABEL: test256_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
ret <4 x i64> %max
}
-; CHECK-LABEL: test256_5
-; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
+; CHECK-LABEL: test256_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_6
-; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sgt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_7
-; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sle <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_8
-; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp ule <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_9
-; CHECK: vpcmpeqd %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+; CHECK-LABEL: test256_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i32> %x1, %y1
%mask0 = icmp eq <8 x i32> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -96,11 +107,13 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32>
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_10
-; CHECK: vpcmpleq %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+; CHECK-LABEL: test256_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%mask0 = icmp sle <4 x i64> %x, %y
%mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
@@ -108,11 +121,13 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64
ret <4 x i64> %max
}
-; CHECK-LABEL: @test256_11
-; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+; CHECK-LABEL: test256_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <4 x i64> %x1, %y1
%y = load <4 x i64>, <4 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <4 x i64> %x, %y
@@ -121,11 +136,13 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4
ret <4 x i64> %max
}
-; CHECK-LABEL: @test256_12
-; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+; CHECK-LABEL: test256_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
+; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask0 = icmp ule <8 x i32> %x, %y
@@ -134,11 +151,12 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8
ret <8 x i32> %max
}
-; CHECK-LABEL: test256_13
-; CHECK: vpcmpeqq (%rdi){1to4}, %ymm
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind {
+; CHECK-LABEL: test256_13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
%y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -147,11 +165,12 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind
ret <4 x i64> %max
}
-; CHECK-LABEL: test256_14
-; CHECK: vpcmpled (%rdi){1to8}, %ymm
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind {
+; CHECK-LABEL: test256_14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
%y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -160,11 +179,13 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind
ret <8 x i32> %max
}
-; CHECK-LABEL: test256_15
-; CHECK: vpcmpgtd (%rdi){1to8}, %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+; CHECK-LABEL: test256_15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
+; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
@@ -175,11 +196,13 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32
ret <8 x i32> %max
}
-; CHECK-LABEL: test256_16
-; CHECK: vpcmpgtq (%rdi){1to4}, %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+; CHECK-LABEL: test256_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
@@ -190,95 +213,105 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64
ret <4 x i64> %max
}
-; CHECK-LABEL: test128_1
-; CHECK: vpcmpeqq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
+; CHECK-LABEL: test128_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
ret <2 x i64> %max
}
-; CHECK-LABEL: test128_2
-; CHECK: vpcmpgtq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
+; CHECK-LABEL: test128_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
ret <2 x i64> %max
}
-; CHECK-LABEL: @test128_3
-; CHECK: vpcmpled {{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind {
+; CHECK-LABEL: test128_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1
+; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
ret <4 x i32> %max
}
-; CHECK-LABEL: test128_4
-; CHECK: vpcmpnleuq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
+; CHECK-LABEL: test128_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
ret <2 x i64> %max
}
-; CHECK-LABEL: test128_5
-; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
+; CHECK-LABEL: test128_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_6
-; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sgt <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_7
-; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sle <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_8
-; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ule <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_9
-; CHECK: vpcmpeqd %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+; CHECK-LABEL: test128_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <4 x i32> %x1, %y1
%mask0 = icmp eq <4 x i32> %x, %y
%mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
@@ -286,11 +319,13 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32>
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_10
-; CHECK: vpcmpleq %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+; CHECK-LABEL: test128_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%mask0 = icmp sle <2 x i64> %x, %y
%mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
@@ -298,11 +333,13 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64
ret <2 x i64> %max
}
-; CHECK-LABEL: @test128_11
-; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+; CHECK-LABEL: test128_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <2 x i64> %x1, %y1
%y = load <2 x i64>, <2 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <2 x i64> %x, %y
@@ -311,11 +348,13 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2
ret <2 x i64> %max
}
-; CHECK-LABEL: @test128_12
-; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+; CHECK-LABEL: test128_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
+; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask0 = icmp ule <4 x i32> %x, %y
@@ -324,11 +363,12 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4
ret <4 x i32> %max
}
-; CHECK-LABEL: test128_13
-; CHECK: vpcmpeqq (%rdi){1to2}, %xmm
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind {
+; CHECK-LABEL: test128_13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
%y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
@@ -337,11 +377,12 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind
ret <2 x i64> %max
}
-; CHECK-LABEL: test128_14
-; CHECK: vpcmpled (%rdi){1to4}, %xmm
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind {
+; CHECK-LABEL: test128_14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
%y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -350,11 +391,13 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind
ret <4 x i32> %max
}
-; CHECK-LABEL: test128_15
-; CHECK: vpcmpgtd (%rdi){1to4}, %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+; CHECK-LABEL: test128_15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
+; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
@@ -365,11 +408,13 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32
ret <4 x i32> %max
}
-; CHECK-LABEL: test128_16
-; CHECK: vpcmpgtq (%rdi){1to2}, %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+; CHECK-LABEL: test128_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
diff --git a/test/CodeGen/X86/base-pointer-and-cmpxchg.ll b/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
new file mode 100644
index 000000000000..8de6d64428e3
--- /dev/null
+++ b/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+cx16 -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE --check-prefix=USE_BASE_64 %s
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+cx16 -x86-use-base-pointer=false -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=DONT_USE_BASE %s
+; RUN: llc -mtriple=x86_64-linux-gnux32 -mattr=+cx16 -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE --check-prefix=USE_BASE_32 %s
+; RUN: llc -mtriple=x86_64-linux-gnux32 -mattr=+cx16 -x86-use-base-pointer=false -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=DONT_USE_BASE %s
+
+; This function uses a dynamically allocated stack to force the use
+; of a frame pointer.
+; The inline asm clobbers a bunch of registers to make sure
+; the frame pointer will need to be used (for spilling in that case).
+;
+; Then, we check that when we use rbx as the base pointer,
+; we do not use cmpxchg, since using that instruction requires
+; clobbering rbx to set up its arguments, and when rbx is used as
+; the base pointer, RA cannot fix the code for us.
+;
+; CHECK-LABEL: cmp_and_swap16:
+; Check that we actually use rbx.
+; gnux32 uses the 32-bit variants of the registers.
+; USE_BASE_64: movq %rsp, %rbx
+; USE_BASE_32: movl %esp, %ebx
+;
+; Make sure the base pointer is saved before the RBX argument for
+; cmpxchg16b is set.
+;
+; Because of how the test is written, we spill SAVE_RBX.
+; However, it would have been perfectly fine to just keep it in a register.
+; USE_BASE: movq %rbx, [[SAVE_RBX_SLOT:[0-9]*\(%[er]bx\)]]
+;
+; SAVE_RBX must be in a register before we clobber rbx.
+; It is fine to use any register except rbx and the ones defined and used
+; by cmpxchg. Since such a regex would be complicated to write, just stick
+; to the numbered registers. The bottom line is: if this test case fails
+; because of that regex, it is likely just the regex being too conservative.
+; USE_BASE: movq [[SAVE_RBX_SLOT]], [[SAVE_RBX:%r[0-9]+]]
+;
+; USE_BASE: movq {{[^ ]+}}, %rbx
+; USE_BASE-NEXT: cmpxchg16b
+; USE_BASE-NEXT: movq [[SAVE_RBX]], %rbx
+;
+; DONT_USE_BASE-NOT: movq %rsp, %rbx
+; DONT_USE_BASE-NOT: movl %esp, %ebx
+; DONT_USE_BASE: cmpxchg
+define i1 @cmp_and_swap16(i128 %a, i128 %b, i128* %addr, i32 %n) {
+ %dummy = alloca i32, i32 %n
+ tail call void asm sideeffect "nop", "~{rax},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %cmp = cmpxchg i128* %addr, i128 %a, i128 %b seq_cst seq_cst
+ %res = extractvalue { i128, i1 } %cmp, 1
+ %idx = getelementptr i32, i32* %dummy, i32 5
+ store i32 %n, i32* %idx
+ ret i1 %res
+}
diff --git a/test/CodeGen/X86/bit-piece-comment.ll b/test/CodeGen/X86/bit-piece-comment.ll
index 6ce858b11dcf..9ebe5bc6d5af 100644
--- a/test/CodeGen/X86/bit-piece-comment.ll
+++ b/test/CodeGen/X86/bit-piece-comment.ll
@@ -39,11 +39,10 @@ attributes #1 = { nounwind readnone }
!llvm.module.flags = !{!16, !17}
!llvm.ident = !{!18}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 256088) (llvm/trunk 256097)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 256088) (llvm/trunk 256097)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "test.cpp", directory: "/mnt/extra")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "fn1", linkageName: "_Z3fn1v", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, variables: !7)
+!4 = distinct !DISubprogram(name: "fn1", linkageName: "_Z3fn1v", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !7)
!5 = !DISubroutineType(types: !6)
!6 = !{null}
!7 = !{!8}
diff --git a/test/CodeGen/X86/bitreverse.ll b/test/CodeGen/X86/bitreverse.ll
index e3bc8ace38ab..f1b325a03ebd 100644
--- a/test/CodeGen/X86/bitreverse.ll
+++ b/test/CodeGen/X86/bitreverse.ll
@@ -1,22 +1,390 @@
-; RUN: llc -march=x86 %s -o - | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
; These tests just check that the plumbing is in place for @llvm.bitreverse. The
; actual output is massive at the moment as llvm.bitreverse is not yet legal.
declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
-define <2 x i16> @f(<2 x i16> %a) {
-; CHECK-LABEL: f:
-; CHECK: shll
+define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v2i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shll $15, %ecx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: andl $2, %esi
+; CHECK-NEXT: shll $13, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $4, %ecx
+; CHECK-NEXT: shll $11, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: andl $8, %esi
+; CHECK-NEXT: shll $9, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: andl $16, %edi
+; CHECK-NEXT: shll $7, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $32, %ecx
+; CHECK-NEXT: shll $5, %ecx
+; CHECK-NEXT: orl %edi, %ecx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: andl $64, %esi
+; CHECK-NEXT: shll $3, %esi
+; CHECK-NEXT: leal (%eax,%eax), %edi
+; CHECK-NEXT: andl $256, %edi # imm = 0x100
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl %esi
+; CHECK-NEXT: andl $128, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: shrl $3, %edi
+; CHECK-NEXT: andl $64, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $5, %esi
+; CHECK-NEXT: andl $32, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: shrl $7, %edi
+; CHECK-NEXT: andl $16, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $9, %esi
+; CHECK-NEXT: andl $8, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: shrl $11, %edi
+; CHECK-NEXT: andl $4, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $13, %esi
+; CHECK-NEXT: andl $2, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: shrl $15, %eax
+; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shll $15, %ecx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: andl $2, %esi
+; CHECK-NEXT: shll $13, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: andl $4, %ecx
+; CHECK-NEXT: shll $11, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: andl $8, %esi
+; CHECK-NEXT: shll $9, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: andl $16, %edi
+; CHECK-NEXT: shll $7, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: andl $32, %ecx
+; CHECK-NEXT: shll $5, %ecx
+; CHECK-NEXT: orl %edi, %ecx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: andl $64, %esi
+; CHECK-NEXT: shll $3, %esi
+; CHECK-NEXT: leal (%edx,%edx), %edi
+; CHECK-NEXT: andl $256, %edi # imm = 0x100
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl %esi
+; CHECK-NEXT: andl $128, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shrl $3, %edi
+; CHECK-NEXT: andl $64, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl $5, %esi
+; CHECK-NEXT: andl $32, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shrl $7, %edi
+; CHECK-NEXT: andl $16, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl $9, %esi
+; CHECK-NEXT: andl $8, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shrl $11, %edi
+; CHECK-NEXT: andl $4, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl $13, %esi
+; CHECK-NEXT: andl $2, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: shrl $15, %edx
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: retl
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
ret <2 x i16> %b
}
+declare i24 @llvm.bitreverse.i24(i24) readnone
+
+define i24 @test_bitreverse_i24(i24 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_i24:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shll $31, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $2, %edx
+; CHECK-NEXT: shll $29, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $4, %ecx
+; CHECK-NEXT: shll $27, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $8, %edx
+; CHECK-NEXT: shll $25, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: andl $16, %esi
+; CHECK-NEXT: shll $23, %esi
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $32, %ecx
+; CHECK-NEXT: shll $21, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $64, %edx
+; CHECK-NEXT: shll $19, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $17, %esi
+; CHECK-NEXT: andl $16777216, %esi # imm = 0x1000000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $15, %edx
+; CHECK-NEXT: andl $8388608, %edx # imm = 0x800000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $13, %esi
+; CHECK-NEXT: andl $4194304, %esi # imm = 0x400000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $11, %edx
+; CHECK-NEXT: andl $2097152, %edx # imm = 0x200000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $9, %esi
+; CHECK-NEXT: andl $1048576, %esi # imm = 0x100000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $7, %edx
+; CHECK-NEXT: andl $524288, %edx # imm = 0x80000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $5, %esi
+; CHECK-NEXT: andl $262144, %esi # imm = 0x40000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: leal (,%eax,8), %edx
+; CHECK-NEXT: andl $131072, %edx # imm = 0x20000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: leal (%eax,%eax), %esi
+; CHECK-NEXT: andl $65536, %esi # imm = 0x10000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl %edx
+; CHECK-NEXT: andl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $3, %esi
+; CHECK-NEXT: andl $16384, %esi # imm = 0x4000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $5, %edx
+; CHECK-NEXT: andl $8192, %edx # imm = 0x2000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $7, %esi
+; CHECK-NEXT: andl $4096, %esi # imm = 0x1000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $9, %edx
+; CHECK-NEXT: andl $2048, %edx # imm = 0x800
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $11, %esi
+; CHECK-NEXT: andl $1024, %esi # imm = 0x400
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $13, %edx
+; CHECK-NEXT: andl $512, %edx # imm = 0x200
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: shrl $15, %eax
+; CHECK-NEXT: andl $256, %eax # imm = 0x100
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: shrl $8, %eax
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: retl
+ %b = call i24 @llvm.bitreverse.i24(i24 %a)
+ ret i24 %b
+}
+
declare i8 @llvm.bitreverse.i8(i8) readnone
-define i8 @g(i8 %a) {
-; CHECK-LABEL: g:
-; CHECK: shlb
+define i8 @test_bitreverse_i8(i8 %a) {
+; CHECK-LABEL: test_bitreverse_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shlb $7, %cl
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shlb $5, %dl
+; CHECK-NEXT: andb $64, %dl
+; CHECK-NEXT: movb %al, %ah
+; CHECK-NEXT: shlb $3, %ah
+; CHECK-NEXT: andb $32, %ah
+; CHECK-NEXT: orb %dl, %ah
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: addb %dl, %dl
+; CHECK-NEXT: andb $16, %dl
+; CHECK-NEXT: orb %ah, %dl
+; CHECK-NEXT: movb %al, %ah
+; CHECK-NEXT: shrb %ah
+; CHECK-NEXT: andb $8, %ah
+; CHECK-NEXT: orb %dl, %ah
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrb $3, %dl
+; CHECK-NEXT: andb $4, %dl
+; CHECK-NEXT: orb %ah, %dl
+; CHECK-NEXT: movb %al, %ah
+; CHECK-NEXT: shrb $5, %ah
+; CHECK-NEXT: andb $2, %ah
+; CHECK-NEXT: orb %dl, %ah
+; CHECK-NEXT: shrb $7, %al
+; CHECK-NEXT: orb %ah, %al
+; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: retl
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
}
+
+declare i4 @llvm.bitreverse.i4(i4) readnone
+
+define i4 @test_bitreverse_i4(i4 %a) {
+; CHECK-LABEL: test_bitreverse_i4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shlb $7, %cl
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shlb $5, %dl
+; CHECK-NEXT: andb $64, %dl
+; CHECK-NEXT: movb %al, %ah
+; CHECK-NEXT: shlb $3, %ah
+; CHECK-NEXT: andb $32, %ah
+; CHECK-NEXT: orb %dl, %ah
+; CHECK-NEXT: addb %al, %al
+; CHECK-NEXT: andb $16, %al
+; CHECK-NEXT: orb %ah, %al
+; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: shrb $4, %al
+; CHECK-NEXT: retl
+ %b = call i4 @llvm.bitreverse.i4(i4 %a)
+ ret i4 %b
+}
+
+; These tests check that bitreverse(constant) calls are folded
+
+define <2 x i16> @fold_v2i16() {
+; CHECK-LABEL: fold_v2i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movw $-4096, %ax # imm = 0xF000
+; CHECK-NEXT: movw $240, %dx
+; CHECK-NEXT: retl
+ %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
+ ret <2 x i16> %b
+}
+
+define i24 @fold_i24() {
+; CHECK-LABEL: fold_i24:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $2048, %eax # imm = 0x800
+; CHECK-NEXT: retl
+ %b = call i24 @llvm.bitreverse.i24(i24 4096)
+ ret i24 %b
+}
+
+define i8 @fold_i8() {
+; CHECK-LABEL: fold_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb $-16, %al
+; CHECK-NEXT: retl
+ %b = call i8 @llvm.bitreverse.i8(i8 15)
+ ret i8 %b
+}
+
+define i4 @fold_i4() {
+; CHECK-LABEL: fold_i4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: retl
+ %b = call i4 @llvm.bitreverse.i4(i4 8)
+ ret i4 %b
+}
+
+; These tests check that bitreverse(bitreverse()) calls are removed
+
+define i8 @identity_i8(i8 %a) {
+; CHECK-LABEL: identity_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT: retl
+ %b = call i8 @llvm.bitreverse.i8(i8 %a)
+ %c = call i8 @llvm.bitreverse.i8(i8 %b)
+ ret i8 %c
+}
+
+define <2 x i16> @identity_v2i16(<2 x i16> %a) {
+; CHECK-LABEL: identity_v2i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: retl
+ %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
+ %c = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %b)
+ ret <2 x i16> %c
+}
+
+; These tests check that bitreverse(undef) calls are removed
+
+define i8 @undef_i8() {
+; CHECK-LABEL: undef_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: retl
+ %b = call i8 @llvm.bitreverse.i8(i8 undef)
+ ret i8 %b
+}
+
+define <2 x i16> @undef_v2i16() {
+; CHECK-LABEL: undef_v2i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: retl
+ %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef)
+ ret <2 x i16> %b
+}
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index 89defa956a45..0e790864db49 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -7,15 +7,15 @@ define i32 @test_ifchains(i32 %i, i32* %a, i32 %b) {
; that is not expected to run.
; CHECK-LABEL: test_ifchains:
; CHECK: %entry
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %else1
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %else2
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %else3
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %else4
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %exit
; CHECK: %then1
; CHECK: %then2
@@ -81,11 +81,11 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
; Check that we sink cold loop blocks after the hot loop body.
; CHECK-LABEL: test_loop_cold_blocks:
; CHECK: %entry
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %unlikely1
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %unlikely2
-; CHECK: .align
+; CHECK: .p2align
; CHECK: %body1
; CHECK: %body2
; CHECK: %body3
@@ -242,7 +242,7 @@ define i32 @test_loop_align(i32 %i, i32* %a) {
; pass.
; CHECK-LABEL: test_loop_align:
; CHECK: %entry
-; CHECK: .align [[ALIGN:[0-9]+]],
+; CHECK: .p2align [[ALIGN:[0-9]+]],
; CHECK-NEXT: %body
; CHECK: %exit
@@ -267,11 +267,11 @@ define i32 @test_nested_loop_align(i32 %i, i32* %a, i32* %b) {
; Check that we provide nested loop body alignment.
; CHECK-LABEL: test_nested_loop_align:
; CHECK: %entry
-; CHECK: .align [[ALIGN]],
+; CHECK: .p2align [[ALIGN]],
; CHECK-NEXT: %loop.body.1
-; CHECK: .align [[ALIGN]],
+; CHECK: .p2align [[ALIGN]],
; CHECK-NEXT: %inner.loop.body
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %exit
entry:
@@ -463,26 +463,24 @@ exit:
}
define void @fpcmp_unanalyzable_branch(i1 %cond) {
-; This function's CFG contains an unanalyzable branch that is likely to be
-; split due to having a different high-probability predecessor.
-; CHECK: fpcmp_unanalyzable_branch
-; CHECK: %entry
-; CHECK: %exit
-; CHECK-NOT: %if.then
-; CHECK-NOT: %if.end
-; CHECK-NOT: jne
-; CHECK-NOT: jnp
-; CHECK: jne
-; CHECK-NEXT: jnp
-; CHECK-NEXT: %if.then
+; This function's CFG contains a branch (une on floating point) that used to be
+; unanalyzable. Now that it is analyzable, we should get the best layout, in
+; which each edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' ->
+; 'if.end' is a fall-through.
+; CHECK-LABEL: fpcmp_unanalyzable_branch:
+; CHECK: # BB#0: # %entry
+; CHECK: # BB#1: # %entry.if.then_crit_edge
+; CHECK: .LBB10_4: # %if.then
+; CHECK: .LBB10_5: # %if.end
+; CHECK: # BB#3: # %exit
+; CHECK: jne .LBB10_4
+; CHECK-NEXT: jnp .LBB10_5
+; CHECK-NEXT: jmp .LBB10_4
entry:
; Note that this branch must be strongly biased toward
; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for
-; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that
-; chain which would violate the unanalyzable branch in 'exit', but we won't even
-; try this trick unless 'if.then' is believed to almost always be reached from
-; 'entry.if.then_crit_edge'.
+; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end'.
br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1
entry.if.then_crit_edge:
@@ -494,7 +492,7 @@ lor.lhs.false:
exit:
%cmp.i = fcmp une double 0.000000e+00, undef
- br i1 %cmp.i, label %if.then, label %if.end
+ br i1 %cmp.i, label %if.then, label %if.end, !prof !3
if.then:
%0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ]
@@ -507,6 +505,7 @@ if.end:
}
!1 = !{!"branch_weights", i32 1000, i32 1}
+!3 = !{!"branch_weights", i32 1, i32 1000}
declare i32 @f()
declare i32 @g()
@@ -604,10 +603,8 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
;
; CHECK: test_unnatural_cfg_backwards_inner_loop
; CHECK: %entry
-; CHECK: [[BODY:# BB#[0-9]+]]:
; CHECK: %loop2b
; CHECK: %loop1
-; CHECK: %loop2a
entry:
br i1 undef, label %loop2a, label %body
@@ -665,11 +662,14 @@ define void @unanalyzable_branch_to_best_succ(i1 %cond) {
; Ensure that we can handle unanalyzable branches where the destination block
; gets selected as the optimal successor to merge.
;
+; This branch is now analyzable and hence the destination block becomes the
+; hotter one. The right order is entry->bar->exit->foo.
+;
; CHECK: unanalyzable_branch_to_best_succ
; CHECK: %entry
-; CHECK: %foo
; CHECK: %bar
; CHECK: %exit
+; CHECK: %foo
entry:
; Bias this branch toward bar to ensure we form that chain.
@@ -943,18 +943,18 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
; CHECK: @benchmark_heapsort
; CHECK: %entry
; First rotated loop top.
-; CHECK: .align
+; CHECK: .p2align
; CHECK: %while.end
; CHECK: %for.cond
; CHECK: %if.then
; CHECK: %if.else
; CHECK: %if.end10
; Second rotated loop top
-; CHECK: .align
+; CHECK: .p2align
; CHECK: %if.then24
; CHECK: %while.cond.outer
; Third rotated loop top
-; CHECK: .align
+; CHECK: .p2align
; CHECK: %while.cond
; CHECK: %while.body
; CHECK: %land.lhs.true
@@ -1083,3 +1083,206 @@ exit:
%ret = phi i32 [ %val1, %then ], [ %val2, %else ]
ret i32 %ret
}
+
+; Make sure we put landingpads out of the way.
+declare i32 @pers(...)
+
+declare i32 @foo();
+
+declare i32 @bar();
+
+define i32 @test_lp(i32 %a) personality i32 (...)* @pers {
+; CHECK-LABEL: test_lp:
+; CHECK: %entry
+; CHECK: %hot
+; CHECK: %then
+; CHECK: %cold
+; CHECK: %coldlp
+; CHECK: %hotlp
+; CHECK: %lpret
+entry:
+ %0 = icmp sgt i32 %a, 1
+ br i1 %0, label %hot, label %cold, !prof !4
+
+hot:
+ %1 = invoke i32 @foo()
+ to label %then unwind label %hotlp
+
+cold:
+ %2 = invoke i32 @bar()
+ to label %then unwind label %coldlp
+
+then:
+ %3 = phi i32 [ %1, %hot ], [ %2, %cold ]
+ ret i32 %3
+
+hotlp:
+ %4 = landingpad { i8*, i32 }
+ cleanup
+ br label %lpret
+
+coldlp:
+ %5 = landingpad { i8*, i32 }
+ cleanup
+ br label %lpret
+
+lpret:
+ %6 = phi i32 [-1, %hotlp], [-2, %coldlp]
+ %7 = add i32 %6, 42
+ ret i32 %7
+}
+
+!4 = !{!"branch_weights", i32 65536, i32 0}
+
+; Make sure that EH pads are scheduled from the least probable one
+; to the most probable one. See selectBestCandidateBlock as to why.
+declare void @clean();
+
+define void @test_flow_unwind() personality i32 (...)* @pers {
+; CHECK-LABEL: test_flow_unwind:
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %exit
+; CHECK: %innerlp
+; CHECK: %outerlp
+; CHECK: %outercleanup
+entry:
+ %0 = invoke i32 @foo()
+ to label %then unwind label %outerlp
+
+then:
+ %1 = invoke i32 @bar()
+ to label %exit unwind label %innerlp
+
+exit:
+ ret void
+
+innerlp:
+ %2 = landingpad { i8*, i32 }
+ cleanup
+ br label %innercleanup
+
+outerlp:
+ %3 = landingpad { i8*, i32 }
+ cleanup
+ br label %outercleanup
+
+outercleanup:
+ %4 = phi { i8*, i32 } [%2, %innercleanup], [%3, %outerlp]
+ call void @clean()
+ resume { i8*, i32 } %4
+
+innercleanup:
+ call void @clean()
+ br label %outercleanup
+}
+
+declare void @hot_function()
+
+define void @test_hot_branch(i32* %a) {
+; Test that a hot branch that has a probability a little larger than 80% will
+; break CFG constraints when doing block placement.
+; CHECK-LABEL: test_hot_branch:
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %exit
+; CHECK: %else
+
+entry:
+ %gep1 = getelementptr i32, i32* %a, i32 1
+ %val1 = load i32, i32* %gep1
+ %cond1 = icmp ugt i32 %val1, 1
+ br i1 %cond1, label %then, label %else, !prof !5
+
+then:
+ call void @hot_function()
+ br label %exit
+
+else:
+ call void @cold_function()
+ br label %exit
+
+exit:
+ call void @hot_function()
+ ret void
+}
+
+define void @test_hot_branch_profile(i32* %a) !prof !6 {
+; Test that a hot branch that has a probability a little larger than 50% will
+; break CFG constraints when doing block placement when a profile is available.
+; CHECK-LABEL: test_hot_branch_profile:
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %exit
+; CHECK: %else
+
+entry:
+ %gep1 = getelementptr i32, i32* %a, i32 1
+ %val1 = load i32, i32* %gep1
+ %cond1 = icmp ugt i32 %val1, 1
+ br i1 %cond1, label %then, label %else, !prof !7
+
+then:
+ call void @hot_function()
+ br label %exit
+
+else:
+ call void @cold_function()
+ br label %exit
+
+exit:
+ call void @hot_function()
+ ret void
+}
+
+define void @test_hot_branch_triangle_profile(i32* %a) !prof !6 {
+; Test that a hot branch that has a probability a little larger than 80% will
+; break triangle-shaped CFG constraints when doing block placement if a
+; profile is present.
+; CHECK-LABEL: test_hot_branch_triangle_profile:
+; CHECK: %entry
+; CHECK: %exit
+; CHECK: %then
+
+entry:
+ %gep1 = getelementptr i32, i32* %a, i32 1
+ %val1 = load i32, i32* %gep1
+ %cond1 = icmp ugt i32 %val1, 1
+ br i1 %cond1, label %exit, label %then, !prof !5
+
+then:
+ call void @hot_function()
+ br label %exit
+
+exit:
+ call void @hot_function()
+ ret void
+}
+
+define void @test_hot_branch_triangle_profile_topology(i32* %a) !prof !6 {
+; Test that a hot branch that has a probability between 50% and 66% will not
+; break triangle-shaped CFG constraints when doing block placement if a
+; profile is present.
+; CHECK-LABEL: test_hot_branch_triangle_profile_topology:
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %exit
+
+entry:
+ %gep1 = getelementptr i32, i32* %a, i32 1
+ %val1 = load i32, i32* %gep1
+ %cond1 = icmp ugt i32 %val1, 1
+ br i1 %cond1, label %exit, label %then, !prof !7
+
+then:
+ call void @hot_function()
+ br label %exit
+
+exit:
+ call void @hot_function()
+ ret void
+}
+
+!5 = !{!"branch_weights", i32 84, i32 16}
+!6 = !{!"function_entry_count", i32 10}
+!7 = !{!"branch_weights", i32 60, i32 40}
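+
+; !5 corresponds to a taken probability of roughly 84% and !7 to 60%, matching
+; the thresholds referenced in the test comments above.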
diff --git a/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..8b15a1591b67
--- /dev/null
+++ b/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/bmi-builtins.c
+
+;
+; AMD Intrinsics
+;
+
+define i64 @test__andn_u64(i64 %a0, i64 %a1) {
+; X64-LABEL: test__andn_u64:
+; X64: # BB#0:
+; X64-NEXT: xorq $-1, %rdi
+; X64-NEXT: andq %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %xor = xor i64 %a0, -1
+ %res = and i64 %xor, %a1
+ ret i64 %res
+}
+
+define i64 @test__bextr_u64(i64 %a0, i64 %a1) {
+; X64-LABEL: test__bextr_u64:
+; X64: # BB#0:
+; X64-NEXT: bextrq %rsi, %rdi, %rax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %a1)
+ ret i64 %res
+}
+
+define i64 @test__blsi_u64(i64 %a0) {
+; X64-LABEL: test__blsi_u64:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %neg = sub i64 0, %a0
+ %res = and i64 %a0, %neg
+ ret i64 %res
+}
+
+define i64 @test__blsmsk_u64(i64 %a0) {
+; X64-LABEL: test__blsmsk_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: xorq %rdi, %rax
+; X64-NEXT: retq
+ %dec = sub i64 %a0, 1
+ %res = xor i64 %a0, %dec
+ ret i64 %res
+}
+
+define i64 @test__blsr_u64(i64 %a0) {
+; X64-LABEL: test__blsr_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %dec = sub i64 %a0, 1
+ %res = and i64 %a0, %dec
+ ret i64 %res
+}
+
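+; tzcnt sets the carry flag when its source is zero, so the zero case in the
+; tests below is selected with a cmovb of the bit width (64) rather than an
+; explicit compare against zero.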
+define i64 @test__tzcnt_u64(i64 %a0) {
+; X64-LABEL: test__tzcnt_u64:
+; X64: # BB#0:
+; X64-NEXT: movl $64, %ecx
+; X64-NEXT: tzcntq %rdi, %rax
+; X64-NEXT: cmovbq %rcx, %rax
+; X64-NEXT: retq
+ %cmp = icmp ne i64 %a0, 0
+ %cttz = call i64 @llvm.cttz.i64(i64 %a0, i1 true)
+ %res = select i1 %cmp, i64 %cttz, i64 64
+ ret i64 %res
+}
+
+;
+; Intel intrinsics
+;
+
+define i64 @test_andn_u64(i64 %a0, i64 %a1) {
+; X64-LABEL: test_andn_u64:
+; X64: # BB#0:
+; X64-NEXT: xorq $-1, %rdi
+; X64-NEXT: andq %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %xor = xor i64 %a0, -1
+ %res = and i64 %xor, %a1
+ ret i64 %res
+}
+
+define i64 @test_bextr_u64(i64 %a0, i32 %a1, i32 %a2) {
+; X64-LABEL: test_bextr_u64:
+; X64: # BB#0:
+; X64-NEXT: andl $255, %esi
+; X64-NEXT: andl $255, %edx
+; X64-NEXT: shll $8, %edx
+; X64-NEXT: orl %esi, %edx
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: bextrq %rax, %rdi, %rax
+; X64-NEXT: retq
+ %and1 = and i32 %a1, 255
+ %and2 = and i32 %a2, 255
+ %shl = shl i32 %and2, 8
+ %or = or i32 %and1, %shl
+ %zext = zext i32 %or to i64
+ %res = call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %zext)
+ ret i64 %res
+}
+
+define i64 @test_blsi_u64(i64 %a0) {
+; X64-LABEL: test_blsi_u64:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %neg = sub i64 0, %a0
+ %res = and i64 %a0, %neg
+ ret i64 %res
+}
+
+define i64 @test_blsmsk_u64(i64 %a0) {
+; X64-LABEL: test_blsmsk_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: xorq %rdi, %rax
+; X64-NEXT: retq
+ %dec = sub i64 %a0, 1
+ %res = xor i64 %a0, %dec
+ ret i64 %res
+}
+
+define i64 @test_blsr_u64(i64 %a0) {
+; X64-LABEL: test_blsr_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %dec = sub i64 %a0, 1
+ %res = and i64 %a0, %dec
+ ret i64 %res
+}
+
+define i64 @test_tzcnt_u64(i64 %a0) {
+; X64-LABEL: test_tzcnt_u64:
+; X64: # BB#0:
+; X64-NEXT: movl $64, %ecx
+; X64-NEXT: tzcntq %rdi, %rax
+; X64-NEXT: cmovbq %rcx, %rax
+; X64-NEXT: retq
+ %cmp = icmp ne i64 %a0, 0
+ %cttz = call i64 @llvm.cttz.i64(i64 %a0, i1 true)
+ %res = select i1 %cmp, i64 %cttz, i64 64
+ ret i64 %res
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
diff --git a/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll b/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..2b889dd054fa
--- /dev/null
+++ b/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
@@ -0,0 +1,326 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/bmi-builtins.c
+
+;
+; AMD Intrinsics
+;
+
+define i16 @test__tzcnt_u16(i16 %a0) {
+; X32-LABEL: test__tzcnt_u16:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzwl %ax, %ecx
+; X32-NEXT: cmpl $0, %ecx
+; X32-NEXT: jne .LBB0_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: movw $16, %ax
+; X32-NEXT: retl
+; X32-NEXT: .LBB0_1:
+; X32-NEXT: tzcntw %ax, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__tzcnt_u16:
+; X64: # BB#0:
+; X64-NEXT: movw $16, %cx
+; X64-NEXT: movzwl %di, %edx
+; X64-NEXT: tzcntw %dx, %ax
+; X64-NEXT: cmpl $0, %edx
+; X64-NEXT: cmovew %cx, %ax
+; X64-NEXT: retq
+ %zext = zext i16 %a0 to i32
+ %cmp = icmp ne i32 %zext, 0
+ %cttz = call i16 @llvm.cttz.i16(i16 %a0, i1 true)
+ %res = select i1 %cmp, i16 %cttz, i16 16
+ ret i16 %res
+}
+
+define i32 @test__andn_u32(i32 %a0, i32 %a1) {
+; X32-LABEL: test__andn_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl $-1, %eax
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__andn_u32:
+; X64: # BB#0:
+; X64-NEXT: xorl $-1, %edi
+; X64-NEXT: andl %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %xor = xor i32 %a0, -1
+ %res = and i32 %xor, %a1
+ ret i32 %res
+}
+
+define i32 @test__bextr_u32(i32 %a0, i32 %a1) {
+; X32-LABEL: test__bextr_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__bextr_u32:
+; X64: # BB#0:
+; X64-NEXT: bextrl %esi, %edi, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %a1)
+ ret i32 %res
+}
+
+define i32 @test__blsi_u32(i32 %a0) {
+; X32-LABEL: test__blsi_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsi_u32:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subl %edi, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %neg = sub i32 0, %a0
+ %res = and i32 %a0, %neg
+ ret i32 %res
+}
+
+define i32 @test__blsmsk_u32(i32 %a0) {
+; X32-LABEL: test__blsmsk_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsmsk_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: retq
+ %dec = sub i32 %a0, 1
+ %res = xor i32 %a0, %dec
+ ret i32 %res
+}
+
+define i32 @test__blsr_u32(i32 %a0) {
+; X32-LABEL: test__blsr_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsr_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %dec = sub i32 %a0, 1
+ %res = and i32 %a0, %dec
+ ret i32 %res
+}
+
+define i32 @test__tzcnt_u32(i32 %a0) {
+; X32-LABEL: test__tzcnt_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmpl $0, %eax
+; X32-NEXT: jne .LBB6_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: retl
+; X32-NEXT: .LBB6_1:
+; X32-NEXT: tzcntl %eax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__tzcnt_u32:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %ecx
+; X64-NEXT: tzcntl %edi, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+ %cmp = icmp ne i32 %a0, 0
+ %cttz = call i32 @llvm.cttz.i32(i32 %a0, i1 true)
+ %res = select i1 %cmp, i32 %cttz, i32 32
+ ret i32 %res
+}
+
+;
+; Intel intrinsics
+;
+
+define i16 @test_tzcnt_u16(i16 %a0) {
+; X32-LABEL: test_tzcnt_u16:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzwl %ax, %ecx
+; X32-NEXT: cmpl $0, %ecx
+; X32-NEXT: jne .LBB7_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: movw $16, %ax
+; X32-NEXT: retl
+; X32-NEXT: .LBB7_1:
+; X32-NEXT: tzcntw %ax, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_tzcnt_u16:
+; X64: # BB#0:
+; X64-NEXT: movw $16, %cx
+; X64-NEXT: movzwl %di, %edx
+; X64-NEXT: tzcntw %dx, %ax
+; X64-NEXT: cmpl $0, %edx
+; X64-NEXT: cmovew %cx, %ax
+; X64-NEXT: retq
+ %zext = zext i16 %a0 to i32
+ %cmp = icmp ne i32 %zext, 0
+ %cttz = call i16 @llvm.cttz.i16(i16 %a0, i1 true)
+ %res = select i1 %cmp, i16 %cttz, i16 16
+ ret i16 %res
+}
+
+define i32 @test_andn_u32(i32 %a0, i32 %a1) {
+; X32-LABEL: test_andn_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl $-1, %eax
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_andn_u32:
+; X64: # BB#0:
+; X64-NEXT: xorl $-1, %edi
+; X64-NEXT: andl %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %xor = xor i32 %a0, -1
+ %res = and i32 %xor, %a1
+ ret i32 %res
+}
+
+define i32 @test_bextr_u32(i32 %a0, i32 %a1, i32 %a2) {
+; X32-LABEL: test_bextr_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: andl $255, %ecx
+; X32-NEXT: andl $255, %eax
+; X32-NEXT: shll $8, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_bextr_u32:
+; X64: # BB#0:
+; X64-NEXT: andl $255, %esi
+; X64-NEXT: andl $255, %edx
+; X64-NEXT: shll $8, %edx
+; X64-NEXT: orl %esi, %edx
+; X64-NEXT: bextrl %edx, %edi, %eax
+; X64-NEXT: retq
+ %and1 = and i32 %a1, 255
+ %and2 = and i32 %a2, 255
+ %shl = shl i32 %and2, 8
+ %or = or i32 %and1, %shl
+ %res = call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %or)
+ ret i32 %res
+}
+
+define i32 @test_blsi_u32(i32 %a0) {
+; X32-LABEL: test_blsi_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_blsi_u32:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subl %edi, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %neg = sub i32 0, %a0
+ %res = and i32 %a0, %neg
+ ret i32 %res
+}
+
+define i32 @test_blsmsk_u32(i32 %a0) {
+; X32-LABEL: test_blsmsk_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_blsmsk_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: retq
+ %dec = sub i32 %a0, 1
+ %res = xor i32 %a0, %dec
+ ret i32 %res
+}
+
+define i32 @test_blsr_u32(i32 %a0) {
+; X32-LABEL: test_blsr_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_blsr_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %dec = sub i32 %a0, 1
+ %res = and i32 %a0, %dec
+ ret i32 %res
+}
+
+define i32 @test_tzcnt_u32(i32 %a0) {
+; X32-LABEL: test_tzcnt_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmpl $0, %eax
+; X32-NEXT: jne .LBB13_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: retl
+; X32-NEXT: .LBB13_1:
+; X32-NEXT: tzcntl %eax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_tzcnt_u32:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %ecx
+; X64-NEXT: tzcntl %edi, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+ %cmp = icmp ne i32 %a0, 0
+ %cttz = call i32 @llvm.cttz.i32(i32 %a0, i1 true)
+ %res = select i1 %cmp, i32 %cttz, i32 32
+ ret i32 %res
+}
+
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index 8b13e960cd8f..afeba4ef2d99 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -1,218 +1,437 @@
-; RUN: llc < %s -march=x86-64 -mattr=+bmi,+bmi2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s
-declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
-declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
-declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
-declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
-define i8 @t1(i8 %x) nounwind {
+define i8 @t1(i8 %x) {
+; CHECK-LABEL: t1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $256, %eax # imm = 0x100
+; CHECK-NEXT: tzcntl %eax, %eax
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%tmp = tail call i8 @llvm.cttz.i8( i8 %x, i1 false )
ret i8 %tmp
-; CHECK-LABEL: t1:
-; CHECK: tzcntl
}
-define i16 @t2(i16 %x) nounwind {
+define i16 @t2(i16 %x) {
+; CHECK-LABEL: t2:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntw %di, %ax
+; CHECK-NEXT: retq
%tmp = tail call i16 @llvm.cttz.i16( i16 %x, i1 false )
ret i16 %tmp
-; CHECK-LABEL: t2:
-; CHECK: tzcntw
}
-define i32 @t3(i32 %x) nounwind {
+define i32 @t3(i32 %x) {
+; CHECK-LABEL: t3:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntl %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.cttz.i32( i32 %x, i1 false )
ret i32 %tmp
-; CHECK-LABEL: t3:
-; CHECK: tzcntl
}
-define i32 @tzcnt32_load(i32* %x) nounwind {
+define i32 @tzcnt32_load(i32* %x) {
+; CHECK-LABEL: tzcnt32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntl (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = tail call i32 @llvm.cttz.i32(i32 %x1, i1 false )
ret i32 %tmp
-; CHECK-LABEL: tzcnt32_load:
-; CHECK: tzcntl ({{.*}})
}
-define i64 @t4(i64 %x) nounwind {
+define i64 @t4(i64 %x) {
+; CHECK-LABEL: t4:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.cttz.i64( i64 %x, i1 false )
ret i64 %tmp
-; CHECK-LABEL: t4:
-; CHECK: tzcntq
}
-define i8 @t5(i8 %x) nounwind {
+define i8 @t5(i8 %x) {
+; CHECK-LABEL: t5:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: tzcntl %eax, %eax
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%tmp = tail call i8 @llvm.cttz.i8( i8 %x, i1 true )
ret i8 %tmp
-; CHECK-LABEL: t5:
-; CHECK: tzcntl
}
-define i16 @t6(i16 %x) nounwind {
+define i16 @t6(i16 %x) {
+; CHECK-LABEL: t6:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntw %di, %ax
+; CHECK-NEXT: retq
%tmp = tail call i16 @llvm.cttz.i16( i16 %x, i1 true )
ret i16 %tmp
-; CHECK-LABEL: t6:
-; CHECK: tzcntw
}
-define i32 @t7(i32 %x) nounwind {
+define i32 @t7(i32 %x) {
+; CHECK-LABEL: t7:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntl %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.cttz.i32( i32 %x, i1 true )
ret i32 %tmp
-; CHECK-LABEL: t7:
-; CHECK: tzcntl
}
-define i64 @t8(i64 %x) nounwind {
+define i64 @t8(i64 %x) {
+; CHECK-LABEL: t8:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.cttz.i64( i64 %x, i1 true )
ret i64 %tmp
-; CHECK-LABEL: t8:
-; CHECK: tzcntq
}
-define i32 @andn32(i32 %x, i32 %y) nounwind readnone {
+define i32 @andn32(i32 %x, i32 %y) {
+; CHECK-LABEL: andn32:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp1 = xor i32 %x, -1
%tmp2 = and i32 %y, %tmp1
ret i32 %tmp2
-; CHECK-LABEL: andn32:
-; CHECK: andnl
}
-define i32 @andn32_load(i32 %x, i32* %y) nounwind readnone {
+define i32 @andn32_load(i32 %x, i32* %y) {
+; CHECK-LABEL: andn32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl (%rsi), %edi, %eax
+; CHECK-NEXT: retq
%y1 = load i32, i32* %y
%tmp1 = xor i32 %x, -1
%tmp2 = and i32 %y1, %tmp1
ret i32 %tmp2
-; CHECK-LABEL: andn32_load:
-; CHECK: andnl ({{.*}})
}
-define i64 @andn64(i64 %x, i64 %y) nounwind readnone {
+define i64 @andn64(i64 %x, i64 %y) {
+; CHECK-LABEL: andn64:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp1 = xor i64 %x, -1
%tmp2 = and i64 %tmp1, %y
ret i64 %tmp2
-; CHECK-LABEL: andn64:
-; CHECK: andnq
}
-define i32 @bextr32(i32 %x, i32 %y) nounwind readnone {
+; Don't choose a 'test' if an 'andn' can be used.
+define i1 @andn_cmp(i32 %x, i32 %y) {
+; CHECK-LABEL: andn_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %notx = xor i32 %x, -1
+ %and = and i32 %notx, %y
+ %cmp = icmp eq i32 %and, 0
+ ret i1 %cmp
+}
+
+; Recognize a disguised andn in the following 4 tests.
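+; ((x & y) == y) is equivalent to ((~x & y) == 0), so each of these lowers to
+; andn followed by sete/setne.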
+define i1 @and_cmp1(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp1:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %and = and i32 %x, %y
+ %cmp = icmp eq i32 %and, %y
+ ret i1 %cmp
+}
+
+define i1 @and_cmp2(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+ %and = and i32 %y, %x
+ %cmp = icmp ne i32 %and, %y
+ ret i1 %cmp
+}
+
+define i1 @and_cmp3(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp3:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %and = and i32 %x, %y
+ %cmp = icmp eq i32 %y, %and
+ ret i1 %cmp
+}
+
+define i1 @and_cmp4(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp4:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+ %and = and i32 %y, %x
+ %cmp = icmp ne i32 %y, %and
+ ret i1 %cmp
+}
+
+; A mask and compare against a constant is OK for an 'andn' too,
+; even though the BMI instruction doesn't have an immediate form.
+define i1 @and_cmp_const(i32 %x) {
+; CHECK-LABEL: and_cmp_const:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $43, %eax
+; CHECK-NEXT: andnl %eax, %edi, %eax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %and = and i32 %x, 43
+ %cmp = icmp eq i32 %and, 43
+ ret i1 %cmp
+}
+
+; But don't use 'andn' if the mask is a power-of-two.
+define i1 @and_cmp_const_power_of_two(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp_const_power_of_two:
+; CHECK: # BB#0:
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+ %shl = shl i32 1, %y
+ %and = and i32 %x, %shl
+ %cmp = icmp ne i32 %and, %shl
+ ret i1 %cmp
+}
+
+; Don't transform to 'andn' if there's another use of the 'and'.
+define i32 @and_cmp_not_one_use(i32 %x) {
+; CHECK-LABEL: and_cmp_not_one_use:
+; CHECK: # BB#0:
+; CHECK-NEXT: andl $37, %edi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $37, %edi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: retq
+ %and = and i32 %x, 37
+ %cmp = icmp eq i32 %and, 37
+ %ext = zext i1 %cmp to i32
+ %add = add i32 %and, %ext
+ ret i32 %add
+}
+
+; Verify that we're not transforming invalid comparison predicates.
+define i1 @not_an_andn1(i32 %x, i32 %y) {
+; CHECK-LABEL: not_an_andn1:
+; CHECK: # BB#0:
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: cmpl %edi, %esi
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: retq
+ %and = and i32 %x, %y
+ %cmp = icmp sgt i32 %y, %and
+ ret i1 %cmp
+}
+
+define i1 @not_an_andn2(i32 %x, i32 %y) {
+; CHECK-LABEL: not_an_andn2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: cmpl %edi, %esi
+; CHECK-NEXT: setbe %al
+; CHECK-NEXT: retq
+ %and = and i32 %y, %x
+ %cmp = icmp ule i32 %y, %and
+ ret i1 %cmp
+}
+
+; Don't choose a 'test' if an 'andn' can be used.
+define i1 @andn_cmp_swap_ops(i64 %x, i64 %y) {
+; CHECK-LABEL: andn_cmp_swap_ops:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnq %rsi, %rdi, %rax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %notx = xor i64 %x, -1
+ %and = and i64 %y, %notx
+ %cmp = icmp eq i64 %and, 0
+ ret i1 %cmp
+}
+
+; Use a 'test' (not an 'and') because 'andn' only works for i32/i64.
+define i1 @andn_cmp_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: andn_cmp_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: notb %sil
+; CHECK-NEXT: testb %sil, %dil
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %noty = xor i8 %y, -1
+ %and = and i8 %x, %noty
+ %cmp = icmp eq i8 %and, 0
+ ret i1 %cmp
+}
+
+define i32 @bextr32(i32 %x, i32 %y) {
+; CHECK-LABEL: bextr32:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextrl %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: bextr32:
-; CHECK: bextrl
}
-define i32 @bextr32_load(i32* %x, i32 %y) nounwind readnone {
+define i32 @bextr32_load(i32* %x, i32 %y) {
+; CHECK-LABEL: bextr32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextrl %esi, (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x1, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: bextr32_load:
-; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}}
}
-declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
+declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
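+
+; In the bextr32b/bextr64b tests below, the immediate 0xC04 is a BEXTR control
+; value: bits [7:0] give the start bit (4) and bits [15:8] the length (12),
+; i.e. (x >> 4) & 4095.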
-define i32 @bextr32b(i32 %x) nounwind uwtable readnone ssp {
+define i32 @bextr32b(i32 %x) uwtable ssp {
+; CHECK-LABEL: bextr32b:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT: bextrl %eax, %edi, %eax
+; CHECK-NEXT: retq
%1 = lshr i32 %x, 4
%2 = and i32 %1, 4095
ret i32 %2
-; CHECK-LABEL: bextr32b:
-; CHECK: bextrl
}
-define i32 @bextr32b_load(i32* %x) nounwind uwtable readnone ssp {
+define i32 @bextr32b_load(i32* %x) uwtable ssp {
+; CHECK-LABEL: bextr32b_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT: bextrl %eax, (%rdi), %eax
+; CHECK-NEXT: retq
%1 = load i32, i32* %x
%2 = lshr i32 %1, 4
%3 = and i32 %2, 4095
ret i32 %3
-; CHECK-LABEL: bextr32b_load:
-; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}}
}
-define i64 @bextr64(i64 %x, i64 %y) nounwind readnone {
+define i64 @bextr64(i64 %x, i64 %y) {
+; CHECK-LABEL: bextr64:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextrq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %y)
ret i64 %tmp
-; CHECK-LABEL: bextr64:
-; CHECK: bextrq
}
-declare i64 @llvm.x86.bmi.bextr.64(i64, i64) nounwind readnone
+declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
-define i64 @bextr64b(i64 %x) nounwind uwtable readnone ssp {
+define i64 @bextr64b(i64 %x) uwtable ssp {
+; CHECK-LABEL: bextr64b:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT: bextrl %eax, %edi, %eax
+; CHECK-NEXT: retq
%1 = lshr i64 %x, 4
%2 = and i64 %1, 4095
ret i64 %2
-; CHECK-LABEL: bextr64b:
-; CHECK: bextrq
}
define i64 @bextr64b_load(i64* %x) {
+; CHECK-LABEL: bextr64b_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT: bextrl %eax, (%rdi), %eax
+; CHECK-NEXT: retq
%1 = load i64, i64* %x, align 8
%2 = lshr i64 %1, 4
%3 = and i64 %2, 4095
ret i64 %3
-; CHECK-LABEL: bextr64b_load:
-; CHECK: bextrq {{.*}}, ({{.*}}), {{.*}}
}
define i32 @non_bextr32(i32 %x) {
+; CHECK-LABEL: non_bextr32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: shrl $2, %edi
+; CHECK-NEXT: andl $111, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
entry:
%shr = lshr i32 %x, 2
%and = and i32 %shr, 111
ret i32 %and
-; CHECK-LABEL: non_bextr32:
-; CHECK: shrl $2
-; CHECK: andl $111
}
define i64 @non_bextr64(i64 %x) {
+; CHECK-LABEL: non_bextr64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: movabsq $8589934590, %rax # imm = 0x1FFFFFFFE
+; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: retq
entry:
%shr = lshr i64 %x, 2
%and = and i64 %shr, 8589934590
ret i64 %and
-; CHECK-LABEL: non_bextr64:
-; CHECK: shrq $2
-; CHECK: movabsq $8589934590
-; CHECK: andq
}
-define i32 @bzhi32(i32 %x, i32 %y) nounwind readnone {
+define i32 @bzhi32(i32 %x, i32 %y) {
+; CHECK-LABEL: bzhi32:
+; CHECK: # BB#0:
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: bzhi32:
-; CHECK: bzhil
}
-define i32 @bzhi32_load(i32* %x, i32 %y) nounwind readnone {
+define i32 @bzhi32_load(i32* %x, i32 %y) {
+; CHECK-LABEL: bzhi32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: bzhil %esi, (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x1, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: bzhi32_load:
-; CHECK: bzhil {{.*}}, ({{.*}}), {{.*}}
}
-declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) nounwind readnone
+declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
-define i64 @bzhi64(i64 %x, i64 %y) nounwind readnone {
+define i64 @bzhi64(i64 %x, i64 %y) {
+; CHECK-LABEL: bzhi64:
+; CHECK: # BB#0:
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %x, i64 %y)
ret i64 %tmp
-; CHECK-LABEL: bzhi64:
-; CHECK: bzhiq
}
-declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) nounwind readnone
+declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
-define i32 @bzhi32b(i32 %x, i8 zeroext %index) #0 {
+define i32 @bzhi32b(i32 %x, i8 zeroext %index) {
+; CHECK-LABEL: bzhi32b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
entry:
%conv = zext i8 %index to i32
%shl = shl i32 1, %conv
%sub = add nsw i32 %shl, -1
%and = and i32 %sub, %x
ret i32 %and
-; CHECK-LABEL: bzhi32b:
-; CHECK: bzhil
}
-define i32 @bzhi32b_load(i32* %w, i8 zeroext %index) #0 {
+define i32 @bzhi32b_load(i32* %w, i8 zeroext %index) {
+; CHECK-LABEL: bzhi32b_load:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhil %esi, (%rdi), %eax
+; CHECK-NEXT: retq
entry:
%x = load i32, i32* %w
%conv = zext i8 %index to i32
@@ -220,173 +439,211 @@ entry:
%sub = add nsw i32 %shl, -1
%and = and i32 %sub, %x
ret i32 %and
-; CHECK-LABEL: bzhi32b_load:
-; CHECK: bzhil {{.*}}, ({{.*}}), {{.*}}
}
-define i32 @bzhi32c(i32 %x, i8 zeroext %index) #0 {
+define i32 @bzhi32c(i32 %x, i8 zeroext %index) {
+; CHECK-LABEL: bzhi32c:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
entry:
%conv = zext i8 %index to i32
%shl = shl i32 1, %conv
%sub = add nsw i32 %shl, -1
%and = and i32 %x, %sub
ret i32 %and
-; CHECK-LABEL: bzhi32c:
-; CHECK: bzhil
}
-define i64 @bzhi64b(i64 %x, i8 zeroext %index) #0 {
+define i64 @bzhi64b(i64 %x, i8 zeroext %index) {
+; CHECK-LABEL: bzhi64b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
entry:
%conv = zext i8 %index to i64
%shl = shl i64 1, %conv
%sub = add nsw i64 %shl, -1
%and = and i64 %x, %sub
ret i64 %and
-; CHECK-LABEL: bzhi64b:
-; CHECK: bzhiq
}
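+
+; 4611686018427387903 is 2^62 - 1, so this mask is equivalent to bzhi with a
+; bit index of 62 (hence the movb $62 below).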
-define i64 @bzhi64_constant_mask(i64 %x) #0 {
+define i64 @bzhi64_constant_mask(i64 %x) {
+; CHECK-LABEL: bzhi64_constant_mask:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movb $62, %al
+; CHECK-NEXT: bzhiq %rax, %rdi, %rax
+; CHECK-NEXT: retq
entry:
%and = and i64 %x, 4611686018427387903
ret i64 %and
-; CHECK-LABEL: bzhi64_constant_mask:
-; CHECK: movb $62, %al
-; CHECK: bzhiq %rax, %r[[ARG1:di|cx]], %rax
}
-define i64 @bzhi64_small_constant_mask(i64 %x) #0 {
+define i64 @bzhi64_small_constant_mask(i64 %x) {
+; CHECK-LABEL: bzhi64_small_constant_mask:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
entry:
%and = and i64 %x, 2147483647
ret i64 %and
-; CHECK-LABEL: bzhi64_small_constant_mask:
-; CHECK: andl $2147483647, %e[[ARG1]]
}
-define i32 @blsi32(i32 %x) nounwind readnone {
+define i32 @blsi32(i32 %x) {
+; CHECK-LABEL: blsi32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsil %edi, %eax
+; CHECK-NEXT: retq
%tmp = sub i32 0, %x
%tmp2 = and i32 %x, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsi32:
-; CHECK: blsil
}
-define i32 @blsi32_load(i32* %x) nounwind readnone {
+define i32 @blsi32_load(i32* %x) {
+; CHECK-LABEL: blsi32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsil (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = sub i32 0, %x1
%tmp2 = and i32 %x1, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsi32_load:
-; CHECK: blsil ({{.*}})
}
-define i64 @blsi64(i64 %x) nounwind readnone {
+define i64 @blsi64(i64 %x) {
+; CHECK-LABEL: blsi64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsiq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = sub i64 0, %x
%tmp2 = and i64 %tmp, %x
ret i64 %tmp2
-; CHECK-LABEL: blsi64:
-; CHECK: blsiq
}
-define i32 @blsmsk32(i32 %x) nounwind readnone {
+define i32 @blsmsk32(i32 %x) {
+; CHECK-LABEL: blsmsk32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsmskl %edi, %eax
+; CHECK-NEXT: retq
%tmp = sub i32 %x, 1
%tmp2 = xor i32 %x, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsmsk32:
-; CHECK: blsmskl
}
-define i32 @blsmsk32_load(i32* %x) nounwind readnone {
+define i32 @blsmsk32_load(i32* %x) {
+; CHECK-LABEL: blsmsk32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsmskl (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = sub i32 %x1, 1
%tmp2 = xor i32 %x1, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsmsk32_load:
-; CHECK: blsmskl ({{.*}})
}
-define i64 @blsmsk64(i64 %x) nounwind readnone {
+define i64 @blsmsk64(i64 %x) {
+; CHECK-LABEL: blsmsk64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsmskq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = sub i64 %x, 1
%tmp2 = xor i64 %tmp, %x
ret i64 %tmp2
-; CHECK-LABEL: blsmsk64:
-; CHECK: blsmskq
}
-define i32 @blsr32(i32 %x) nounwind readnone {
+define i32 @blsr32(i32 %x) {
+; CHECK-LABEL: blsr32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsrl %edi, %eax
+; CHECK-NEXT: retq
%tmp = sub i32 %x, 1
%tmp2 = and i32 %x, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsr32:
-; CHECK: blsrl
}
-define i32 @blsr32_load(i32* %x) nounwind readnone {
+define i32 @blsr32_load(i32* %x) {
+; CHECK-LABEL: blsr32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsrl (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = sub i32 %x1, 1
%tmp2 = and i32 %x1, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsr32_load:
-; CHECK: blsrl ({{.*}})
}
-define i64 @blsr64(i64 %x) nounwind readnone {
+define i64 @blsr64(i64 %x) {
+; CHECK-LABEL: blsr64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsrq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = sub i64 %x, 1
%tmp2 = and i64 %tmp, %x
ret i64 %tmp2
-; CHECK-LABEL: blsr64:
-; CHECK: blsrq
}
-define i32 @pdep32(i32 %x, i32 %y) nounwind readnone {
+define i32 @pdep32(i32 %x, i32 %y) {
+; CHECK-LABEL: pdep32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pdepl %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: pdep32:
-; CHECK: pdepl
}
-define i32 @pdep32_load(i32 %x, i32* %y) nounwind readnone {
+define i32 @pdep32_load(i32 %x, i32* %y) {
+; CHECK-LABEL: pdep32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: pdepl (%rsi), %edi, %eax
+; CHECK-NEXT: retq
%y1 = load i32, i32* %y
%tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y1)
ret i32 %tmp
-; CHECK-LABEL: pdep32_load:
-; CHECK: pdepl ({{.*}})
}
-declare i32 @llvm.x86.bmi.pdep.32(i32, i32) nounwind readnone
+declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
-define i64 @pdep64(i64 %x, i64 %y) nounwind readnone {
+define i64 @pdep64(i64 %x, i64 %y) {
+; CHECK-LABEL: pdep64:
+; CHECK: # BB#0:
+; CHECK-NEXT: pdepq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 %y)
ret i64 %tmp
-; CHECK-LABEL: pdep64:
-; CHECK: pdepq
}
-declare i64 @llvm.x86.bmi.pdep.64(i64, i64) nounwind readnone
+declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
-define i32 @pext32(i32 %x, i32 %y) nounwind readnone {
+define i32 @pext32(i32 %x, i32 %y) {
+; CHECK-LABEL: pext32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pextl %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: pext32:
-; CHECK: pextl
}
-define i32 @pext32_load(i32 %x, i32* %y) nounwind readnone {
+define i32 @pext32_load(i32 %x, i32* %y) {
+; CHECK-LABEL: pext32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: pextl (%rsi), %edi, %eax
+; CHECK-NEXT: retq
%y1 = load i32, i32* %y
%tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y1)
ret i32 %tmp
-; CHECK-LABEL: pext32_load:
-; CHECK: pextl ({{.*}})
}
-declare i32 @llvm.x86.bmi.pext.32(i32, i32) nounwind readnone
+declare i32 @llvm.x86.bmi.pext.32(i32, i32)
-define i64 @pext64(i64 %x, i64 %y) nounwind readnone {
+define i64 @pext64(i64 %x, i64 %y) {
+; CHECK-LABEL: pext64:
+; CHECK: # BB#0:
+; CHECK-NEXT: pextq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 %y)
ret i64 %tmp
-; CHECK-LABEL: pext64:
-; CHECK: pextq
}
-declare i64 @llvm.x86.bmi.pext.64(i64, i64) nounwind readnone
+declare i64 @llvm.x86.bmi.pext.64(i64, i64)
diff --git a/test/CodeGen/X86/bool-zext.ll b/test/CodeGen/X86/bool-zext.ll
index c98ad9e36d7e..5cc758c06b5d 100644
--- a/test/CodeGen/X86/bool-zext.ll
+++ b/test/CodeGen/X86/bool-zext.ll
@@ -1,10 +1,15 @@
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
-; X64: @bar1
+; Check that the argument gets zero-extended before calling.
+; X86-LABEL: bar1
+; X86: movzbl
+; X86: calll
+; X64-LABEL: bar1
; X64: movzbl
; X64: jmp
-; WIN64: @bar1
+; WIN64-LABEL: bar1
; WIN64: movzbl
; WIN64: callq
define void @bar1(i1 zeroext %v1) nounwind ssp {
@@ -14,10 +19,11 @@ entry:
ret void
}
-; X64: @bar2
+; Check that on x86-64 the arguments are simply forwarded.
+; X64-LABEL: bar2
; X64-NOT: movzbl
; X64: jmp
-; WIN64: @bar2
+; WIN64-LABEL: bar2
; WIN64-NOT: movzbl
; WIN64: callq
define void @bar2(i8 zeroext %v1) nounwind ssp {
@@ -27,16 +33,19 @@ entry:
ret void
}
-; X64: @bar3
-; X64: callq
-; X64-NOT: movzbl
-; X64-NOT: and
-; X64: ret
-; WIN64: @bar3
-; WIN64: callq
-; WIN64-NOT: movzbl
-; WIN64-NOT: and
-; WIN64: ret
+; Check that i1 return values are not zero-extended.
+; X86-LABEL: bar3
+; X86: call
+; X86-NEXT: {{add|pop}}
+; X86-NEXT: ret
+; X64-LABEL: bar3
+; X64: call
+; X64-NEXT: {{add|pop}}
+; X64-NEXT: ret
+; WIN64-LABEL: bar3
+; WIN64: call
+; WIN64-NEXT: {{add|pop}}
+; WIN64-NEXT: ret
define zeroext i1 @bar3() nounwind ssp {
entry:
%call = call i1 @foo2() nounwind
diff --git a/test/CodeGen/X86/br-fold.ll b/test/CodeGen/X86/br-fold.ll
index fd1e73bde8cc..d3aedbb17e7d 100644
--- a/test/CodeGen/X86/br-fold.ll
+++ b/test/CodeGen/X86/br-fold.ll
@@ -2,11 +2,14 @@
; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck -check-prefix=X64_LINUX %s
; RUN: llc -mtriple=x86_64-pc-windows < %s | FileCheck -check-prefix=X64_WINDOWS %s
; RUN: llc -mtriple=x86_64-pc-windows-gnu < %s | FileCheck -check-prefix=X64_WINDOWS_GNU %s
+; RUN: llc -mtriple=x86_64-scei-ps4 < %s | FileCheck -check-prefix=PS4 %s
; X64_DARWIN: orq
+; X64_DARWIN-NEXT: jne
; X64_DARWIN-NEXT: %bb8.i329
; X64_LINUX: orq %rax, %rcx
+; X64_LINUX-NEXT: jne
; X64_LINUX-NEXT: %bb8.i329
; X64_WINDOWS: orq %rax, %rcx
@@ -15,6 +18,9 @@
; X64_WINDOWS_GNU: orq %rax, %rcx
; X64_WINDOWS_GNU-NEXT: ud2
+; PS4: orq %rax, %rcx
+; PS4-NEXT: ud2
+
@_ZN11xercesc_2_513SchemaSymbols21fgURI_SCHEMAFORSCHEMAE = external constant [33 x i16], align 32 ; <[33 x i16]*> [#uses=1]
@_ZN11xercesc_2_56XMLUni16fgNotationStringE = external constant [9 x i16], align 16 ; <[9 x i16]*> [#uses=1]
diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll
index 699de22d5b56..74a0728f918d 100644
--- a/test/CodeGen/X86/break-false-dep.ll
+++ b/test/CodeGen/X86/break-false-dep.ll
@@ -64,7 +64,7 @@ declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
; SSE-LABEL: loopdep1
-; SSE: for.body
+; SSE: for.body{{$}}
;
; This loop contains two cvtsi2ss instructions that update the same xmm
; register. Verify that the execution dependency fix pass breaks those
@@ -139,7 +139,7 @@ ret:
; This loop contains a cvtsi2sd instruction that has a loop-carried
; false dependency on an xmm that is modified by other scalar instructions
-; that follow it in the loop. Additionally, the source of convert is a
+; that follow it in the loop. Additionally, the source of convert is a
; memory operand. Verify the execution dependency fix pass breaks this
; dependency by inserting a xor before the convert.
@x = common global [1024 x double] zeroinitializer, align 16
diff --git a/test/CodeGen/X86/bss_pagealigned.ll b/test/CodeGen/X86/bss_pagealigned.ll
index da95aca110da..4e9f9241011c 100644
--- a/test/CodeGen/X86/bss_pagealigned.ll
+++ b/test/CodeGen/X86/bss_pagealigned.ll
@@ -15,7 +15,7 @@ define void @unxlate_dev_mem_ptr(i64 %phis, i8* %addr) nounwind {
}
@bm_pte = internal global [512 x %struct.kmem_cache_order_objects] zeroinitializer, section ".bss.page_aligned", align 4096
; CHECK: .section .bss.page_aligned,"aw",@nobits
-; CHECK-NEXT: .align 4096
+; CHECK-NEXT: .p2align 12
; CHECK-NEXT: bm_pte:
; CHECK-NEXT: .zero 4096
; CHECK-NEXT: .size bm_pte, 4096
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index 5376601a95e3..6697183ab679 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -1,7 +1,8 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-NOSSSE3
; RUN: llc < %s -mcpu=core2 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-SSSE3
-; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK-AVX --check-prefix=CHECK-AVX2
-; RUN: llc < %s -mcpu=core-avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE-AVX2
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-AVX --check-prefix=CHECK-AVX2
+; RUN: llc < %s -mcpu=core-avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-WIDE-AVX --check-prefix=CHECK-WIDE-AVX2
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -29,15 +30,15 @@ define <8 x i16> @test1(<8 x i16> %v) {
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test1:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test1:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test1:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test1:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
ret <8 x i16> %r
@@ -62,15 +63,15 @@ define <4 x i32> @test2(<4 x i32> %v) {
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test2:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test2:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test2:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test2:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
ret <4 x i32> %r
@@ -97,15 +98,15 @@ define <2 x i64> @test3(<2 x i64> %v) {
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test3:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test3:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test3:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test3:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
ret <2 x i64> %r
@@ -144,15 +145,15 @@ define <16 x i16> @test4(<16 x i16> %v) {
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test4:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test4:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test4:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test4:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
ret <16 x i16> %r
@@ -187,15 +188,15 @@ define <8 x i32> @test5(<8 x i32> %v) {
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test5:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test5:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test5:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test5:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
ret <8 x i32> %r
@@ -234,15 +235,15 @@ define <4 x i64> @test6(<4 x i64> %v) {
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test6:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test6:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test6:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test6:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
ret <4 x i64> %r
@@ -271,16 +272,16 @@ define <4 x i16> @test7(<4 x i16> %v) {
; CHECK-SSSE3-NEXT: psrld $16, %xmm0
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test7:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test7:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test7:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test7:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v)
ret <4 x i16> %r
@@ -293,7 +294,7 @@ entry:
define <8 x i16> @identity_v8i16(<8 x i16> %v) {
; CHECK-ALL-LABEL: identity_v8i16:
; CHECK-ALL: # BB#0: # %entry
-; CHECK-ALL: retq
+; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
%bs2 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %bs1)
@@ -374,6 +375,11 @@ define <8 x i16> @fold_v8i16() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v8i16:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6>)
ret <8 x i16> %r
@@ -389,6 +395,11 @@ define <4 x i32> @fold_v4i32() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v4i32:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> <i32 0, i32 -1, i32 2, i32 -3>)
ret <4 x i32> %r
@@ -404,6 +415,11 @@ define <2 x i64> @fold_v2i64() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v2i64:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> <i64 255, i64 -1>)
ret <2 x i64> %r
@@ -420,6 +436,11 @@ define <16 x i16> @fold_v16i16() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v16i16:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14>)
ret <16 x i16> %r
@@ -436,6 +457,11 @@ define <8 x i32> @fold_v8i32() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v8i32:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> <i32 0, i32 1, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6>)
ret <8 x i32> %r
@@ -452,6 +478,11 @@ define <4 x i64> @fold_v4i64() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v4i64:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> <i64 255, i64 -1, i64 65535, i64 16776960>)
ret <4 x i64> %r
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index 036ec0acc6e8..aee4a93c6473 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=penryn | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=CHECK --check-prefix=PENTIUM4 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=AVX-512 %s
+
; PR3253
; The register+memory form of the BT instruction should be usable on
@@ -18,516 +21,950 @@
; - The and can be commuted.
define void @test2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: test2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB0_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB0_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: test2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: jne .LBB0_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB0_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: test2
-; CHECK: btl %ecx, %eax
-; CHECK: jb
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @test2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: test2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB1_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB1_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: test2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: jne .LBB1_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB1_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: test2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @atest2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: atest2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB2_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB2_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: atest2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: jne .LBB2_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB2_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: atest2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @atest2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: atest2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB3_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB3_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: atest2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: jne .LBB3_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB3_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: atest2b
-; CHECK: btl %e{{..}}, %e{{..}}
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @test3(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB4_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB4_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: test3
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @test3b(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: test3b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB5_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB5_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: test3b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @testne2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: testne2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB6_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB6_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: testne2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB6_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB6_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: testne2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @testne2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: testne2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB7_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB7_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: testne2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB7_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB7_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: testne2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @atestne2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: atestne2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB8_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB8_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: atestne2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB8_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB8_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: atestne2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @atestne2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: atestne2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB9_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB9_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: atestne2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB9_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB9_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: atestne2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @testne3(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: testne3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB10_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB10_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: testne3
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @testne3b(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: testne3b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB11_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB11_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: testne3b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: query2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB12_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB12_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: query2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB12_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB12_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: query2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp eq i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: query2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB13_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB13_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: query2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB13_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB13_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: query2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @aquery2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: aquery2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB14_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB14_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: aquery2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB14_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB14_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: aquery2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp eq i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @aquery2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: aquery2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB15_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB15_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: aquery2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB15_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB15_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: aquery2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query3(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: query3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB16_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB16_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: query3
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, %tmp29 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp eq i32 %tmp3, %tmp29
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query3b(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: query3b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB17_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB17_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: query3b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp eq i32 %tmp3, %tmp29 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, %tmp29
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query3x(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: query3x:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB18_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB18_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: query3x
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp29, %tmp3 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp eq i32 %tmp29, %tmp3
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query3bx(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: query3bx:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB19_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB19_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: query3bx
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp eq i32 %tmp29, %tmp3 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp29, %tmp3
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: queryne2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB20_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB20_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: queryne2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: andl $1, %edi
+; AVX-512-NEXT: kmovw %edi, %k0
+; AVX-512-NEXT: kxnorw %k0, %k0, %k1
+; AVX-512-NEXT: kshiftrw $15, %k1, %k1
+; AVX-512-NEXT: kxorw %k1, %k0, %k0
+; AVX-512-NEXT: kmovw %k0, %eax
+; AVX-512-NEXT: testb %al, %al
+; AVX-512-NEXT: je .LBB20_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB20_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: queryne2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp ne i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: queryne2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB21_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB21_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: queryne2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: andl $1, %edi
+; AVX-512-NEXT: kmovw %edi, %k0
+; AVX-512-NEXT: kxnorw %k0, %k0, %k1
+; AVX-512-NEXT: kshiftrw $15, %k1, %k1
+; AVX-512-NEXT: kxorw %k1, %k0, %k0
+; AVX-512-NEXT: kmovw %k0, %eax
+; AVX-512-NEXT: testb %al, %al
+; AVX-512-NEXT: je .LBB21_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB21_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: queryne2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @aqueryne2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: aqueryne2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB22_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB22_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: aqueryne2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: andl $1, %edi
+; AVX-512-NEXT: kmovw %edi, %k0
+; AVX-512-NEXT: kxnorw %k0, %k0, %k1
+; AVX-512-NEXT: kshiftrw $15, %k1, %k1
+; AVX-512-NEXT: kxorw %k1, %k0, %k0
+; AVX-512-NEXT: kmovw %k0, %eax
+; AVX-512-NEXT: testb %al, %al
+; AVX-512-NEXT: je .LBB22_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB22_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: aqueryne2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp ne i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @aqueryne2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: aqueryne2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB23_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB23_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: aqueryne2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: andl $1, %edi
+; AVX-512-NEXT: kmovw %edi, %k0
+; AVX-512-NEXT: kxnorw %k0, %k0, %k1
+; AVX-512-NEXT: kshiftrw $15, %k1, %k1
+; AVX-512-NEXT: kxorw %k1, %k0, %k0
+; AVX-512-NEXT: kmovw %k0, %eax
+; AVX-512-NEXT: testb %al, %al
+; AVX-512-NEXT: je .LBB23_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB23_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: aqueryne2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne3(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: queryne3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB24_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB24_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: queryne3
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, %tmp29 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp ne i32 %tmp3, %tmp29
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne3b(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: queryne3b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB25_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB25_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: queryne3b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp ne i32 %tmp3, %tmp29 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, %tmp29
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne3x(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: queryne3x:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB26_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB26_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: queryne3x
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp29, %tmp3 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp ne i32 %tmp29, %tmp3
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne3bx(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: queryne3bx:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB27_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB27_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: queryne3bx
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp ne i32 %tmp29, %tmp3 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp29, %tmp3
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
declare void @foo()
define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind {
-; CHECK: btl
-entry:
+; CHECK-LABEL: invert:
+; CHECK: # BB#0:
+; CHECK-NEXT: notl %edi
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
%neg = xor i32 %flags, -1
%shl = shl i32 1, %flag
%and = and i32 %shl, %neg
%tobool = icmp ne i32 %and, 0
ret i1 %tobool
}
+
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
index fd7290d58179..2ee33a1a9028 100644
--- a/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -47,7 +47,7 @@ entry:
define <2 x double> @test_negative_zero_2(<2 x double> %A) {
; CHECK-LABEL: test_negative_zero_2:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movhpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: retq
entry:
%0 = extractelement <2 x double> %A, i32 0
diff --git a/test/CodeGen/X86/byval2.ll b/test/CodeGen/X86/byval2.ll
index cc72a8699a9c..5eb8b590e8da 100644
--- a/test/CodeGen/X86/byval2.ll
+++ b/test/CodeGen/X86/byval2.ll
@@ -37,8 +37,8 @@ entry:
store i64 %b, i64* %tmp2, align 16
%tmp4 = getelementptr %struct.s, %struct.s* %d, i32 0, i32 2
store i64 %c, i64* %tmp4, align 16
- call void @f( %struct.s*byval %d )
- call void @f( %struct.s*byval %d )
+ call void @f( %struct.s* byval %d )
+ call void @f( %struct.s* byval %d )
ret void
}
diff --git a/test/CodeGen/X86/call-push.ll b/test/CodeGen/X86/call-push.ll
index 6bcb5d665618..e8afa1e77afa 100644
--- a/test/CodeGen/X86/call-push.ll
+++ b/test/CodeGen/X86/call-push.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -disable-fp-elim -no-x86-call-frame-opt | FileCheck %s
%struct.decode_t = type { i8, i8, i8, i8, i16, i8, i8, %struct.range_t** }
%struct.range_t = type { float, float, i32, i32, i32, [0 x i8] }
diff --git a/test/CodeGen/X86/catchpad-dynamic-alloca.ll b/test/CodeGen/X86/catchpad-dynamic-alloca.ll
new file mode 100644
index 000000000000..4e8a8d8868bd
--- /dev/null
+++ b/test/CodeGen/X86/catchpad-dynamic-alloca.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+declare void @rt_init()
+
+declare i32 @__CxxFrameHandler3(...)
+
+define void @test1(void ()* %fp, i64 %n) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %t.i = alloca i8*
+ %t.ii = alloca i8
+ %.alloca8 = alloca i8, i64 %n
+ store volatile i8 0, i8* %t.ii
+ store volatile i8 0, i8* %.alloca8
+ invoke void @rt_init()
+ to label %try.cont unwind label %catch.switch
+
+try.cont:
+ invoke void %fp()
+ to label %exit unwind label %catch.switch
+
+exit:
+ ret void
+
+catch.pad:
+ %cp = catchpad within %cs [i8* null, i32 0, i8** %t.i]
+ catchret from %cp to label %exit
+
+catch.switch:
+ %cs = catchswitch within none [label %catch.pad] unwind to caller
+}
+
+; CHECK-LABEL: $handlerMap$0$test1:
+; CHECK: .long 0
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 16
+
+define void @test2(void ()* %fp, i64 %n) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %t.i = alloca i128
+ %.alloca8 = alloca i8, i64 %n
+ store volatile i8 0, i8* %.alloca8
+ invoke void @rt_init()
+ to label %try.cont unwind label %catch.switch
+
+try.cont:
+ invoke void %fp()
+ to label %exit unwind label %catch.switch
+
+exit:
+ ret void
+
+catch.pad:
+ %cp = catchpad within %cs [i8* null, i32 0, i128* %t.i]
+ catchret from %cp to label %exit
+
+catch.switch:
+ %cs = catchswitch within none [label %catch.pad] unwind to caller
+}
+
+; CHECK-LABEL: $handlerMap$0$test2:
+; CHECK: .long 0
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 8
diff --git a/test/CodeGen/X86/catchpad-lifetime.ll b/test/CodeGen/X86/catchpad-lifetime.ll
index dfd75334561f..77d3f25057cf 100644
--- a/test/CodeGen/X86/catchpad-lifetime.ll
+++ b/test/CodeGen/X86/catchpad-lifetime.ll
@@ -16,7 +16,7 @@ entry:
to label %unreachable unwind label %catch.dispatch
; CHECK-LABEL: test1:
-; CHECK: movq $0, -16(%rbp)
+; CHECK: movq $0, -8(%rbp)
; CHECK: callq throw
catch.dispatch: ; preds = %entry
@@ -33,8 +33,8 @@ catch.pad: ; preds = %catch.dispatch
unreachable
; CHECK-LABEL: "?catch$2@?0?test1@4HA"
-; CHECK: movq $0, -16(%rbp)
-; CHECK: movq $0, -16(%rbp)
+; CHECK: movq $0, -8(%rbp)
+; CHECK: movq $0, -8(%rbp)
; CHECK: ud2
unreachable: ; preds = %entry
@@ -42,7 +42,7 @@ unreachable: ; preds = %entry
}
; CHECK-LABEL: $cppxdata$test1:
-; CHECK: .long 32 # CatchObjOffset
+; CHECK: .long 56 # CatchObjOffset
define void @test2() personality i32 (...)* @__CxxFrameHandler3 {
entry:
diff --git a/test/CodeGen/X86/catchret-regmask.ll b/test/CodeGen/X86/catchret-regmask.ll
new file mode 100644
index 000000000000..1231172a7e95
--- /dev/null
+++ b/test/CodeGen/X86/catchret-regmask.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+declare i32 @__CxxFrameHandler3(...)
+declare void @throw() noreturn uwtable
+declare i8* @getval()
+
+define i8* @reload_out_of_pad(i8* %arg) #0 personality i32 (...)* @__CxxFrameHandler3 {
+assertPassed:
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+catch:
+ %cp = catchpad within %cs [i8* null, i32 0, i8* null]
+ catchret from %cp to label %return
+
+ ; This block *must* appear after the catchret to test the bug.
+ ; FIXME: Make this an MIR test so we can control MBB layout.
+unreachable:
+ unreachable
+
+catch.dispatch:
+ %cs = catchswitch within none [label %catch] unwind to caller
+
+return:
+ ret i8* %arg
+}
+
+; CHECK-LABEL: reload_out_of_pad: # @reload_out_of_pad
+; CHECK: movq %rcx, -[[arg_slot:[0-9]+]](%rbp) # 8-byte Spill
+; CHECK: callq throw
+; CHECK: ud2
+; CHECK: movq -[[arg_slot]](%rbp), %rax # 8-byte Reload
+; CHECK: retq
+
+; CHECK: "?catch$3@?0?reload_out_of_pad@4HA":
+; CHECK-NOT: Reload
+; CHECK: retq
+
+define i8* @spill_in_pad() #0 personality i32 (...)* @__CxxFrameHandler3 {
+assertPassed:
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+catch:
+ %cp = catchpad within %cs [i8* null, i32 0, i8* null]
+ %val = call i8* @getval() [ "funclet"(token %cp) ]
+ catchret from %cp to label %return
+
+unreachable:
+ unreachable
+
+catch.dispatch:
+ %cs = catchswitch within none [label %catch] unwind to caller
+
+return:
+ ret i8* %val
+}
+
+; CHECK-LABEL: spill_in_pad: # @spill_in_pad
+; CHECK: callq throw
+; CHECK: ud2
+; CHECK: movq -[[val_slot:[0-9]+]](%rbp), %rax # 8-byte Reload
+; CHECK: retq
+
+; CHECK: "?catch$3@?0?spill_in_pad@4HA":
+; CHECK: callq getval
+; CHECK: movq %rax, -[[val_slot]](%rbp) # 8-byte Spill
+; CHECK: retq
+
+attributes #0 = { uwtable }
diff --git a/test/CodeGen/X86/cfstring.ll b/test/CodeGen/X86/cfstring.ll
index 3eeb8d2890cc..84032d045fb8 100644
--- a/test/CodeGen/X86/cfstring.ll
+++ b/test/CodeGen/X86/cfstring.ll
@@ -10,7 +10,7 @@
; CHECK-NEXT: L_.str3:
; CHECK: .section __DATA,__cfstring
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: L__unnamed_cfstring_4:
; CHECK-NEXT: .quad ___CFConstantStringClassReference
; CHECK-NEXT: .long 1992
diff --git a/test/CodeGen/X86/cleanuppad-inalloca.ll b/test/CodeGen/X86/cleanuppad-inalloca.ll
index 2e34ada52e6b..c0660fee2f1a 100644
--- a/test/CodeGen/X86/cleanuppad-inalloca.ll
+++ b/test/CodeGen/X86/cleanuppad-inalloca.ll
@@ -38,8 +38,8 @@ ehcleanup: ; preds = %entry
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
; CHECK: calll "??0A@@QAE@XZ"
; CHECK: calll "??0A@@QAE@XZ"
; CHECK: calll _takes_two
diff --git a/test/CodeGen/X86/cleanuppad-realign.ll b/test/CodeGen/X86/cleanuppad-realign.ll
index 5a565cc1570f..314d5da07d72 100644
--- a/test/CodeGen/X86/cleanuppad-realign.ll
+++ b/test/CodeGen/X86/cleanuppad-realign.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
+; RUN: llc -mtriple=i686-pc-windows-msvc -stack-symbol-ordering=0 < %s | FileCheck --check-prefix=X86 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -stack-symbol-ordering=0 < %s | FileCheck --check-prefix=X64 %s
declare i32 @__CxxFrameHandler3(...)
declare void @Dtor(i64* %o)
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
new file mode 100644
index 000000000000..e05451b80271
--- /dev/null
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -0,0 +1,683 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; PR6455 'Clear Upper Bits' Patterns
+;
+
+define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
+; SSE-LABEL: _clearupper2xi64a:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper2xi64a:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x0 = extractelement <2 x i64> %0, i32 0
+ %x1 = extractelement <2 x i64> %0, i32 1
+ %trunc0 = trunc i64 %x0 to i32
+ %trunc1 = trunc i64 %x1 to i32
+ %ext0 = zext i32 %trunc0 to i64
+ %ext1 = zext i32 %trunc1 to i64
+ %v0 = insertelement <2 x i64> undef, i64 %ext0, i32 0
+ %v1 = insertelement <2 x i64> %v0, i64 %ext1, i32 1
+ ret <2 x i64> %v1
+}
+
+define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
+; SSE-LABEL: _clearupper4xi32a:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: _clearupper4xi32a:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper4xi32a:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %x0 = extractelement <4 x i32> %0, i32 0
+ %x1 = extractelement <4 x i32> %0, i32 1
+ %x2 = extractelement <4 x i32> %0, i32 2
+ %x3 = extractelement <4 x i32> %0, i32 3
+ %trunc0 = trunc i32 %x0 to i16
+ %trunc1 = trunc i32 %x1 to i16
+ %trunc2 = trunc i32 %x2 to i16
+ %trunc3 = trunc i32 %x3 to i16
+ %ext0 = zext i16 %trunc0 to i32
+ %ext1 = zext i16 %trunc1 to i32
+ %ext2 = zext i16 %trunc2 to i32
+ %ext3 = zext i16 %trunc3 to i32
+ %v0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
+ %v1 = insertelement <4 x i32> %v0, i32 %ext1, i32 1
+ %v2 = insertelement <4 x i32> %v1, i32 %ext2, i32 2
+ %v3 = insertelement <4 x i32> %v2, i32 %ext3, i32 3
+ ret <4 x i32> %v3
+}
+
+define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind {
+; SSE-LABEL: _clearupper8xi16a:
+; SSE: # BB#0:
+; SSE-NEXT: pextrw $1, %xmm0, %eax
+; SSE-NEXT: pextrw $2, %xmm0, %r9d
+; SSE-NEXT: pextrw $3, %xmm0, %edx
+; SSE-NEXT: pextrw $4, %xmm0, %r8d
+; SSE-NEXT: pextrw $5, %xmm0, %edi
+; SSE-NEXT: pextrw $6, %xmm0, %esi
+; SSE-NEXT: pextrw $7, %xmm0, %ecx
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: movd %edx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: movd %edi, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: movd %esi, %xmm1
+; SSE-NEXT: movd %r9d, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: movd %r8d, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper8xi16a:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
+; AVX-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-NEXT: vpextrw $3, %xmm0, %edx
+; AVX-NEXT: vpextrw $4, %xmm0, %esi
+; AVX-NEXT: vpextrw $5, %xmm0, %edi
+; AVX-NEXT: vpextrw $6, %xmm0, %r8d
+; AVX-NEXT: vpextrw $7, %xmm0, %r9d
+; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $3, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, %r8d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x0 = extractelement <8 x i16> %0, i32 0
+ %x1 = extractelement <8 x i16> %0, i32 1
+ %x2 = extractelement <8 x i16> %0, i32 2
+ %x3 = extractelement <8 x i16> %0, i32 3
+ %x4 = extractelement <8 x i16> %0, i32 4
+ %x5 = extractelement <8 x i16> %0, i32 5
+ %x6 = extractelement <8 x i16> %0, i32 6
+ %x7 = extractelement <8 x i16> %0, i32 7
+ %trunc0 = trunc i16 %x0 to i8
+ %trunc1 = trunc i16 %x1 to i8
+ %trunc2 = trunc i16 %x2 to i8
+ %trunc3 = trunc i16 %x3 to i8
+ %trunc4 = trunc i16 %x4 to i8
+ %trunc5 = trunc i16 %x5 to i8
+ %trunc6 = trunc i16 %x6 to i8
+ %trunc7 = trunc i16 %x7 to i8
+ %ext0 = zext i8 %trunc0 to i16
+ %ext1 = zext i8 %trunc1 to i16
+ %ext2 = zext i8 %trunc2 to i16
+ %ext3 = zext i8 %trunc3 to i16
+ %ext4 = zext i8 %trunc4 to i16
+ %ext5 = zext i8 %trunc5 to i16
+ %ext6 = zext i8 %trunc6 to i16
+ %ext7 = zext i8 %trunc7 to i16
+ %v0 = insertelement <8 x i16> undef, i16 %ext0, i32 0
+ %v1 = insertelement <8 x i16> %v0, i16 %ext1, i32 1
+ %v2 = insertelement <8 x i16> %v1, i16 %ext2, i32 2
+ %v3 = insertelement <8 x i16> %v2, i16 %ext3, i32 3
+ %v4 = insertelement <8 x i16> %v3, i16 %ext4, i32 4
+ %v5 = insertelement <8 x i16> %v4, i16 %ext5, i32 5
+ %v6 = insertelement <8 x i16> %v5, i16 %ext6, i32 6
+ %v7 = insertelement <8 x i16> %v6, i16 %ext7, i32 7
+ ret <8 x i16> %v7
+}
+
+define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
+; SSE-LABEL: _clearupper16xi8a:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movd %esi, %xmm0
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: movd %edx, %xmm0
+; SSE-NEXT: movd %esi, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movd %edi, %xmm0
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE-NEXT: movd %edx, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT: movd %r9d, %xmm0
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movd %r8d, %xmm0
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper16xi8a:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $3, %xmm0, %eax
+; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $5, %xmm0, %eax
+; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $7, %xmm0, %eax
+; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $9, %xmm0, %eax
+; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $13, %xmm0, %eax
+; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $15, %xmm0, %eax
+; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x0 = extractelement <16 x i8> %0, i32 0
+ %x1 = extractelement <16 x i8> %0, i32 1
+ %x2 = extractelement <16 x i8> %0, i32 2
+ %x3 = extractelement <16 x i8> %0, i32 3
+ %x4 = extractelement <16 x i8> %0, i32 4
+ %x5 = extractelement <16 x i8> %0, i32 5
+ %x6 = extractelement <16 x i8> %0, i32 6
+ %x7 = extractelement <16 x i8> %0, i32 7
+ %x8 = extractelement <16 x i8> %0, i32 8
+ %x9 = extractelement <16 x i8> %0, i32 9
+ %x10 = extractelement <16 x i8> %0, i32 10
+ %x11 = extractelement <16 x i8> %0, i32 11
+ %x12 = extractelement <16 x i8> %0, i32 12
+ %x13 = extractelement <16 x i8> %0, i32 13
+ %x14 = extractelement <16 x i8> %0, i32 14
+ %x15 = extractelement <16 x i8> %0, i32 15
+ %trunc0 = trunc i8 %x0 to i4
+ %trunc1 = trunc i8 %x1 to i4
+ %trunc2 = trunc i8 %x2 to i4
+ %trunc3 = trunc i8 %x3 to i4
+ %trunc4 = trunc i8 %x4 to i4
+ %trunc5 = trunc i8 %x5 to i4
+ %trunc6 = trunc i8 %x6 to i4
+ %trunc7 = trunc i8 %x7 to i4
+ %trunc8 = trunc i8 %x8 to i4
+ %trunc9 = trunc i8 %x9 to i4
+ %trunc10 = trunc i8 %x10 to i4
+ %trunc11 = trunc i8 %x11 to i4
+ %trunc12 = trunc i8 %x12 to i4
+ %trunc13 = trunc i8 %x13 to i4
+ %trunc14 = trunc i8 %x14 to i4
+ %trunc15 = trunc i8 %x15 to i4
+ %ext0 = zext i4 %trunc0 to i8
+ %ext1 = zext i4 %trunc1 to i8
+ %ext2 = zext i4 %trunc2 to i8
+ %ext3 = zext i4 %trunc3 to i8
+ %ext4 = zext i4 %trunc4 to i8
+ %ext5 = zext i4 %trunc5 to i8
+ %ext6 = zext i4 %trunc6 to i8
+ %ext7 = zext i4 %trunc7 to i8
+ %ext8 = zext i4 %trunc8 to i8
+ %ext9 = zext i4 %trunc9 to i8
+ %ext10 = zext i4 %trunc10 to i8
+ %ext11 = zext i4 %trunc11 to i8
+ %ext12 = zext i4 %trunc12 to i8
+ %ext13 = zext i4 %trunc13 to i8
+ %ext14 = zext i4 %trunc14 to i8
+ %ext15 = zext i4 %trunc15 to i8
+ %v0 = insertelement <16 x i8> undef, i8 %ext0, i32 0
+ %v1 = insertelement <16 x i8> %v0, i8 %ext1, i32 1
+ %v2 = insertelement <16 x i8> %v1, i8 %ext2, i32 2
+ %v3 = insertelement <16 x i8> %v2, i8 %ext3, i32 3
+ %v4 = insertelement <16 x i8> %v3, i8 %ext4, i32 4
+ %v5 = insertelement <16 x i8> %v4, i8 %ext5, i32 5
+ %v6 = insertelement <16 x i8> %v5, i8 %ext6, i32 6
+ %v7 = insertelement <16 x i8> %v6, i8 %ext7, i32 7
+ %v8 = insertelement <16 x i8> %v7, i8 %ext8, i32 8
+ %v9 = insertelement <16 x i8> %v8, i8 %ext9, i32 9
+ %v10 = insertelement <16 x i8> %v9, i8 %ext10, i32 10
+ %v11 = insertelement <16 x i8> %v10, i8 %ext11, i32 11
+ %v12 = insertelement <16 x i8> %v11, i8 %ext12, i32 12
+ %v13 = insertelement <16 x i8> %v12, i8 %ext13, i32 13
+ %v14 = insertelement <16 x i8> %v13, i8 %ext14, i32 14
+ %v15 = insertelement <16 x i8> %v14, i8 %ext15, i32 15
+ ret <16 x i8> %v15
+}
+
+define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
+; SSE-LABEL: _clearupper2xi64b:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: _clearupper2xi64b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper2xi64b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
+ %x32 = bitcast <2 x i64> %0 to <4 x i32>
+ %r0 = insertelement <4 x i32> %x32, i32 zeroinitializer, i32 1
+ %r1 = insertelement <4 x i32> %r0, i32 zeroinitializer, i32 3
+ %r = bitcast <4 x i32> %r1 to <2 x i64>
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
+; SSE-LABEL: _clearupper4xi32b:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pinsrw $1, %eax, %xmm0
+; SSE-NEXT: pinsrw $3, %eax, %xmm0
+; SSE-NEXT: pinsrw $5, %eax, %xmm0
+; SSE-NEXT: pinsrw $7, %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper4xi32b:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: retq
+ %x16 = bitcast <4 x i32> %0 to <8 x i16>
+ %r0 = insertelement <8 x i16> %x16, i16 zeroinitializer, i32 1
+ %r1 = insertelement <8 x i16> %r0, i16 zeroinitializer, i32 3
+ %r2 = insertelement <8 x i16> %r1, i16 zeroinitializer, i32 5
+ %r3 = insertelement <8 x i16> %r2, i16 zeroinitializer, i32 7
+ %r = bitcast <8 x i16> %r3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
+; SSE-LABEL: _clearupper8xi16b:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psllw $8, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslld $24, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psllq $40, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psllq $56, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper8xi16b:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x8 = bitcast <8 x i16> %0 to <16 x i8>
+ %r0 = insertelement <16 x i8> %x8, i8 zeroinitializer, i32 1
+ %r1 = insertelement <16 x i8> %r0, i8 zeroinitializer, i32 3
+ %r2 = insertelement <16 x i8> %r1, i8 zeroinitializer, i32 5
+ %r3 = insertelement <16 x i8> %r2, i8 zeroinitializer, i32 7
+ %r4 = insertelement <16 x i8> %r3, i8 zeroinitializer, i32 9
+ %r5 = insertelement <16 x i8> %r4, i8 zeroinitializer, i32 11
+ %r6 = insertelement <16 x i8> %r5, i8 zeroinitializer, i32 13
+ %r7 = insertelement <16 x i8> %r6, i8 zeroinitializer, i32 15
+ %r = bitcast <16 x i8> %r7 to <8 x i16>
+ ret <8 x i16> %r
+}
+
+define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
+; SSE-LABEL: _clearupper16xi8b:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: movd %xmm0, %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: movq %rcx, %r9
+; SSE-NEXT: movq %rcx, %r10
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: movq %rcx, %rdx
+; SSE-NEXT: movq %rcx, %rsi
+; SSE-NEXT: movq %rcx, %rdi
+; SSE-NEXT: andb $15, %cl
+; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movd %xmm1, %rcx
+; SSE-NEXT: shrq $56, %rdi
+; SSE-NEXT: andb $15, %dil
+; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %r11
+; SSE-NEXT: shrq $48, %rsi
+; SSE-NEXT: andb $15, %sil
+; SSE-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %r14
+; SSE-NEXT: shrq $40, %rdx
+; SSE-NEXT: andb $15, %dl
+; SSE-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rdx
+; SSE-NEXT: shrq $32, %rax
+; SSE-NEXT: andb $15, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: shrq $24, %r10
+; SSE-NEXT: andb $15, %r10b
+; SSE-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rdi
+; SSE-NEXT: shrq $16, %r9
+; SSE-NEXT: andb $15, %r9b
+; SSE-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rsi
+; SSE-NEXT: shrq $8, %r8
+; SSE-NEXT: andb $15, %r8b
+; SSE-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rbx
+; SSE-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: andb $15, %cl
+; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $56, %rbx
+; SSE-NEXT: andb $15, %bl
+; SSE-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $48, %rsi
+; SSE-NEXT: andb $15, %sil
+; SSE-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $40, %rdi
+; SSE-NEXT: andb $15, %dil
+; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $32, %rax
+; SSE-NEXT: andb $15, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $24, %rdx
+; SSE-NEXT: andb $15, %dl
+; SSE-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $16, %r14
+; SSE-NEXT: andb $15, %r14b
+; SSE-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $8, %r11
+; SSE-NEXT: andb $15, %r11b
+; SSE-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper16xi8b:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %r13
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: movq %rcx, %r9
+; AVX-NEXT: movq %rcx, %r10
+; AVX-NEXT: movq %rcx, %r11
+; AVX-NEXT: movq %rcx, %r14
+; AVX-NEXT: movq %rcx, %r15
+; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: movq %rdx, %r13
+; AVX-NEXT: movq %rdx, %rdi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: movq %rdx, %rsi
+; AVX-NEXT: movq %rdx, %rbx
+; AVX-NEXT: movq %rdx, %rbp
+; AVX-NEXT: andb $15, %dl
+; AVX-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq %rcx, %rdx
+; AVX-NEXT: andb $15, %cl
+; AVX-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $56, %rbp
+; AVX-NEXT: andb $15, %bpl
+; AVX-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $48, %rbx
+; AVX-NEXT: andb $15, %bl
+; AVX-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $40, %rsi
+; AVX-NEXT: andb $15, %sil
+; AVX-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $32, %rax
+; AVX-NEXT: andb $15, %al
+; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $24, %rdi
+; AVX-NEXT: andb $15, %dil
+; AVX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $16, %r13
+; AVX-NEXT: andb $15, %r13b
+; AVX-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $8, %r12
+; AVX-NEXT: andb $15, %r12b
+; AVX-NEXT: movb %r12b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $56, %rdx
+; AVX-NEXT: andb $15, %dl
+; AVX-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $48, %r15
+; AVX-NEXT: andb $15, %r15b
+; AVX-NEXT: movb %r15b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $40, %r14
+; AVX-NEXT: andb $15, %r14b
+; AVX-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $32, %r11
+; AVX-NEXT: andb $15, %r11b
+; AVX-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $24, %r10
+; AVX-NEXT: andb $15, %r10b
+; AVX-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $16, %r9
+; AVX-NEXT: andb $15, %r9b
+; AVX-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $8, %r8
+; AVX-NEXT: andb $15, %r8b
+; AVX-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r12
+; AVX-NEXT: popq %r13
+; AVX-NEXT: popq %r14
+; AVX-NEXT: popq %r15
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: retq
+ %x4 = bitcast <16 x i8> %0 to <32 x i4>
+ %r0 = insertelement <32 x i4> %x4, i4 zeroinitializer, i32 1
+ %r1 = insertelement <32 x i4> %r0, i4 zeroinitializer, i32 3
+ %r2 = insertelement <32 x i4> %r1, i4 zeroinitializer, i32 5
+ %r3 = insertelement <32 x i4> %r2, i4 zeroinitializer, i32 7
+ %r4 = insertelement <32 x i4> %r3, i4 zeroinitializer, i32 9
+ %r5 = insertelement <32 x i4> %r4, i4 zeroinitializer, i32 11
+ %r6 = insertelement <32 x i4> %r5, i4 zeroinitializer, i32 13
+ %r7 = insertelement <32 x i4> %r6, i4 zeroinitializer, i32 15
+ %r8 = insertelement <32 x i4> %r7, i4 zeroinitializer, i32 17
+ %r9 = insertelement <32 x i4> %r8, i4 zeroinitializer, i32 19
+ %r10 = insertelement <32 x i4> %r9, i4 zeroinitializer, i32 21
+ %r11 = insertelement <32 x i4> %r10, i4 zeroinitializer, i32 23
+ %r12 = insertelement <32 x i4> %r11, i4 zeroinitializer, i32 25
+ %r13 = insertelement <32 x i4> %r12, i4 zeroinitializer, i32 27
+ %r14 = insertelement <32 x i4> %r13, i4 zeroinitializer, i32 29
+ %r15 = insertelement <32 x i4> %r14, i4 zeroinitializer, i32 31
+ %r = bitcast <32 x i4> %r15 to <16 x i8>
+ ret <16 x i8> %r
+}
+
+define <2 x i64> @_clearupper2xi64c(<2 x i64>) nounwind {
+; SSE-LABEL: _clearupper2xi64c:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: _clearupper2xi64c:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper2xi64c:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
+ %r = and <2 x i64> <i64 4294967295, i64 4294967295>, %0
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @_clearupper4xi32c(<4 x i32>) nounwind {
+; SSE-LABEL: _clearupper4xi32c:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper4xi32c:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: retq
+ %r = and <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, %0
+ ret <4 x i32> %r
+}
+
+define <8 x i16> @_clearupper8xi16c(<8 x i16>) nounwind {
+; SSE-LABEL: _clearupper8xi16c:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper8xi16c:
+; AVX: # BB#0:
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %r = and <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>, %0
+ ret <8 x i16> %r
+}
+
+define <16 x i8> @_clearupper16xi8c(<16 x i8>) nounwind {
+; SSE-LABEL: _clearupper16xi8c:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper16xi8c:
+; AVX: # BB#0:
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %r = and <16 x i8> <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>, %0
+ ret <16 x i8> %r
+}
diff --git a/test/CodeGen/X86/clz.ll b/test/CodeGen/X86/clz.ll
index 4a094480c931..685b2588bf52 100644
--- a/test/CodeGen/X86/clz.ll
+++ b/test/CodeGen/X86/clz.ll
@@ -1,160 +1,753 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=X32-CLZ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=X64-CLZ
declare i8 @llvm.cttz.i8(i8, i1)
declare i16 @llvm.cttz.i16(i16, i1)
declare i32 @llvm.cttz.i32(i32, i1)
declare i64 @llvm.cttz.i64(i64, i1)
+
declare i8 @llvm.ctlz.i8(i8, i1)
declare i16 @llvm.ctlz.i16(i16, i1)
declare i32 @llvm.ctlz.i32(i32, i1)
declare i64 @llvm.ctlz.i64(i64, i1)
define i8 @cttz_i8(i8 %x) {
-; CHECK-LABEL: cttz_i8:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: bsfl %eax, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: cttz_i8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: bsfl %eax, %eax
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: bsfl %eax, %eax
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i8:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: tzcntl %eax, %eax
+; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i8:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: tzcntl %eax, %eax
+; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: retq
%tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true )
ret i8 %tmp
}
define i16 @cttz_i16(i16 %x) {
-; CHECK-LABEL: cttz_i16:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsfw %di, %ax
-; CHECK-NEXT: retq
+; X32-LABEL: cttz_i16:
+; X32: # BB#0:
+; X32-NEXT: bsfw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i16:
+; X64: # BB#0:
+; X64-NEXT: bsfw %di, %ax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i16:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i16:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntw %di, %ax
+; X64-CLZ-NEXT: retq
%tmp = call i16 @llvm.cttz.i16( i16 %x, i1 true )
ret i16 %tmp
}
define i32 @cttz_i32(i32 %x) {
-; CHECK-LABEL: cttz_i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsfl %edi, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: cttz_i32:
+; X32: # BB#0:
+; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i32:
+; X64: # BB#0:
+; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i32:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i32:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
%tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true )
ret i32 %tmp
}
define i64 @cttz_i64(i64 %x) {
-; CHECK-LABEL: cttz_i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsfq %rdi, %rax
-; CHECK-NEXT: retq
+; X32-LABEL: cttz_i64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testl %eax, %eax
+; X32-NEXT: jne .LBB3_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: .LBB3_1:
+; X32-NEXT: bsfl %eax, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i64:
+; X64: # BB#0:
+; X64-NEXT: bsfq %rdi, %rax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i64:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: testl %eax, %eax
+; X32-CLZ-NEXT: jne .LBB3_1
+; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: addl $32, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+; X32-CLZ-NEXT: .LBB3_1:
+; X32-CLZ-NEXT: tzcntl %eax, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i64:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntq %rdi, %rax
+; X64-CLZ-NEXT: retq
%tmp = call i64 @llvm.cttz.i64( i64 %x, i1 true )
ret i64 %tmp
}
define i8 @ctlz_i8(i8 %x) {
-; CHECK-LABEL: ctlz_i8:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: bsrl %eax, %eax
-; CHECK-NEXT: xorl $7, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: ctlz_i8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: bsrl %eax, %eax
+; X32-NEXT: xorl $7, %eax
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: bsrl %eax, %eax
+; X64-NEXT: xorl $7, %eax
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i8:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: addl $-24, %eax
+; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i8:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: lzcntl %eax, %eax
+; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: retq
%tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true )
ret i8 %tmp2
}
define i16 @ctlz_i16(i16 %x) {
-; CHECK-LABEL: ctlz_i16:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsrw %di, %ax
-; CHECK-NEXT: xorl $15, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: ctlz_i16:
+; X32: # BB#0:
+; X32-NEXT: bsrw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: xorl $15, %eax
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i16:
+; X64: # BB#0:
+; X64-NEXT: bsrw %di, %ax
+; X64-NEXT: xorl $15, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i16:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i16:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntw %di, %ax
+; X64-CLZ-NEXT: retq
%tmp2 = call i16 @llvm.ctlz.i16( i16 %x, i1 true )
ret i16 %tmp2
}
define i32 @ctlz_i32(i32 %x) {
-; CHECK-LABEL: ctlz_i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: ctlz_i32:
+; X32: # BB#0:
+; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i32:
+; X64: # BB#0:
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i32:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i32:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
%tmp = call i32 @llvm.ctlz.i32( i32 %x, i1 true )
ret i32 %tmp
}
define i64 @ctlz_i64(i64 %x) {
-; CHECK-LABEL: ctlz_i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsrq %rdi, %rax
-; CHECK-NEXT: xorq $63, %rax
-; CHECK-NEXT: retq
+; X32-LABEL: ctlz_i64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testl %eax, %eax
+; X32-NEXT: jne .LBB7_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: .LBB7_1:
+; X32-NEXT: bsrl %eax, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i64:
+; X64: # BB#0:
+; X64-NEXT: bsrq %rdi, %rax
+; X64-NEXT: xorq $63, %rax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i64:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: testl %eax, %eax
+; X32-CLZ-NEXT: jne .LBB7_1
+; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: addl $32, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+; X32-CLZ-NEXT: .LBB7_1:
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i64:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntq %rdi, %rax
+; X64-CLZ-NEXT: retq
%tmp = call i64 @llvm.ctlz.i64( i64 %x, i1 true )
ret i64 %tmp
}
-define i32 @ctlz_i32_zero_test(i32 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i8 @ctlz_i8_zero_test(i8 %n) {
+; X32-LABEL: ctlz_i8_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movb $8, %al
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: je .LBB8_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: bsrl %eax, %eax
+; X32-NEXT: xorl $7, %eax
+; X32-NEXT: .LBB8_2: # %cond.end
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i8_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movb $8, %al
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: je .LBB8_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: bsrl %eax, %eax
+; X64-NEXT: xorl $7, %eax
+; X64-NEXT: .LBB8_2: # %cond.end
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i8_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: addl $-24, %eax
+; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i8_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: lzcntl %eax, %eax
+; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false)
+ ret i8 %tmp1
+}
-; CHECK-LABEL: ctlz_i32_zero_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: je .LBB8_2
-; CHECK-NEXT: # BB#1: # %cond.false
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: .LBB8_2: # %cond.end
-; CHECK-NEXT: retq
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i16 @ctlz_i16_zero_test(i16 %n) {
+; X32-LABEL: ctlz_i16_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movw $16, %ax
+; X32-NEXT: testw %cx, %cx
+; X32-NEXT: je .LBB9_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsrw %cx, %ax
+; X32-NEXT: xorl $15, %eax
+; X32-NEXT: .LBB9_2: # %cond.end
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i16_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movw $16, %ax
+; X64-NEXT: testw %di, %di
+; X64-NEXT: je .LBB9_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrw %di, %ax
+; X64-NEXT: xorl $15, %eax
+; X64-NEXT: .LBB9_2: # %cond.end
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i16_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i16_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntw %di, %ax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i16 @llvm.ctlz.i16(i16 %n, i1 false)
+ ret i16 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i32 @ctlz_i32_zero_test(i32 %n) {
+; X32-LABEL: ctlz_i32_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: je .LBB10_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsrl %ecx, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: .LBB10_2: # %cond.end
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i32_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: je .LBB10_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: .LBB10_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i32_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i32_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
%tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
ret i32 %tmp1
}
-define i32 @ctlz_i32_fold_cmov(i32 %n) {
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i64 @ctlz_i64_zero_test(i64 %n) {
+; X32-LABEL: ctlz_i64_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: bsrl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl $63, %eax
+; X32-NEXT: je .LBB11_2
+; X32-NEXT: # BB#1:
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: .LBB11_2:
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: jne .LBB11_3
+; X32-NEXT: # BB#4:
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: .LBB11_3:
+; X32-NEXT: bsrl %ecx, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i64_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $64, %eax
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: je .LBB11_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrq %rdi, %rax
+; X64-NEXT: xorq $63, %rax
+; X64-NEXT: .LBB11_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i64_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: testl %eax, %eax
+; X32-CLZ-NEXT: jne .LBB11_1
+; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: addl $32, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+; X32-CLZ-NEXT: .LBB11_1:
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i64_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntq %rdi, %rax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i64 @llvm.ctlz.i64(i64 %n, i1 false)
+ ret i64 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i8 @cttz_i8_zero_test(i8 %n) {
+; X32-LABEL: cttz_i8_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movb $8, %al
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: je .LBB12_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: bsfl %eax, %eax
+; X32-NEXT: .LBB12_2: # %cond.end
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i8_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movb $8, %al
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: je .LBB12_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: bsfl %eax, %eax
+; X64-NEXT: .LBB12_2: # %cond.end
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i8_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: orl $256, %eax # imm = 0x100
+; X32-CLZ-NEXT: tzcntl %eax, %eax
+; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i8_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: orl $256, %eax # imm = 0x100
+; X64-CLZ-NEXT: tzcntl %eax, %eax
+; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false)
+ ret i8 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i16 @cttz_i16_zero_test(i16 %n) {
+; X32-LABEL: cttz_i16_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movw $16, %ax
+; X32-NEXT: testw %cx, %cx
+; X32-NEXT: je .LBB13_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsfw %cx, %ax
+; X32-NEXT: .LBB13_2: # %cond.end
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i16_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movw $16, %ax
+; X64-NEXT: testw %di, %di
+; X64-NEXT: je .LBB13_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsfw %di, %ax
+; X64-NEXT: .LBB13_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i16_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i16_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntw %di, %ax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i16 @llvm.cttz.i16(i16 %n, i1 false)
+ ret i16 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i32 @cttz_i32_zero_test(i32 %n) {
+; X32-LABEL: cttz_i32_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: je .LBB14_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsfl %ecx, %eax
+; X32-NEXT: .LBB14_2: # %cond.end
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i32_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: je .LBB14_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: .LBB14_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i32_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i32_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i32 @llvm.cttz.i32(i32 %n, i1 false)
+ ret i32 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i64 @cttz_i64_zero_test(i64 %n) {
+; X32-LABEL: cttz_i64_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: bsfl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: je .LBB15_2
+; X32-NEXT: # BB#1:
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: .LBB15_2:
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: jne .LBB15_3
+; X32-NEXT: # BB#4:
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: .LBB15_3:
+; X32-NEXT: bsfl %ecx, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i64_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $64, %eax
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: je .LBB15_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsfq %rdi, %rax
+; X64-NEXT: .LBB15_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i64_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: testl %eax, %eax
+; X32-CLZ-NEXT: jne .LBB15_1
+; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: addl $32, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+; X32-CLZ-NEXT: .LBB15_1:
+; X32-CLZ-NEXT: tzcntl %eax, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i64_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntq %rdi, %rax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i64 @llvm.cttz.i64(i64 %n, i1 false)
+ ret i64 %tmp1
+}
+
; Don't generate the cmovne when the source is known non-zero (and bsr would
; not set ZF).
; rdar://9490949
; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
; codegen doesn't know how to delete the movl and je.
-
-; CHECK-LABEL: ctlz_i32_fold_cmov:
-; CHECK: # BB#0:
-; CHECK-NEXT: orl $1, %edi
-; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: je .LBB9_2
-; CHECK-NEXT: # BB#1: # %cond.false
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: .LBB9_2: # %cond.end
-; CHECK-NEXT: retq
+define i32 @ctlz_i32_fold_cmov(i32 %n) {
+; X32-LABEL: ctlz_i32_fold_cmov:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: orl $1, %ecx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: je .LBB16_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsrl %ecx, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: .LBB16_2: # %cond.end
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i32_fold_cmov:
+; X64: # BB#0:
+; X64-NEXT: orl $1, %edi
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: je .LBB16_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: .LBB16_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i32_fold_cmov:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: orl $1, %eax
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i32_fold_cmov:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: orl $1, %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
%or = or i32 %n, 1
%tmp1 = call i32 @llvm.ctlz.i32(i32 %or, i1 false)
ret i32 %tmp1
}
-define i32 @ctlz_bsr(i32 %n) {
; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
; the most significant bit, which is what 'bsr' does natively.
-
-; CHECK-LABEL: ctlz_bsr:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: retq
+; FIXME: We should probably select BSR instead of LZCNT in these circumstances.
+define i32 @ctlz_bsr(i32 %n) {
+; X32-LABEL: ctlz_bsr:
+; X32: # BB#0:
+; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_bsr:
+; X64: # BB#0:
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_bsr:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: xorl $31, %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_bsr:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: xorl $31, %eax
+; X64-CLZ-NEXT: retq
%ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
%bsr = xor i32 %ctlz, 31
ret i32 %bsr
}
-define i32 @ctlz_bsr_zero_test(i32 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
; codegen doesn't know how to combine the $32 and $31 into $63.
-
-; CHECK-LABEL: ctlz_bsr_zero_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: je .LBB11_2
-; CHECK-NEXT: # BB#1: # %cond.false
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: .LBB11_2: # %cond.end
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: retq
+define i32 @ctlz_bsr_zero_test(i32 %n) {
+; X32-LABEL: ctlz_bsr_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: je .LBB18_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsrl %ecx, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: .LBB18_2: # %cond.end
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_bsr_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: je .LBB18_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: .LBB18_2: # %cond.end
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_bsr_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: xorl $31, %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_bsr_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: xorl $31, %eax
+; X64-CLZ-NEXT: retq
%ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
%bsr = xor i32 %ctlz, 31
ret i32 %bsr
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
index 909440800a56..acb5a2bb51f1 100644
--- a/test/CodeGen/X86/cmov-into-branch.ll
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -1,63 +1,135 @@
-; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-; cmp with single-use load, should not form cmov.
+; cmp with single-use load, should not form branch.
define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: ucomisd (%rdi), %xmm0
+; CHECK-NEXT: cmovbel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+;
%load = load double, double* %b, align 8
%cmp = fcmp olt double %load, %a
%cond = select i1 %cmp, i32 %x, i32 %y
ret i32 %cond
-; CHECK-LABEL: test1:
-; CHECK: ucomisd
-; CHECK-NOT: cmov
-; CHECK: j
-; CHECK-NOT: cmov
}
; Sanity check: no load.
define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovbel %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+;
%cmp = fcmp ogt double %a, %b
%cond = select i1 %cmp, i32 %x, i32 %y
ret i32 %cond
-; CHECK-LABEL: test2:
-; CHECK: ucomisd
-; CHECK: cmov
-}
-
-; Multiple uses of %a, should not form cmov.
-define i32 @test3(i32 %a, i32* nocapture %b, i32 %x) {
- %load = load i32, i32* %b, align 4
- %cmp = icmp ult i32 %load, %a
- %cond = select i1 %cmp, i32 %a, i32 %x
- ret i32 %cond
-; CHECK-LABEL: test3:
-; CHECK: cmpl
-; CHECK-NOT: cmov
-; CHECK: j
-; CHECK-NOT: cmov
}
; Multiple uses of the load.
define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl (%rsi), %eax
+; CHECK-NEXT: cmpl %edi, %eax
+; CHECK-NEXT: cmovael %ecx, %edx
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+;
%load = load i32, i32* %b, align 4
%cmp = icmp ult i32 %load, %a
%cond = select i1 %cmp, i32 %x, i32 %y
%add = add i32 %cond, %load
ret i32 %add
-; CHECK-LABEL: test4:
-; CHECK: cmpl
-; CHECK: cmov
}
; Multiple uses of the cmp.
define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpl %edi, (%rsi)
+; CHECK-NEXT: cmoval %edi, %ecx
+; CHECK-NEXT: cmovael %edx, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retq
+;
%load = load i32, i32* %b, align 4
%cmp = icmp ult i32 %load, %a
%cmp1 = icmp ugt i32 %load, %a
%cond = select i1 %cmp1, i32 %a, i32 %y
%cond5 = select i1 %cmp, i32 %cond, i32 %x
ret i32 %cond5
-; CHECK-LABEL: test5:
-; CHECK: cmpl
-; CHECK: cmov
-; CHECK: cmov
}
+
+; If a select is not obviously predictable, don't turn it into a branch.
+define i32 @weighted_select1(i32 %a, i32 %b) {
+; CHECK-LABEL: weighted_select1:
+; CHECK: # BB#0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: cmovnel %edi, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
+ ret i32 %sel
+}
+
+; If a select is obviously predictable, turn it into a branch.
+define i32 @weighted_select2(i32 %a, i32 %b) {
+; CHECK-LABEL: weighted_select2:
+; CHECK: # BB#0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jne [[LABEL_BB5:.*]]
+; CHECK: movl %esi, %edi
+; CHECK-NEXT: [[LABEL_BB5]]
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !1
+ ret i32 %sel
+}
+
+; Note the reversed profile weights: it doesn't matter if it's
+; obviously true or obviously false.
+; Either one should become a branch rather than conditional move.
+; TODO: But likely true vs. likely false should affect basic block placement?
+define i32 @weighted_select3(i32 %a, i32 %b) {
+; CHECK-LABEL: weighted_select3:
+; CHECK: # BB#0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jne [[LABEL_BB6:.*]]
+; CHECK: movl %esi, %edi
+; CHECK-NEXT: [[LABEL_BB6]]
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2
+ ret i32 %sel
+}
+
+; Weightlessness is no reason to die.
+define i32 @unweighted_select(i32 %a, i32 %b) {
+; CHECK-LABEL: unweighted_select:
+; CHECK: # BB#0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: cmovnel %edi, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !3
+ ret i32 %sel
+}
+
+!0 = !{!"branch_weights", i32 1, i32 99}
+!1 = !{!"branch_weights", i32 1, i32 100}
+!2 = !{!"branch_weights", i32 100, i32 1}
+!3 = !{!"branch_weights", i32 0, i32 0}
+
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index f2f36b15d0c5..9acc9ea4fb18 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -120,8 +120,8 @@ declare i32 @printf(i8* nocapture, ...) nounwind
define i32 @test5(i32* nocapture %P) nounwind readonly {
entry:
; CHECK-LABEL: test5:
+; CHECK: xorl %eax, %eax
; CHECK: setg %al
-; CHECK: movzbl %al, %eax
; CHECK: orl $-2, %eax
; CHECK: ret
@@ -134,8 +134,8 @@ entry:
define i32 @test6(i32* nocapture %P) nounwind readonly {
entry:
; CHECK-LABEL: test6:
+; CHECK: xorl %eax, %eax
; CHECK: setl %al
-; CHECK: movzbl %al, %eax
; CHECK: leal 4(%rax,%rax,8), %eax
; CHECK: ret
%0 = load i32, i32* %P, align 4 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/cmovcmov.ll b/test/CodeGen/X86/cmovcmov.ll
index 9363d31866d7..38ba308ecff5 100644
--- a/test/CodeGen/X86/cmovcmov.ll
+++ b/test/CodeGen/X86/cmovcmov.ll
@@ -250,14 +250,14 @@ attributes #0 = { nounwind }
; CMOV-DAG: movb $20, %al
; CMOV-DAG: movb $20, %dl
; CMOV: jl [[BB0:.LBB[0-9_]+]]
-; CMOV: movb %cl, %dl
+; CMOV: movl %ecx, %edx
; CMOV: [[BB0]]:
; CMOV: jg [[BB1:.LBB[0-9_]+]]
-; CMOV: movb %dl, %al
+; CMOV: movl %edx, %eax
; CMOV: [[BB1]]:
; CMOV: testl %edi, %edi
; CMOV: je [[BB2:.LBB[0-9_]+]]
-; CMOV: movb %dl, %al
+; CMOV: movl %edx, %eax
; CMOV: [[BB2]]:
; CMOV: movb %al, g8(%rip)
; CMOV: retq
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index eb9a29011428..d24f27ddf22c 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -49,9 +49,9 @@ define i64 @test3(i64 %x) nounwind {
%r = zext i1 %t to i64
ret i64 %r
; CHECK-LABEL: test3:
+; CHECK: xorl %eax, %eax
; CHECK: testq %rdi, %rdi
; CHECK: sete %al
-; CHECK: movzbl %al, %eax
; CHECK: ret
}
@@ -60,9 +60,9 @@ define i64 @test4(i64 %x) nounwind {
%r = zext i1 %t to i64
ret i64 %r
; CHECK-LABEL: test4:
+; CHECK: xorl %eax, %eax
; CHECK: testq %rdi, %rdi
; CHECK: setle %al
-; CHECK: movzbl %al, %eax
; CHECK: ret
}
@@ -255,3 +255,30 @@ define zeroext i1 @test19(i32 %L) {
; CHECK: testl %edi, %edi
; CHECK: setns %al
}
+
+@d = global i8 0, align 1
+
+; This test failed due to incorrect handling of "shift + icmp" sequence
+define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) {
+ %bf.shl = shl i32 %bf.load, 8
+ %bf.ashr = ashr exact i32 %bf.shl, 8
+ %tobool4 = icmp ne i32 %bf.ashr, 0
+ %conv = zext i1 %tobool4 to i32
+ %conv6 = zext i8 %x1 to i32
+ %add = add nuw nsw i32 %conv, %conv6
+ %tobool7 = icmp ne i32 %add, 0
+ %frombool = zext i1 %tobool7 to i8
+ store i8 %frombool, i8* %b_addr, align 1
+ %tobool14 = icmp ne i32 %bf.shl, 0
+ %frombool15 = zext i1 %tobool14 to i8
+ store i8 %frombool15, i8* @d, align 1
+ ret void
+
+; CHECK-LABEL: test20
+; CHECK: andl
+; CHECK: setne
+; CHECK: addl
+; CHECK: setne
+; CHECK: testl
+; CHECK: setne
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
index 1665360e4990..f2b9dee91037 100644
--- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -21,9 +21,11 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; i386-NEXT: lahf
; i386-NEXT: movl %eax, [[FLAGS:%.*]]
; i386-NEXT: popl %eax
-; i386-NEXT: movl %edx, 4(%esp)
-; i386-NEXT: movl %eax, (%esp)
+; i386-NEXT: subl $8, %esp
+; i386-NEXT: pushl %edx
+; i386-NEXT: pushl %eax
; i386-NEXT: calll bar
+; i386-NEXT: addl $16, %esp
; i386-NEXT: movl [[FLAGS]], %eax
; i386-NEXT: addb $127, %al
; i386-NEXT: sahf
@@ -61,11 +63,10 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: movq %rax, %rdi
; x8664-sahf-NEXT: callq bar
-; x8664-sahf-NEXT: pushq %rax
+; RAX is dead, no need to push and pop it.
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
; x8664-sahf-NEXT: addb $127, %al
; x8664-sahf-NEXT: sahf
-; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: jne
%cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
@@ -166,11 +167,10 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
; x8664-sahf-LABEL: test_feed_cmov:
; x8664-sahf: cmpxchgl
-; x8664-sahf: pushq %rax
+; RAX is dead, do not push or pop it.
; x8664-sahf-NEXT: seto %al
; x8664-sahf-NEXT: lahf
; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]]
-; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: callq foo
; x8664-sahf-NEXT: pushq %rax
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
diff --git a/test/CodeGen/X86/cmpxchg-i1.ll b/test/CodeGen/X86/cmpxchg-i1.ll
index 5f5869f78bba..97e4472b0890 100644
--- a/test/CodeGen/X86/cmpxchg-i1.ll
+++ b/test/CodeGen/X86/cmpxchg-i1.ll
@@ -34,7 +34,7 @@ define i64 @cmpxchg_sext(i32* %addr, i32 %desired, i32 %new) {
; CHECK-LABEL: cmpxchg_sext:
; CHECK-DAG: cmpxchgl
; CHECK-NOT: cmpl
-; CHECK: sete %al
+; CHECK: sete %cl
; CHECK: retq
%pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
%success = extractvalue { i32, i1 } %pair, 1
@@ -44,10 +44,10 @@ define i64 @cmpxchg_sext(i32* %addr, i32 %desired, i32 %new) {
define i32 @cmpxchg_zext(i32* %addr, i32 %desired, i32 %new) {
; CHECK-LABEL: cmpxchg_zext:
+; CHECK: xorl %e[[R:[a-z]]]x
; CHECK: cmpxchgl
; CHECK-NOT: cmp
-; CHECK: sete [[BYTE:%[a-z0-9]+]]
-; CHECK: movzbl [[BYTE]], %eax
+; CHECK: sete %[[R]]l
%pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
%success = extractvalue { i32, i1 } %pair, 1
%mask = zext i1 %success to i32
diff --git a/test/CodeGen/X86/cmpxchg-i128-i1.ll b/test/CodeGen/X86/cmpxchg-i128-i1.ll
index 278e6a4ed75e..1510b2a49c32 100644
--- a/test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ b/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -44,10 +44,10 @@ define i1 @cmpxchg_arithcmp(i128* %addr, i128 %desired, i128 %new) {
define i128 @cmpxchg_zext(i128* %addr, i128 %desired, i128 %new) {
; CHECK-LABEL: cmpxchg_zext:
+; CHECK: xorl
; CHECK: cmpxchg16b
; CHECK-NOT: cmpq
-; CHECK: sete [[BYTE:%[a-z0-9]+]]
-; CHECK: movzbl [[BYTE]], %eax
+; CHECK: sete
%pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
%success = extractvalue { i128, i1 } %pair, 1
%mask = zext i1 %success to i128
diff --git a/test/CodeGen/X86/coalescer-commute3.ll b/test/CodeGen/X86/coalescer-commute3.ll
index e5bd448a4158..9f22bf0e1a7a 100644
--- a/test/CodeGen/X86/coalescer-commute3.ll
+++ b/test/CodeGen/X86/coalescer-commute3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | grep mov | count 6
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 -no-x86-call-frame-opt | grep mov | count 6
%struct.quad_struct = type { i32, i32, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct* }
diff --git a/test/CodeGen/X86/code_placement_align_all.ll b/test/CodeGen/X86/code_placement_align_all.ll
index 53df90620204..11dc59a3bab9 100644
--- a/test/CodeGen/X86/code_placement_align_all.ll
+++ b/test/CodeGen/X86/code_placement_align_all.ll
@@ -1,9 +1,9 @@
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -align-all-blocks=16 < %s | FileCheck %s
;CHECK-LABEL: foo:
-;CHECK: .align 65536, 0x90
-;CHECK: .align 65536, 0x90
-;CHECK: .align 65536, 0x90
+;CHECK: .p2align 16, 0x90
+;CHECK: .p2align 16, 0x90
+;CHECK: .p2align 16, 0x90
;CHECK: ret
define i32 @foo(i32 %t, i32 %l) nounwind readnone ssp uwtable {
%1 = icmp eq i32 %t, 0
diff --git a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
index 592d1ce45bb6..d7dc8defac3a 100644
--- a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
+++ b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
define void @foo() !prof !1 {
; Test if a cold block in a loop will be placed at the end of the function
diff --git a/test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll b/test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll
index 79b4883fb1d6..b30aaea9024b 100644
--- a/test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll
+++ b/test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
define void @foo() {
; Test that when determining the edge probability from a node in an inner loop
diff --git a/test/CodeGen/X86/code_placement_loop_rotation.ll b/test/CodeGen/X86/code_placement_loop_rotation.ll
index 3ec5961486e8..96fbc8138999 100644
--- a/test/CodeGen/X86/code_placement_loop_rotation.ll
+++ b/test/CodeGen/X86/code_placement_loop_rotation.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK-PROFILE
define void @foo() {
diff --git a/test/CodeGen/X86/code_placement_loop_rotation2.ll b/test/CodeGen/X86/code_placement_loop_rotation2.ll
index 6d8b3c99cd05..ea95c5438e3b 100644
--- a/test/CodeGen/X86/code_placement_loop_rotation2.ll
+++ b/test/CodeGen/X86/code_placement_loop_rotation2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK-PROFILE
define void @foo() {
diff --git a/test/CodeGen/X86/code_placement_loop_rotation3.ll b/test/CodeGen/X86/code_placement_loop_rotation3.ll
new file mode 100644
index 000000000000..6a5b743ef8a1
--- /dev/null
+++ b/test/CodeGen/X86/code_placement_loop_rotation3.ll
@@ -0,0 +1,42 @@
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -force-precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK
+
+define void @bar() {
+; Test that all edges in the loop chain are fall through with profile data.
+;
+; CHECK-LABEL: bar:
+; CHECK: latch
+; CHECK: header
+; CHECK: if.then
+; CHECK: end
+
+entry:
+ br label %header
+
+header:
+ call void @e()
+ %call = call zeroext i1 @a()
+ br i1 %call, label %if.then, label %latch, !prof !1
+
+if.then:
+ call void @f()
+ %call3 = call zeroext i1 @a()
+ br i1 %call3, label %latch, label %end, !prof !2
+
+latch:
+ call void @h()
+ %call2 = call zeroext i1 @a()
+ br i1 %call2, label %header, label %end, !prof !3
+
+end:
+ ret void
+}
+
+declare zeroext i1 @a()
+declare void @e()
+declare void @f()
+declare void @g()
+declare void @h()
+
+!1 = !{!"branch_weights", i32 16, i32 16}
+!2 = !{!"branch_weights", i32 97, i32 3}
+!3 = !{!"branch_weights", i32 97, i32 3}
diff --git a/test/CodeGen/X86/code_placement_outline_optional_branches.ll b/test/CodeGen/X86/code_placement_outline_optional_branches.ll
index 3364915fd1b7..5624d435215a 100644
--- a/test/CodeGen/X86/code_placement_outline_optional_branches.ll
+++ b/test/CodeGen/X86/code_placement_outline_optional_branches.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -outline-optional-branches < %s | FileCheck %s -check-prefix=CHECK-OUTLINE
define void @foo(i32 %t1, i32 %t2, i32 %t3) {
diff --git a/test/CodeGen/X86/combine-multiplies.ll b/test/CodeGen/X86/combine-multiplies.ll
index 5e51edbf52f9..15528cd0714b 100644
--- a/test/CodeGen/X86/combine-multiplies.ll
+++ b/test/CodeGen/X86/combine-multiplies.ll
@@ -31,10 +31,10 @@
;
; CHECK-LABEL: testCombineMultiplies
; CHECK: imull $400, [[ARG1:%[a-z]+]], [[MUL:%[a-z]+]] # imm = 0x190
-; CHECK-NEXT: leal ([[MUL]],[[ARG2:%[a-z]+]]), [[LEA:%[a-z]+]]
+; CHECK-NEXT: leal ([[ARG2:%[a-z]+]],[[MUL]]), [[LEA:%[a-z]+]]
; CHECK-NEXT: movl $11, {{[0-9]+}}([[LEA]],[[ARG1]],4)
-; CHECK-NEXT: movl $22, {{[0-9]+}}([[MUL]],[[ARG2]])
-; CHECK-NEXT: movl $33, {{[0-9]+}}([[MUL]],[[ARG2]])
+; CHECK-NEXT: movl $22, {{[0-9]+}}([[ARG2]],[[MUL]])
+; CHECK-NEXT: movl $33, {{[0-9]+}}([[ARG2]],[[MUL]])
; CHECK: retl
;
@@ -109,7 +109,7 @@ entry:
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: [[C726]], v3
; CHECK-NEXT: [[C11]], x
-; CHECK-NEXT: retl
+; CHECK-NEXT: retl
@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@@ -148,7 +148,7 @@ entry:
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: [[C726]], v3
; CHECK-NEXT: [[C11]], x
-; CHECK-NEXT: retl
+; CHECK-NEXT: retl
; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) {
entry:
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index e17cfbeeee12..5cbd74980cab 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -356,3 +356,62 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
ret <4 x i8> %or
}
+; Verify that we can fold regardless of which operand is the zeroinitializer
+
+define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2b:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2c:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
+ %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2d:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+; Make sure we can have an undef where an index pointing to the zero vector would be expected
+
+define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2e:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2f:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
diff --git a/test/CodeGen/X86/combine-testm-and.ll b/test/CodeGen/X86/combine-testm-and.ll
new file mode 100644
index 000000000000..2b95a114540d
--- /dev/null
+++ b/test/CodeGen/X86/combine-testm-and.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s
+
+define i32 @combineTESTM_AND_1(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: combineTESTM_AND_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %and.i = and <8 x i64> %b, %a
+ %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 -1)
+ %conv = zext i8 %test.i to i32
+ ret i32 %conv
+}
+
+define i32 @combineTESTM_AND_2(<8 x i64> %a, <8 x i64> %b , i8 %mask) {
+; CHECK-LABEL: combineTESTM_AND_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %and.i = and <8 x i64> %b, %a
+ %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
+ %conv = zext i8 %test.i to i32
+ ret i32 %conv
+}
+
+define i32 @combineTESTM_AND_mask_3(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
+; CHECK-LABEL: combineTESTM_AND_mask_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %b = load <8 x i64>, <8 x i64>* %bptr
+ %and.i = and <8 x i64> %a, %b
+ %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
+ %conv = zext i8 %test.i to i32
+ ret i32 %conv
+}
+
+define i32 @combineTESTM_AND_mask_4(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
+; CHECK-LABEL: combineTESTM_AND_mask_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %b = load <8 x i64>, <8 x i64>* %bptr
+ %and.i = and <8 x i64> %b, %a
+ %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
+ %conv = zext i8 %test.i to i32
+ ret i32 %conv
+}
+
+declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
diff --git a/test/CodeGen/X86/commute-blend-avx2.ll b/test/CodeGen/X86/commute-blend-avx2.ll
index bd497ba40767..c39aa0b12b32 100644
--- a/test/CodeGen/X86/commute-blend-avx2.ll
+++ b/test/CodeGen/X86/commute-blend-avx2.ll
@@ -1,89 +1,90 @@
-; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s
define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
+; CHECK-LABEL: commute_fold_vpblendw_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; CHECK-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %b
%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
ret <8 x i16> %2
-
- ;LABEL: commute_fold_vpblendw_128
- ;CHECK: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
- ;CHECK-NEXT: retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: commute_fold_vpblendw_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
+; CHECK-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %b
%2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17)
ret <16 x i16> %2
-
- ;LABEL: commute_fold_vpblendw_256
- ;CHECK: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
- ;CHECK-NEXT: retq
}
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
+; CHECK-LABEL: commute_fold_vpblendd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; CHECK-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %b
%2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
ret <4 x i32> %2
-
- ;LABEL: commute_fold_vpblendd_128
- ;CHECK: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
- ;CHECK-NEXT: retq
}
declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: commute_fold_vpblendd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
+; CHECK-NEXT: retq
%1 = load <8 x i32>, <8 x i32>* %b
%2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
ret <8 x i32> %2
-
- ;LABEL: commute_fold_vpblendd_256
- ;CHECK: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
- ;CHECK-NEXT: retq
}
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
+; CHECK-LABEL: commute_fold_vblendps_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
+; CHECK-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b
%2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 5)
ret <4 x float> %2
-
- ;LABEL: commute_fold_vblendps_128
- ;CHECK: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
- ;CHECK-NEXT: retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
+; CHECK-LABEL: commute_fold_vblendps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
+; CHECK-NEXT: retq
%1 = load <8 x float>, <8 x float>* %b
%2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7)
ret <8 x float> %2
-
- ;LABEL: commute_fold_vblendps_256
- ;CHECK: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
- ;CHECK-NEXT: retq
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
+; CHECK-LABEL: commute_fold_vblendpd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; CHECK-NEXT: retq
%1 = load <2 x double>, <2 x double>* %b
%2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
ret <2 x double> %2
-
- ;LABEL: commute_fold_vblendpd_128
- ;CHECK: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
- ;CHECK-NEXT: retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
+; CHECK-LABEL: commute_fold_vblendpd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
+; CHECK-NEXT: retq
%1 = load <4 x double>, <4 x double>* %b
%2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
ret <4 x double> %2
-
- ;LABEL: commute_fold_vblendpd_256
- ;CHECK: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
- ;CHECK-NEXT: retq
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/commute-blend-sse41.ll b/test/CodeGen/X86/commute-blend-sse41.ll
index 8cebcdb8eeae..14a685b179a5 100644
--- a/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/test/CodeGen/X86/commute-blend-sse41.ll
@@ -1,34 +1,35 @@
-; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s
define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
+; CHECK-LABEL: commute_fold_pblendw:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; CHECK-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %b
%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
ret <8 x i16> %2
-
- ;LABEL: commute_fold_pblendw
- ;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
- ;CHECK-NEXT: retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
+; CHECK-LABEL: commute_fold_blendps:
+; CHECK: # BB#0:
+; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
+; CHECK-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b
%2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 5)
ret <4 x float> %2
-
- ;LABEL: commute_fold_blendps
- ;CHECK: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
- ;CHECK-NEXT: retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
+; CHECK-LABEL: commute_fold_blendpd:
+; CHECK: # BB#0:
+; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; CHECK-NEXT: retq
%1 = load <2 x double>, <2 x double>* %b
%2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
ret <2 x double> %2
-
- ;LABEL: commute_fold_vblendpd
- ;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
- ;CHECK-NEXT: retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/commute-fcmp.ll b/test/CodeGen/X86/commute-fcmp.ll
index 6f43ebe1fcd7..4274d1feaa3b 100644
--- a/test/CodeGen/X86/commute-fcmp.ll
+++ b/test/CodeGen/X86/commute-fcmp.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
@@ -6,164 +7,332 @@
; Only equal/not-equal/ordered/unordered can be safely commuted
;
-define <4 x i32> @commute_cmpps_eq(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_eq
- ;SSE: cmpeqps (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_eq
- ;AVX: vcmpeqps (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_eq(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_eq:
+; SSE: # BB#0:
+; SSE-NEXT: cmpeqps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_eq:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp oeq <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_ne(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_ne
- ;SSE: cmpneqps (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_ne
- ;AVX: vcmpneqps (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_ne(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ne:
+; SSE: # BB#0:
+; SSE-NEXT: cmpneqps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ne:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpneqps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp une <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_ord(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_ord
- ;SSE: cmpordps (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_ord
- ;AVX: vcmpordps (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_ord(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ord:
+; SSE: # BB#0:
+; SSE-NEXT: cmpordps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ord:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpordps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp ord <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_uno(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_uno
- ;SSE: cmpunordps (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_uno
- ;AVX: vcmpunordps (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_uno(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_uno:
+; SSE: # BB#0:
+; SSE-NEXT: cmpunordps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_uno:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpunordps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp uno <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_lt(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_lt
- ;SSE: movaps (%rdi), %xmm1
- ;SSE-NEXT: cmpltps %xmm0, %xmm1
- ;SSE-NEXT: movaps %xmm1, %xmm0
- ;SSE-NEXT: retq
+define <4 x i32> @commute_cmpps_ueq(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ueq:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: cmpeqps %xmm0, %xmm2
+; SSE-NEXT: cmpunordps %xmm1, %xmm0
+; SSE-NEXT: orps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ueq:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vcmpeqps %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vorps %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %1 = load <4 x float>, <4 x float>* %a0
+ %2 = fcmp ueq <4 x float> %1, %a1
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ret <4 x i32> %3
+}
- ;AVX-LABEL: commute_cmpps_lt
- ;AVX: vmovaps (%rdi), %xmm1
- ;AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
- ;AVX-NEXT: retq
+define <4 x i32> @commute_cmpps_one(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_one:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: cmpneqps %xmm0, %xmm2
+; SSE-NEXT: cmpordps %xmm1, %xmm0
+; SSE-NEXT: andps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_one:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vcmpneqps %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpordps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %1 = load <4 x float>, <4 x float>* %a0
+ %2 = fcmp one <4 x float> %1, %a1
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ret <4 x i32> %3
+}
+define <4 x i32> @commute_cmpps_lt(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_lt:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: cmpltps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_lt:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp olt <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_le(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_le
- ;SSE: movaps (%rdi), %xmm1
- ;SSE-NEXT: cmpleps %xmm0, %xmm1
- ;SSE-NEXT: movaps %xmm1, %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_le
- ;AVX: vmovaps (%rdi), %xmm1
- ;AVX-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_le(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_le:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: cmpleps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_le:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp ole <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <8 x i32> @commute_cmpps_eq_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_eq_ymm
- ;AVX: vcmpeqps (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_eq_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_eq_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpeqps (%rdi), %xmm0
+; SSE-NEXT: cmpeqps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_eq_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp oeq <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_ne_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_ne_ymm
- ;AVX: vcmpneqps (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_ne_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ne_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpneqps (%rdi), %xmm0
+; SSE-NEXT: cmpneqps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ne_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpneqps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp une <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_ord_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_ord_ymm
- ;AVX: vcmpordps (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_ord_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ord_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpordps (%rdi), %xmm0
+; SSE-NEXT: cmpordps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ord_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpordps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp ord <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_uno_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_uno_ymm
- ;AVX: vcmpunordps (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_uno_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_uno_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpunordps (%rdi), %xmm0
+; SSE-NEXT: cmpunordps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_uno_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpunordps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp uno <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_lt_ymm
- ;AVX: vmovaps (%rdi), %ymm1
- ;AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
- ;AVX-NEXT: retq
+define <8 x i32> @commute_cmpps_ueq_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ueq_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: cmpeqps %xmm0, %xmm4
+; SSE-NEXT: cmpunordps %xmm2, %xmm0
+; SSE-NEXT: orps %xmm4, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: cmpeqps %xmm1, %xmm2
+; SSE-NEXT: cmpunordps %xmm3, %xmm1
+; SSE-NEXT: orps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ueq_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vcmpeqps %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpunordps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+ %1 = load <8 x float>, <8 x float>* %a0
+ %2 = fcmp ueq <8 x float> %1, %a1
+ %3 = sext <8 x i1> %2 to <8 x i32>
+ ret <8 x i32> %3
+}
+
+define <8 x i32> @commute_cmpps_one_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_one_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: cmpneqps %xmm0, %xmm4
+; SSE-NEXT: cmpordps %xmm2, %xmm0
+; SSE-NEXT: andps %xmm4, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: cmpneqps %xmm1, %xmm2
+; SSE-NEXT: cmpordps %xmm3, %xmm1
+; SSE-NEXT: andps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_one_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vcmpneqps %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpordps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+ %1 = load <8 x float>, <8 x float>* %a0
+ %2 = fcmp one <8 x float> %1, %a1
+ %3 = sext <8 x i1> %2 to <8 x i32>
+ ret <8 x i32> %3
+}
+define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_lt_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: cmpltps %xmm0, %xmm2
+; SSE-NEXT: cmpltps %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_lt_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp olt <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_le_ymm
- ;AVX: vmovaps (%rdi), %ymm1
- ;AVX-NEXT: vcmpleps %ymm0, %ymm1, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_le_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: cmpleps %xmm0, %xmm2
+; SSE-NEXT: cmpleps %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_le_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vcmpleps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp ole <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -175,164 +344,332 @@ define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
; Only equal/not-equal/ordered/unordered can be safely commuted
;
-define <2 x i64> @commute_cmppd_eq(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_eq
- ;SSE: cmpeqpd (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_eq
- ;AVX: vcmpeqpd (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_eq(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_eq:
+; SSE: # BB#0:
+; SSE-NEXT: cmpeqpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_eq:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp oeq <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_ne(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_ne
- ;SSE: cmpneqpd (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_ne
- ;AVX: vcmpneqpd (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_ne(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ne:
+; SSE: # BB#0:
+; SSE-NEXT: cmpneqpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ne:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpneqpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp une <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_ord(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_ord
- ;SSE: cmpordpd (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_ord
- ;AVX: vcmpordpd (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_ord(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ord:
+; SSE: # BB#0:
+; SSE-NEXT: cmpordpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ord:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpordpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp ord <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_uno(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_uno
- ;SSE: cmpunordpd (%rdi), %xmm0
- ;SSE-NEXT: retq
+define <2 x i64> @commute_cmppd_ueq(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ueq:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm2
+; SSE-NEXT: cmpeqpd %xmm0, %xmm2
+; SSE-NEXT: cmpunordpd %xmm1, %xmm0
+; SSE-NEXT: orpd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ueq:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vorpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %1 = load <2 x double>, <2 x double>* %a0
+ %2 = fcmp ueq <2 x double> %1, %a1
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
- ;AVX-LABEL: commute_cmppd_uno
- ;AVX: vcmpunordpd (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
+define <2 x i64> @commute_cmppd_one(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_one:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm2
+; SSE-NEXT: cmpneqpd %xmm0, %xmm2
+; SSE-NEXT: cmpordpd %xmm1, %xmm0
+; SSE-NEXT: andpd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_one:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vcmpneqpd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpordpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %1 = load <2 x double>, <2 x double>* %a0
+ %2 = fcmp one <2 x double> %1, %a1
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
+define <2 x i64> @commute_cmppd_uno(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_uno:
+; SSE: # BB#0:
+; SSE-NEXT: cmpunordpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_uno:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpunordpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp uno <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_lt(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_lt
- ;SSE: movapd (%rdi), %xmm1
- ;SSE-NEXT: cmpltpd %xmm0, %xmm1
- ;SSE-NEXT: movapd %xmm1, %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_lt
- ;AVX: vmovapd (%rdi), %xmm1
- ;AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_lt(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_lt:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: cmpltpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_lt:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp olt <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_le(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_le
- ;SSE: movapd (%rdi), %xmm1
- ;SSE-NEXT: cmplepd %xmm0, %xmm1
- ;SSE-NEXT: movapd %xmm1, %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_le
- ;AVX: vmovapd (%rdi), %xmm1
- ;AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_le(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_le:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: cmplepd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_le:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp ole <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <4 x i64> @commute_cmppd_eq_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_eq
- ;AVX: vcmpeqpd (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_eq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_eq_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpeqpd (%rdi), %xmm0
+; SSE-NEXT: cmpeqpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_eq_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp oeq <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_ne_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_ne
- ;AVX: vcmpneqpd (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_ne_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ne_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpneqpd (%rdi), %xmm0
+; SSE-NEXT: cmpneqpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ne_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpneqpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp une <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_ord_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_ord
- ;AVX: vcmpordpd (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_ord_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ord_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpordpd (%rdi), %xmm0
+; SSE-NEXT: cmpordpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ord_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpordpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp ord <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_uno_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_uno
- ;AVX: vcmpunordpd (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_uno_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_uno_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpunordpd (%rdi), %xmm0
+; SSE-NEXT: cmpunordpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_uno_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpunordpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp uno <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_lt
- ;AVX: vmovapd (%rdi), %ymm1
- ;AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
- ;AVX-NEXT: retq
+define <4 x i64> @commute_cmppd_ueq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ueq_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm2
+; SSE-NEXT: movapd 16(%rdi), %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm4
+; SSE-NEXT: cmpeqpd %xmm0, %xmm4
+; SSE-NEXT: cmpunordpd %xmm2, %xmm0
+; SSE-NEXT: orpd %xmm4, %xmm0
+; SSE-NEXT: movapd %xmm3, %xmm2
+; SSE-NEXT: cmpeqpd %xmm1, %xmm2
+; SSE-NEXT: cmpunordpd %xmm3, %xmm1
+; SSE-NEXT: orpd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ueq_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %ymm1
+; AVX-NEXT: vcmpeqpd %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vorpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+ %1 = load <4 x double>, <4 x double>* %a0
+ %2 = fcmp ueq <4 x double> %1, %a1
+ %3 = sext <4 x i1> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_cmppd_one_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_one_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm2
+; SSE-NEXT: movapd 16(%rdi), %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm4
+; SSE-NEXT: cmpneqpd %xmm0, %xmm4
+; SSE-NEXT: cmpordpd %xmm2, %xmm0
+; SSE-NEXT: andpd %xmm4, %xmm0
+; SSE-NEXT: movapd %xmm3, %xmm2
+; SSE-NEXT: cmpneqpd %xmm1, %xmm2
+; SSE-NEXT: cmpordpd %xmm3, %xmm1
+; SSE-NEXT: andpd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_one_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %ymm1
+; AVX-NEXT: vcmpneqpd %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpordpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+ %1 = load <4 x double>, <4 x double>* %a0
+ %2 = fcmp one <4 x double> %1, %a1
+ %3 = sext <4 x i1> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_lt_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm2
+; SSE-NEXT: movapd 16(%rdi), %xmm3
+; SSE-NEXT: cmpltpd %xmm0, %xmm2
+; SSE-NEXT: cmpltpd %xmm1, %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movapd %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_lt_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %ymm1
+; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp olt <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_le_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_le
- ;AVX: vmovapd (%rdi), %ymm1
- ;AVX-NEXT: vcmplepd %ymm0, %ymm1, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_le_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_le_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm2
+; SSE-NEXT: movapd 16(%rdi), %xmm3
+; SSE-NEXT: cmplepd %xmm0, %xmm2
+; SSE-NEXT: cmplepd %xmm1, %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movapd %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_le_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %ymm1
+; AVX-NEXT: vcmplepd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp ole <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
index e7c846045f01..fd94f595005a 100644
--- a/test/CodeGen/X86/constructor.ll
+++ b/test/CodeGen/X86/constructor.ll
@@ -16,22 +16,22 @@ entry:
}
; CTOR: .section .ctors.65520,"aGw",@progbits,v,comdat
-; CTOR-NEXT: .align 8
+; CTOR-NEXT: .p2align 3
; CTOR-NEXT: .quad g
; CTOR-NEXT: .section .ctors,"aw",@progbits
-; CTOR-NEXT: .align 8
+; CTOR-NEXT: .p2align 3
; CTOR-NEXT: .quad f
; INIT-ARRAY: .section .init_array.15,"aGw",@init_array,v,comdat
-; INIT-ARRAY-NEXT: .align 8
+; INIT-ARRAY-NEXT: .p2align 3
; INIT-ARRAY-NEXT: .quad g
; INIT-ARRAY-NEXT: .section .init_array,"aw",@init_array
-; INIT-ARRAY-NEXT: .align 8
+; INIT-ARRAY-NEXT: .p2align 3
; INIT-ARRAY-NEXT: .quad f
; NACL: .section .init_array.15,"aGw",@init_array,v,comdat
-; NACL-NEXT: .align 4
+; NACL-NEXT: .p2align 2
; NACL-NEXT: .long g
; NACL-NEXT: .section .init_array,"aw",@init_array
-; NACL-NEXT: .align 4
+; NACL-NEXT: .p2align 2
; NACL-NEXT: .long f
diff --git a/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll b/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
new file mode 100644
index 000000000000..b4c30a7380c8
--- /dev/null
+++ b/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
@@ -0,0 +1,268 @@
+; RUN: llc %s -o - | FileCheck %s
+; This file checks a weird corner case in LiveRangeEdit.
+; We used to crash when eliminating the definition
+; of the product of splitting when the original live-range
+; has already been removed.
+; Basically, we have the following input.
+; v1 = loadimm cst
+; ...
+; = use v1
+;
+; We split the live-range like this:
+; v1 = loadimm cst
+; ...
+; v2 = copy v1
+; ...
+; = use v2
+;
+; We actually issue loadimm instead of the copy:
+; v1 = loadimm cst
+; ...
+; v2 = loadimm cst
+; ...
+; = use v2
+;
+; v1 is now dead so we remove its live-range.
+; Actually, we shrink it to empty to keep the
+; instruction around for further remat opportunities
+; (accessible via the origin pointer).
+;
+; Later v2 gets removed as well (e.g., because we
+; remat it closer to its use) and the live-range
+; gets eliminated. We used to crash at this point
+; because we were looking for a VNI of origin (v1)
+; at the slot index of the definition of v2. However,
+; we do not have a VNI for v1 at this point, since the
+; live-range is now empty... crash!
+; PR27983
+
+source_filename = "bugpoint-output-1e29d28.bc"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+@r = external global i32, align 4
+@k = external global i32, align 4
+@g = external global i32, align 4
+@a = external global i16, align 2
+@p = external global i32, align 4
+@n = external global i16, align 2
+@.str = external unnamed_addr constant [12 x i8], align 1
+@.str.1 = external unnamed_addr constant [13 x i8], align 1
+@s = external global i32, align 4
+@z = external global i16, align 2
+
+; CHECK-LABEL: fn1:
+define void @fn1() #0 {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %cleanup100, %for.end29, %entry
+ %t7.0 = phi i16 [ undef, %entry ], [ %t7.1, %for.end29 ], [ %t7.19, %cleanup100 ]
+ %t9.0 = phi i32 [ undef, %entry ], [ %t9.1, %for.end29 ], [ 0, %cleanup100 ]
+ %t2.0 = phi i32 [ undef, %entry ], [ undef, %for.end29 ], [ %t2.18, %cleanup100 ]
+ %tmp = load i32, i32* @r, align 4
+ br i1 undef, label %if.then, label %if.end7
+
+if.then: ; preds = %for.cond
+ %tobool = icmp ne i32 %tmp, 0
+ %tobool1 = icmp ne i32 %t2.0, 0
+ %tmp1 = and i1 %tobool1, %tobool
+ %land.ext = zext i1 %tmp1 to i32
+ %tmp2 = load i32, i32* @k, align 4
+ %shr = lshr i32 %land.ext, %tmp2
+ %tobool4 = icmp ne i32 %shr, 0
+ %or.cond = and i1 false, %tobool4
+ br i1 %or.cond, label %L6, label %if.end7
+
+if.end7: ; preds = %if.then, %for.cond
+ %t2.1 = phi i32 [ %shr, %if.then ], [ %t2.0, %for.cond ]
+ %tobool8 = icmp eq i32 undef, 0
+ br i1 %tobool8, label %if.end11, label %for.cond10
+
+for.cond10: ; preds = %for.cond10, %if.end7
+ br label %for.cond10
+
+if.end11: ; preds = %if.end7
+ %tmp3 = load i32, i32* @g, align 4
+ %tmp4 = load i16, i16* @a, align 2
+ %conv = sext i16 %tmp4 to i32
+ %div = sdiv i32 %tmp3, %conv
+ %tobool12 = icmp eq i32 %div, 0
+ br i1 %tobool12, label %for.cond15, label %L5
+
+for.cond15: ; preds = %for.cond17, %if.end11
+ %t7.1 = phi i16 [ %t7.2, %for.cond17 ], [ %t7.0, %if.end11 ]
+ %t9.1 = phi i32 [ %t9.2, %for.cond17 ], [ %t9.0, %if.end11 ]
+ %tobool16 = icmp eq i32 undef, 0
+ br i1 %tobool16, label %for.end29, label %for.cond17
+
+for.cond17: ; preds = %for.cond20, %for.cond15
+ %t7.2 = phi i16 [ %t7.3, %for.cond20 ], [ %t7.1, %for.cond15 ]
+ %t9.2 = phi i32 [ undef, %for.cond20 ], [ %t9.1, %for.cond15 ]
+ %tobool18 = icmp eq i8 undef, 0
+ br i1 %tobool18, label %for.cond15, label %for.cond20
+
+for.cond20: ; preds = %for.cond23, %for.cond17
+ %t7.3 = phi i16 [ %t7.4, %for.cond23 ], [ %t7.2, %for.cond17 ]
+ %tobool21 = icmp eq i32 undef, 0
+ br i1 %tobool21, label %for.cond17, label %for.cond23
+
+for.cond23: ; preds = %L1, %for.cond20
+ %t7.4 = phi i16 [ %t7.5, %L1 ], [ %t7.3, %for.cond20 ]
+ %tobool24 = icmp eq i8 undef, 0
+ br i1 %tobool24, label %for.cond20, label %L1
+
+L1: ; preds = %cleanup100, %for.cond23
+ %t7.5 = phi i16 [ %t7.19, %cleanup100 ], [ %t7.4, %for.cond23 ]
+ %conv26 = sext i16 undef to i64
+ br label %for.cond23
+
+for.end29: ; preds = %for.cond15
+ br i1 undef, label %for.cond, label %for.cond32thread-pre-split
+
+for.cond32thread-pre-split: ; preds = %for.end29
+ %.pr = load i32, i32* @p, align 4
+ br label %for.cond32
+
+for.cond32: ; preds = %for.inc94, %for.cond32thread-pre-split
+ %t7.6 = phi i16 [ %t7.1, %for.cond32thread-pre-split ], [ %t7.17, %for.inc94 ]
+ %t3.4 = phi i64 [ 0, %for.cond32thread-pre-split ], [ 0, %for.inc94 ]
+ %t9.6 = phi i32 [ %t9.1, %for.cond32thread-pre-split ], [ 0, %for.inc94 ]
+ %t2.7 = phi i32 [ undef, %for.cond32thread-pre-split ], [ %t2.16, %for.inc94 ]
+ %tobool33 = icmp eq i32 0, 0
+ br i1 %tobool33, label %for.end95, label %for.body34
+
+for.body34: ; preds = %for.cond32
+ %tobool35 = icmp eq i16 undef, 0
+ br i1 %tobool35, label %for.inc94, label %if.then36
+
+if.then36: ; preds = %for.body34
+ %tmp5 = load i16, i16* @n, align 2
+ %tobool37 = icmp eq i32 undef, 0
+ br i1 %tobool37, label %if.end78, label %if.then38
+
+if.then38: ; preds = %if.then36
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i64 undef)
+ %tobool40 = icmp eq i32 undef, 0
+ br i1 %tobool40, label %L3, label %cleanup100
+
+L3: ; preds = %while.end.split, %if.then38
+ %t7.7 = phi i16 [ %tmp5, %if.then38 ], [ %t7.15, %while.end.split ]
+ %t3.5 = phi i64 [ %t3.4, %if.then38 ], [ %t3.11, %while.end.split ]
+ %t2.8 = phi i32 [ %t2.7, %if.then38 ], [ %t2.14, %while.end.split ]
+ %tobool43 = icmp eq i32 undef, 0
+ br i1 %tobool43, label %if.end48, label %cleanup75
+
+if.end48: ; preds = %L3
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i64 %t3.5)
+ br i1 undef, label %if.end61, label %for.cond52.preheader
+
+for.cond52.preheader: ; preds = %if.end48
+ %tobool57 = icmp eq i16 undef, 0
+ %.130 = select i1 %tobool57, i16 -8, i16 0
+ br label %if.end61
+
+if.end61: ; preds = %for.cond52.preheader, %if.end48
+ %t7.9 = phi i16 [ %t7.7, %if.end48 ], [ %.130, %for.cond52.preheader ]
+ %tobool62 = icmp eq i32 undef, 0
+ br i1 %tobool62, label %if.end71, label %if.then63
+
+if.then63: ; preds = %if.end61
+ br i1 undef, label %if.end67, label %L5
+
+L5: ; preds = %cleanup100.L5_crit_edge, %if.then63, %if.end11
+ %.pre = phi i32 [ %.pre.pre, %cleanup100.L5_crit_edge ], [ undef, %if.then63 ], [ %tmp, %if.end11 ]
+ %t7.10 = phi i16 [ %t7.19, %cleanup100.L5_crit_edge ], [ %t7.9, %if.then63 ], [ %t7.0, %if.end11 ]
+ %t3.6 = phi i64 [ 0, %cleanup100.L5_crit_edge ], [ %t3.5, %if.then63 ], [ 2, %if.end11 ]
+ %t9.8 = phi i32 [ 0, %cleanup100.L5_crit_edge ], [ undef, %if.then63 ], [ %t9.0, %if.end11 ]
+ %t2.9 = phi i32 [ %t2.18, %cleanup100.L5_crit_edge ], [ %t2.8, %if.then63 ], [ %t2.1, %if.end11 ]
+ store i32 %t9.8, i32* @s, align 4
+ br label %if.end67
+
+if.end67: ; preds = %L5, %if.then63
+ %tmp6 = phi i32 [ %.pre, %L5 ], [ undef, %if.then63 ]
+ %t7.11 = phi i16 [ %t7.10, %L5 ], [ %t7.9, %if.then63 ]
+ %t3.7 = phi i64 [ %t3.6, %L5 ], [ %t3.5, %if.then63 ]
+ %t9.9 = phi i32 [ %t9.8, %L5 ], [ undef, %if.then63 ]
+ %t2.10 = phi i32 [ %t2.9, %L5 ], [ %t2.8, %if.then63 ]
+ %tobool68 = icmp eq i32 %tmp6, 0
+ br i1 %tobool68, label %if.end71, label %for.end95
+
+if.end71: ; preds = %if.end67, %if.end61
+ %t7.12 = phi i16 [ %t7.11, %if.end67 ], [ %t7.9, %if.end61 ]
+ %t3.8 = phi i64 [ %t3.7, %if.end67 ], [ %t3.5, %if.end61 ]
+ %tobool72 = icmp eq i32 undef, 0
+ br i1 %tobool72, label %cleanup75.thread128, label %if.then73
+
+if.then73: ; preds = %if.end71
+ br label %cleanup100
+
+cleanup75.thread128: ; preds = %if.end71
+ br label %if.end78
+
+cleanup75: ; preds = %L3
+ br i1 false, label %for.cond98, label %for.end95
+
+if.end78: ; preds = %cleanup75.thread128, %if.then36
+ %t7.14 = phi i16 [ %tmp5, %if.then36 ], [ 0, %cleanup75.thread128 ]
+ %t3.10 = phi i64 [ %t3.4, %if.then36 ], [ %t3.8, %cleanup75.thread128 ]
+ %t9.12 = phi i32 [ %t9.6, %if.then36 ], [ undef, %cleanup75.thread128 ]
+ %t2.13 = phi i32 [ %t2.7, %if.then36 ], [ undef, %cleanup75.thread128 ]
+ store i16 %t7.14, i16* @z, align 2
+ br label %L6
+
+L6: ; preds = %if.end78, %if.then
+ %t7.15 = phi i16 [ %t7.0, %if.then ], [ %t7.14, %if.end78 ]
+ %t3.11 = phi i64 [ 2, %if.then ], [ %t3.10, %if.end78 ]
+ %t9.13 = phi i32 [ %t9.0, %if.then ], [ %t9.12, %if.end78 ]
+ %t2.14 = phi i32 [ %shr, %if.then ], [ %t2.13, %if.end78 ]
+ br i1 undef, label %while.condthread-pre-split, label %for.inc94
+
+while.condthread-pre-split: ; preds = %L6
+ %tobool83 = icmp eq i32 undef, 0
+ br i1 %tobool83, label %while.end.split, label %while.cond
+
+while.cond: ; preds = %while.cond, %while.condthread-pre-split
+ br label %while.cond
+
+while.end.split: ; preds = %while.condthread-pre-split
+ %tobool84 = icmp eq i16 undef, 0
+ br i1 %tobool84, label %for.inc94, label %L3
+
+for.inc94: ; preds = %while.end.split, %L6, %for.body34
+ %t7.17 = phi i16 [ %t7.6, %for.body34 ], [ %t7.15, %L6 ], [ %t7.15, %while.end.split ]
+ %t2.16 = phi i32 [ %t2.7, %for.body34 ], [ %t2.14, %L6 ], [ %t2.14, %while.end.split ]
+ store i32 undef, i32* @p, align 4
+ br label %for.cond32
+
+for.end95: ; preds = %cleanup75, %if.end67, %for.cond32
+ %t7.18 = phi i16 [ %t7.6, %for.cond32 ], [ %t7.7, %cleanup75 ], [ %t7.11, %if.end67 ]
+ %t2.17 = phi i32 [ %t2.7, %for.cond32 ], [ %t2.8, %cleanup75 ], [ %t2.10, %if.end67 ]
+ %tobool96 = icmp eq i32 undef, 0
+ br i1 %tobool96, label %cleanup100, label %for.cond98
+
+for.cond98: ; preds = %for.cond98, %for.end95, %cleanup75
+ br label %for.cond98
+
+cleanup100: ; preds = %for.end95, %if.then73, %if.then38
+ %t7.19 = phi i16 [ %t7.18, %for.end95 ], [ %tmp5, %if.then38 ], [ %t7.12, %if.then73 ]
+ %t2.18 = phi i32 [ %t2.17, %for.end95 ], [ %t2.7, %if.then38 ], [ undef, %if.then73 ]
+ switch i32 undef, label %unreachable [
+ i32 0, label %for.cond
+ i32 17, label %L1
+ i32 7, label %cleanup100.L5_crit_edge
+ ]
+
+cleanup100.L5_crit_edge: ; preds = %cleanup100
+ %.pre.pre = load i32, i32* @r, align 4
+ br label %L5
+
+unreachable: ; preds = %cleanup100
+ unreachable
+}
+
+; Function Attrs: nounwind
+declare void @printf(i8* nocapture readonly, ...) #1
+
+attributes #0 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 463505bd95d9..435401639f05 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -1,40 +1,50 @@
-; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 | FileCheck %s
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i32 @test1(i64 %x) nounwind readnone {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: leaq -1(%rdi), %rcx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testq %rcx, %rdi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cast = trunc i64 %count to i32
%cmp = icmp ugt i32 %cast, 1
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: test1:
-; CHECK: leaq -1([[A0:%rdi|%rcx]])
-; CHECK-NEXT: testq
-; CHECK-NEXT: setne
-; CHECK: ret
}
define i32 @test2(i64 %x) nounwind readnone {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: leaq -1(%rdi), %rcx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testq %rcx, %rdi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cmp = icmp ult i64 %count, 2
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: test2:
-; CHECK: leaq -1([[A0]])
-; CHECK-NEXT: testq
-; CHECK-NEXT: sete
-; CHECK: ret
}
define i32 @test3(i64 %x) nounwind readnone {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: popcntq %rdi, %rax
+; CHECK-NEXT: andb $63, %al
+; CHECK-NEXT: cmpb $2, %al
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cast = trunc i64 %count to i6 ; Too small for 0-64
%cmp = icmp ult i6 %cast, 2
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: test3:
-; CHECK: cmpl $2
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/cxx_tlscc64.ll b/test/CodeGen/X86/cxx_tlscc64.ll
index 6c8e45e42d15..ef947367c09e 100644
--- a/test/CodeGen/X86/cxx_tlscc64.ll
+++ b/test/CodeGen/X86/cxx_tlscc64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin | FileCheck %s
 ; TLS functions were wrongly modeled and, after fixing that, shrink-wrapping
 ; cannot help here. To achieve the expected lowering, we need to play
 ; tricks similar to the AArch64 fast TLS calling convention (r255821).
@@ -39,6 +39,27 @@ declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
; CHECK-NOT: popq %r9
; CHECK-NOT: popq %r10
; CHECK-NOT: popq %r11
+
+; CHECK-O0-LABEL: _ZTW2sg
+; CHECK-O0: pushq %r11
+; CHECK-O0: pushq %r10
+; CHECK-O0: pushq %r9
+; CHECK-O0: pushq %r8
+; CHECK-O0: pushq %rsi
+; CHECK-O0: pushq %rdx
+; CHECK-O0: pushq %rcx
+; CHECK-O0: callq
+; CHECK-O0: jne
+; CHECK-O0: callq
+; CHECK-O0: tlv_atexit
+; CHECK-O0: callq
+; CHECK-O0: popq %rcx
+; CHECK-O0: popq %rdx
+; CHECK-O0: popq %rsi
+; CHECK-O0: popq %r8
+; CHECK-O0: popq %r9
+; CHECK-O0: popq %r10
+; CHECK-O0: popq %r11
define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
%.b.i = load i1, i1* @__tls_guard, align 1
br i1 %.b.i, label %__tls_init.exit, label %init.i
@@ -63,6 +84,24 @@ __tls_init.exit:
; CHECK-NOT: pushq %rcx
; CHECK-NOT: pushq %rbx
; CHECK: callq
+; CHECK-O0-LABEL: _ZTW4sum1
+; CHECK-O0-NOT: pushq %r11
+; CHECK-O0-NOT: pushq %r10
+; CHECK-O0-NOT: pushq %r9
+; CHECK-O0-NOT: pushq %r8
+; CHECK-O0-NOT: pushq %rsi
+; CHECK-O0-NOT: pushq %rdx
+; CHECK-O0-NOT: pushq %rcx
+; CHECK-O0-NOT: pushq %rbx
+; CHECK-O0-NOT: movq %r11
+; CHECK-O0-NOT: movq %r10
+; CHECK-O0-NOT: movq %r9
+; CHECK-O0-NOT: movq %r8
+; CHECK-O0-NOT: movq %rsi
+; CHECK-O0-NOT: movq %rdx
+; CHECK-O0-NOT: movq %rcx
+; CHECK-O0-NOT: movq %rbx
+; CHECK-O0: callq
define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind {
ret i32* @sum1
}
@@ -76,4 +115,57 @@ define cxx_fast_tlscc i32* @_ZTW4sum2() #0 {
ret i32* @sum1
}
+; Make sure at O0, we don't generate spilling/reloading of the CSRs.
+; CHECK-O0-LABEL: tls_test2
+; CHECK-O0-NOT: pushq %r11
+; CHECK-O0-NOT: pushq %r10
+; CHECK-O0-NOT: pushq %r9
+; CHECK-O0-NOT: pushq %r8
+; CHECK-O0-NOT: pushq %rsi
+; CHECK-O0-NOT: pushq %rdx
+; CHECK-O0: callq {{.*}}tls_helper
+; CHECK-O0-NOT: popq %rdx
+; CHECK-O0-NOT: popq %rsi
+; CHECK-O0-NOT: popq %r8
+; CHECK-O0-NOT: popq %r9
+; CHECK-O0-NOT: popq %r10
+; CHECK-O0-NOT: popq %r11
+; CHECK-O0: ret
+%class.C = type { i32 }
+@tC = internal thread_local global %class.C zeroinitializer, align 4
+declare cxx_fast_tlscc void @tls_helper()
+define cxx_fast_tlscc %class.C* @tls_test2() #1 {
+ call cxx_fast_tlscc void @tls_helper()
+ ret %class.C* @tC
+}
+
+; Make sure we do not allow a tail call when the caller and callee have different
+; calling conventions.
+declare %class.C* @_ZN1CD1Ev(%class.C* readnone returned %this)
+; CHECK-LABEL: tls_test
+; CHECK: callq {{.*}}tlv_atexit
+define cxx_fast_tlscc void @tls_test() {
+entry:
+ store i32 0, i32* getelementptr inbounds (%class.C, %class.C* @tC, i64 0, i32 0), align 4
+ %0 = tail call i32 @_tlv_atexit(void (i8*)* bitcast (%class.C* (%class.C*)* @_ZN1CD1Ev to void (i8*)*), i8* bitcast (%class.C* @tC to i8*), i8* nonnull @__dso_handle) #1
+ ret void
+}
+
+@ssp_var = internal thread_local global i8 0, align 1
+
+; CHECK-LABEL: test_ssp
+; CHECK-NOT: pushq %r11
+; CHECK-NOT: pushq %r10
+; CHECK-NOT: pushq %r9
+; CHECK-NOT: pushq %r8
+; CHECK-NOT: pushq %rsi
+; CHECK-NOT: pushq %rdx
+; CHECK-NOT: pushq %rcx
+; CHECK-NOT: pushq %rbx
+; CHECK: callq
+define cxx_fast_tlscc nonnull i8* @test_ssp() #2 {
+ ret i8* @ssp_var
+}
attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind sspreq }
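These -O0 checks lean on the cxx_fast_tlscc calling convention, which is intended to let the callee preserve nearly all general-purpose registers; that is why the caller, even unoptimized, should not need the pushq/popq sequences the CHECK-O0-NOT lines rule out. A standalone sketch with hypothetical globals and a hypothetical wrapper name (not taken from this test):

@guard_example = internal thread_local global i1 false, align 1
@payload_example = internal thread_local global i32 0, align 4

define cxx_fast_tlscc nonnull i32* @example_tls_wrapper() nounwind {
  %initialized = load i1, i1* @guard_example, align 1
  br i1 %initialized, label %done, label %init

init:                                             ; run-once initialization
  store i1 true, i1* @guard_example, align 1
  store i32 42, i32* @payload_example, align 4
  br label %done

done:
  ret i32* @payload_example
}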
diff --git a/test/CodeGen/X86/dag-optnone.ll b/test/CodeGen/X86/dag-optnone.ll
index f7774e6e8c54..e0e12553dac6 100644
--- a/test/CodeGen/X86/dag-optnone.ll
+++ b/test/CodeGen/X86/dag-optnone.ll
@@ -23,13 +23,12 @@
; The test cases @foo[WithOptnone] prove that the same DAG combine happens
-; with -O0 and with 'optnone' set. To prove this, we use a Windows triple to
-; cause fast-isel to bail out (because something about the calling convention
-; is not handled in fast-isel). Then we have a repeated fadd that can be
-; combined into an fmul. We show that this happens in both the non-optnone
-; function and the optnone function.
+; with -O0 and with 'optnone' set. To prove this, we use varargs to cause
+; fast-isel to bail out (varargs aren't handled in fast-isel). Then we have
+; a repeated fadd that can be combined into an fmul. We show that this
+; happens in both the non-optnone function and the optnone function.
-define float @foo(float %x) #0 {
+define float @foo(float %x, ...) #0 {
entry:
%add = fadd fast float %x, %x
%add1 = fadd fast float %add, %x
@@ -41,7 +40,7 @@ entry:
; CHECK: mul
; CHECK-NEXT: ret
-define float @fooWithOptnone(float %x) #1 {
+define float @fooWithOptnone(float %x, ...) #1 {
entry:
%add = fadd fast float %x, %x
%add1 = fadd fast float %add, %x
@@ -60,7 +59,7 @@ entry:
@id84 = common global <16 x i32> zeroinitializer, align 64
-define void @bar() #1 {
+define void @bar(...) #1 {
entry:
%id83 = alloca <16 x i8>, align 16
%0 = load <16 x i32>, <16 x i32>* @id84, align 64
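With fast-math flags, the two chained fadds reduce to a multiply by 3.0, and the varargs signature keeps fast-isel from taking the function, so the DAG combiner still runs even at -O0 or under optnone. A standalone sketch of that shape, using a hypothetical function name not present in the test:

define float @fadd_chain_sketch(float %x, ...) {
entry:
  %add = fadd fast float %x, %x       ; 2*x
  %add1 = fadd fast float %add, %x    ; 3*x, expected to become a single mul
  ret float %add1
}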
diff --git a/test/CodeGen/X86/darwin-stub.ll b/test/CodeGen/X86/darwin-stub.ll
deleted file mode 100644
index 607f56fdd60b..000000000000
--- a/test/CodeGen/X86/darwin-stub.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep stub
-; RUN: llc < %s -mtriple=i386-apple-darwin9 | not grep stub
-
-@"\01LC" = internal constant [13 x i8] c"Hello World!\00" ; <[13 x i8]*> [#uses=1]
-
-define i32 @main() nounwind {
-entry:
- %0 = tail call i32 @puts(i8* getelementptr ([13 x i8], [13 x i8]* @"\01LC", i32 0, i32 0)) nounwind ; <i32> [#uses=0]
- ret i32 0
-}
-
-declare i32 @puts(i8*)
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 54bd48926834..1ff4d10c2f8f 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -12,7 +12,7 @@
; CHECK: je .LBB0_4
; Regenerate test with this command:
-; clang -emit-llvm -S -O2 -g
+; clang++ -emit-llvm -S -O2 -g
; from this source:
;
; extern void foo(char *dst,unsigned siz,const char *src);
@@ -44,161 +44,170 @@
%struct.AAA3 = type { [4 x i8] }
@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
-@.str1 = private unnamed_addr constant [2 x i8] c"+\00", align 1
-@.str2 = private unnamed_addr constant [2 x i8] c"-\00", align 1
+@.str.1 = private unnamed_addr constant [2 x i8] c"+\00", align 1
+@.str.2 = private unnamed_addr constant [2 x i8] c"-\00", align 1
; Function Attrs: uwtable
define void @_Z3barii(i32 %param1, i32 %param2) #0 !dbg !24 {
entry:
%var1 = alloca %struct.AAA3, align 1
%var2 = alloca %struct.AAA3, align 1
- tail call void @llvm.dbg.value(metadata i32 %param1, i64 0, metadata !30, metadata !DIExpression()), !dbg !47
- tail call void @llvm.dbg.value(metadata i32 %param2, i64 0, metadata !31, metadata !DIExpression()), !dbg !47
- tail call void @llvm.dbg.value(metadata i8* null, i64 0, metadata !32, metadata !DIExpression()), !dbg !49
+ tail call void @llvm.dbg.value(metadata i32 %param1, i64 0, metadata !29, metadata !46), !dbg !47
+ tail call void @llvm.dbg.value(metadata i32 %param2, i64 0, metadata !30, metadata !46), !dbg !48
+ tail call void @llvm.dbg.value(metadata i8* null, i64 0, metadata !31, metadata !46), !dbg !49
%tobool = icmp eq i32 %param2, 0, !dbg !50
- br i1 %tobool, label %if.end, label %if.then, !dbg !50
+ br i1 %tobool, label %if.end, label %if.then, !dbg !52
if.then: ; preds = %entry
- %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52
- tail call void @llvm.dbg.value(metadata i8* %call, i64 0, metadata !32, metadata !DIExpression()), !dbg !49
- br label %if.end, !dbg !54
+ %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !53
+ tail call void @llvm.dbg.value(metadata i8* %call, i64 0, metadata !31, metadata !46), !dbg !49
+ br label %if.end, !dbg !55
if.end: ; preds = %entry, %if.then
- tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !33, metadata !DIExpression()), !dbg !55
- tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !56, metadata !DIExpression()), !dbg !57
- tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59, metadata !DIExpression()), !dbg !60
- %arraydecay.i = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61
- call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !61
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !34, metadata !DIExpression()), !dbg !63
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !64, metadata !DIExpression()), !dbg !65
- call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !DIExpression()), !dbg !67
- %arraydecay.i5 = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !68
- %tobool1 = icmp eq i32 %param1, 0, !dbg !69
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !34, metadata !DIExpression()), !dbg !63
- br i1 %tobool1, label %if.else, label %if.then2, !dbg !69
+ %0 = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !56
+ call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !56
+ tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !32, metadata !57), !dbg !58
+ tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !36, metadata !46), !dbg !59
+ tail call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !38, metadata !46), !dbg !62
+ call void @_Z3fooPcjPKc(i8* %0, i32 4, i8* nonnull getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !63
+ %1 = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !65
+ call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !65
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !33, metadata !57), !dbg !66
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !36, metadata !46), !dbg !67
+ call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !38, metadata !46), !dbg !69
+ call void @_Z3fooPcjPKc(i8* %1, i32 4, i8* nonnull getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !70
+ %tobool1 = icmp eq i32 %param1, 0, !dbg !71
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !33, metadata !57), !dbg !66
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !41, metadata !46), !dbg !73
+ br i1 %tobool1, label %if.else, label %if.then2, !dbg !75
if.then2: ; preds = %if.end
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !71, metadata !DIExpression()), !dbg !73
- call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !DIExpression()), !dbg !76
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str1, i64 0, i64 0)), !dbg !76
- br label %if.end3, !dbg !72
+ call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0), i64 0, metadata !42, metadata !46), !dbg !76
+ call void @_Z3fooPcjPKc(i8* %1, i32 4, i8* nonnull getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0)), !dbg !78
+ br label %if.end3, !dbg !79
if.else: ; preds = %if.end
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !77, metadata !DIExpression()), !dbg !79
- call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !DIExpression()), !dbg !82
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str2, i64 0, i64 0)), !dbg !82
+ call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i64 0, i64 0), i64 0, metadata !42, metadata !46), !dbg !80
+ call void @_Z3fooPcjPKc(i8* %1, i32 4, i8* nonnull getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i64 0, i64 0)), !dbg !81
br label %if.end3
if.end3: ; preds = %if.else, %if.then2
- call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !33, metadata !DIExpression()), !dbg !55
- call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !83, metadata !DIExpression()), !dbg !85
- call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, metadata !DIExpression()), !dbg !87
- call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !87
- ret void, !dbg !88
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !32, metadata !57), !dbg !58
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !41, metadata !46), !dbg !82
+ call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !42, metadata !46), !dbg !84
+ call void @_Z3fooPcjPKc(i8* %0, i32 4, i8* nonnull getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !85
+ call void @llvm.lifetime.end(i64 4, i8* %1) #4, !dbg !86
+ call void @llvm.lifetime.end(i64 4, i8* %0) #4, !dbg !87
+ ret void, !dbg !86
}
-declare i8* @_Z5i2stri(i32) #1
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @_Z3fooPcjPKc(i8*, i32, i8*) #1
+declare i8* @_Z5i2stri(i32) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+declare void @_Z3fooPcjPKc(i8*, i32, i8*) #2
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
-attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind readnone }
+attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!44, !45}
-!llvm.ident = !{!46}
+!llvm.module.flags = !{!43, !44}
+!llvm.ident = !{!45}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !3, subprograms: !23, globals: !2, imports: !2)
-!1 = !DIFile(filename: "dbg-changes-codegen-branch-folding.cpp", directory: "/tmp/dbginfo")
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 255993) (llvm/trunk 256074)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "test.cpp", directory: "/mnt/extra")
!2 = !{}
!3 = !{!4}
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "AAA3", line: 4, size: 32, align: 8, file: !1, elements: !5, identifier: "_ZTS4AAA3")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "AAA3", file: !1, line: 4, size: 32, align: 8, elements: !5, identifier: "_ZTS4AAA3")
!5 = !{!6, !11, !17, !18}
-!6 = !DIDerivedType(tag: DW_TAG_member, name: "text", line: 8, size: 32, align: 8, file: !1, scope: !"_ZTS4AAA3", baseType: !7)
-!7 = !DICompositeType(tag: DW_TAG_array_type, size: 32, align: 8, baseType: !8, elements: !9)
-!8 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
+!6 = !DIDerivedType(tag: DW_TAG_member, name: "text", scope: !4, file: !1, line: 8, baseType: !7, size: 32, align: 8)
+!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 32, align: 8, elements: !9)
+!8 = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
!9 = !{!10}
!10 = !DISubrange(count: 4)
-!11 = !DISubprogram(name: "AAA3", line: 5, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !1, scope: !"_ZTS4AAA3", type: !12)
+!11 = !DISubprogram(name: "AAA3", scope: !4, file: !1, line: 5, type: !12, isLocal: false, isDefinition: false, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true)
!12 = !DISubroutineType(types: !13)
!13 = !{null, !14, !15}
-!14 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !"_ZTS4AAA3")
-!15 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !16)
+!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 64)
!16 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !8)
-!17 = !DISubprogram(name: "operator=", linkageName: "_ZN4AAA3aSEPKc", line: 6, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !"_ZTS4AAA3", type: !12)
-!18 = !DISubprogram(name: "operator const char *", linkageName: "_ZNK4AAA3cvPKcEv", line: 7, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !1, scope: !"_ZTS4AAA3", type: !19)
+!17 = !DISubprogram(name: "operator=", linkageName: "_ZN4AAA3aSEPKc", scope: !4, file: !1, line: 6, type: !12, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true)
+!18 = !DISubprogram(name: "operator const char *", linkageName: "_ZNK4AAA3cvPKcEv", scope: !4, file: !1, line: 7, type: !19, isLocal: false, isDefinition: false, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: true)
!19 = !DISubroutineType(types: !20)
!20 = !{!15, !21}
-!21 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !22)
-!22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !"_ZTS4AAA3")
-!23 = !{!24, !35, !40}
-!24 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barii", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !1, scope: !25, type: !26, variables: !29)
-!25 = !DIFile(filename: "dbg-changes-codegen-branch-folding.cpp", directory: "/tmp/dbginfo")
-!26 = !DISubroutineType(types: !27)
-!27 = !{null, !28, !28}
-!28 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!29 = !{!30, !31, !32, !33, !34}
-!30 = !DILocalVariable(name: "param1", line: 11, arg: 1, scope: !24, file: !25, type: !28)
-!31 = !DILocalVariable(name: "param2", line: 11, arg: 2, scope: !24, file: !25, type: !28)
-!32 = !DILocalVariable(name: "temp", line: 12, scope: !24, file: !25, type: !15)
-!33 = !DILocalVariable(name: "var1", line: 17, scope: !24, file: !25, type: !"_ZTS4AAA3")
-!34 = !DILocalVariable(name: "var2", line: 18, scope: !24, file: !25, type: !"_ZTS4AAA3")
-!35 = distinct !DISubprogram(name: "operator=", linkageName: "_ZN4AAA3aSEPKc", line: 6, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !"_ZTS4AAA3", type: !12, declaration: !17, variables: !36)
-!36 = !{!37, !39}
-!37 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
-!38 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !"_ZTS4AAA3")
-!39 = !DILocalVariable(name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
-!40 = distinct !DISubprogram(name: "AAA3", linkageName: "_ZN4AAA3C2EPKc", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !1, scope: !"_ZTS4AAA3", type: !12, declaration: !11, variables: !41)
-!41 = !{!42, !43}
-!42 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
-!43 = !DILocalVariable(name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
-!44 = !{i32 2, !"Dwarf Version", i32 4}
-!45 = !{i32 2, !"Debug Info Version", i32 3}
-!46 = !{!"clang version 3.5.0 "}
-!47 = !DILocation(line: 11, scope: !24)
-!48 = !{i8* null}
-!49 = !DILocation(line: 12, scope: !24)
-!50 = !DILocation(line: 14, scope: !51)
-!51 = distinct !DILexicalBlock(line: 14, column: 0, file: !1, scope: !24)
-!52 = !DILocation(line: 15, scope: !53)
-!53 = distinct !DILexicalBlock(line: 14, column: 0, file: !1, scope: !51)
-!54 = !DILocation(line: 16, scope: !53)
-!55 = !DILocation(line: 17, scope: !24)
-!56 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
-!57 = !DILocation(line: 0, scope: !40, inlinedAt: !55)
-!58 = !{i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)}
-!59 = !DILocalVariable(name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
-!60 = !DILocation(line: 5, scope: !40, inlinedAt: !55)
-!61 = !DILocation(line: 5, scope: !62, inlinedAt: !55)
-!62 = distinct !DILexicalBlock(line: 5, column: 0, file: !1, scope: !40)
-!63 = !DILocation(line: 18, scope: !24)
-!64 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
-!65 = !DILocation(line: 0, scope: !40, inlinedAt: !63)
-!66 = !DILocalVariable(name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
-!67 = !DILocation(line: 5, scope: !40, inlinedAt: !63)
-!68 = !DILocation(line: 5, scope: !62, inlinedAt: !63)
-!69 = !DILocation(line: 20, scope: !70)
-!70 = distinct !DILexicalBlock(line: 20, column: 0, file: !1, scope: !24)
-!71 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
-!72 = !DILocation(line: 21, scope: !70)
-!73 = !DILocation(line: 0, scope: !35, inlinedAt: !72)
-!74 = !{i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str1, i64 0, i64 0)}
-!75 = !DILocalVariable(name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
-!76 = !DILocation(line: 6, scope: !35, inlinedAt: !72)
-!77 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
-!78 = !DILocation(line: 23, scope: !70)
-!79 = !DILocation(line: 0, scope: !35, inlinedAt: !78)
-!80 = !{i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str2, i64 0, i64 0)}
-!81 = !DILocalVariable(name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
-!82 = !DILocation(line: 6, scope: !35, inlinedAt: !78)
-!83 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
-!84 = !DILocation(line: 24, scope: !24)
-!85 = !DILocation(line: 0, scope: !35, inlinedAt: !84)
-!86 = !DILocalVariable(name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
-!87 = !DILocation(line: 6, scope: !35, inlinedAt: !84)
-!88 = !DILocation(line: 25, scope: !24)
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !4)
+!24 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barii", scope: !1, file: !1, line: 11, type: !25, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !28)
+!25 = !DISubroutineType(types: !26)
+!26 = !{null, !27, !27}
+!27 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!28 = !{!29, !30, !31, !32, !33}
+!29 = !DILocalVariable(name: "param1", arg: 1, scope: !24, file: !1, line: 11, type: !27)
+!30 = !DILocalVariable(name: "param2", arg: 2, scope: !24, file: !1, line: 11, type: !27)
+!31 = !DILocalVariable(name: "temp", scope: !24, file: !1, line: 12, type: !15)
+!32 = !DILocalVariable(name: "var1", scope: !24, file: !1, line: 17, type: !4)
+!33 = !DILocalVariable(name: "var2", scope: !24, file: !1, line: 18, type: !4)
+!34 = distinct !DISubprogram(name: "AAA3", linkageName: "_ZN4AAA3C2EPKc", scope: !4, file: !1, line: 5, type: !12, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !11, variables: !35)
+!35 = !{!36, !38}
+!36 = !DILocalVariable(name: "this", arg: 1, scope: !34, type: !37, flags: DIFlagArtificial | DIFlagObjectPointer)
+!37 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 64, align: 64)
+!38 = !DILocalVariable(name: "value", arg: 2, scope: !34, file: !1, line: 5, type: !15)
+!39 = distinct !DISubprogram(name: "operator=", linkageName: "_ZN4AAA3aSEPKc", scope: !4, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !17, variables: !40)
+!40 = !{!41, !42}
+!41 = !DILocalVariable(name: "this", arg: 1, scope: !39, type: !37, flags: DIFlagArtificial | DIFlagObjectPointer)
+!42 = !DILocalVariable(name: "value", arg: 2, scope: !39, file: !1, line: 6, type: !15)
+!43 = !{i32 2, !"Dwarf Version", i32 4}
+!44 = !{i32 2, !"Debug Info Version", i32 3}
+!45 = !{!"clang version 3.8.0 (trunk 255993) (llvm/trunk 256074)"}
+!46 = !DIExpression()
+!47 = !DILocation(line: 11, column: 15, scope: !24)
+!48 = !DILocation(line: 11, column: 26, scope: !24)
+!49 = !DILocation(line: 12, column: 16, scope: !24)
+!50 = !DILocation(line: 14, column: 7, scope: !51)
+!51 = distinct !DILexicalBlock(scope: !24, file: !1, line: 14, column: 7)
+!52 = !DILocation(line: 14, column: 7, scope: !24)
+!53 = !DILocation(line: 15, column: 12, scope: !54)
+!54 = distinct !DILexicalBlock(scope: !51, file: !1, line: 14, column: 15)
+!55 = !DILocation(line: 16, column: 3, scope: !54)
+!56 = !DILocation(line: 17, column: 3, scope: !24)
+!57 = !DIExpression(DW_OP_deref)
+!58 = !DILocation(line: 17, column: 8, scope: !24)
+!59 = !DILocation(line: 0, scope: !34, inlinedAt: !60)
+!60 = distinct !DILocation(line: 17, column: 8, scope: !61)
+!61 = !DILexicalBlockFile(scope: !24, file: !1, discriminator: 1)
+!62 = !DILocation(line: 5, column: 19, scope: !34, inlinedAt: !60)
+!63 = !DILocation(line: 5, column: 28, scope: !64, inlinedAt: !60)
+!64 = distinct !DILexicalBlock(scope: !34, file: !1, line: 5, column: 26)
+!65 = !DILocation(line: 18, column: 3, scope: !24)
+!66 = !DILocation(line: 18, column: 8, scope: !24)
+!67 = !DILocation(line: 0, scope: !34, inlinedAt: !68)
+!68 = distinct !DILocation(line: 18, column: 8, scope: !61)
+!69 = !DILocation(line: 5, column: 19, scope: !34, inlinedAt: !68)
+!70 = !DILocation(line: 5, column: 28, scope: !64, inlinedAt: !68)
+!71 = !DILocation(line: 20, column: 7, scope: !72)
+!72 = distinct !DILexicalBlock(scope: !24, file: !1, line: 20, column: 7)
+!73 = !DILocation(line: 0, scope: !39, inlinedAt: !74)
+!74 = distinct !DILocation(line: 23, column: 10, scope: !72)
+!75 = !DILocation(line: 20, column: 7, scope: !24)
+!76 = !DILocation(line: 6, column: 29, scope: !39, inlinedAt: !77)
+!77 = distinct !DILocation(line: 21, column: 10, scope: !72)
+!78 = !DILocation(line: 6, column: 38, scope: !39, inlinedAt: !77)
+!79 = !DILocation(line: 21, column: 5, scope: !72)
+!80 = !DILocation(line: 6, column: 29, scope: !39, inlinedAt: !74)
+!81 = !DILocation(line: 6, column: 38, scope: !39, inlinedAt: !74)
+!82 = !DILocation(line: 0, scope: !39, inlinedAt: !83)
+!83 = distinct !DILocation(line: 24, column: 8, scope: !24)
+!84 = !DILocation(line: 6, column: 29, scope: !39, inlinedAt: !83)
+!85 = !DILocation(line: 6, column: 38, scope: !39, inlinedAt: !83)
+!86 = !DILocation(line: 25, column: 1, scope: !24)
+!87 = !DILocation(line: 25, column: 1, scope: !61)
diff --git a/test/CodeGen/X86/dbg-combine.ll b/test/CodeGen/X86/dbg-combine.ll
index 3e78c316a06f..3a44fe186f97 100644
--- a/test/CodeGen/X86/dbg-combine.ll
+++ b/test/CodeGen/X86/dbg-combine.ll
@@ -74,11 +74,10 @@ attributes #2 = { nounwind }
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.7.0 (trunk 227074)", isOptimized: false, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.7.0 (trunk 227074)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "dbg-combine.c", directory: "/home/probinson/projects/scratch")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, isOptimized: false, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, isOptimized: false, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2)
!5 = !DIFile(filename: "dbg-combine.c", directory: "/home/probinson/projects/scratch")
!6 = !DISubroutineType(types: !7)
!7 = !{!8}
diff --git a/test/CodeGen/X86/debugloc-argsize.ll b/test/CodeGen/X86/debugloc-argsize.ll
index 0283154abab2..75a791757c01 100644
--- a/test/CodeGen/X86/debugloc-argsize.ll
+++ b/test/CodeGen/X86/debugloc-argsize.ll
@@ -38,11 +38,10 @@ attributes #2 = { nounwind }
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 249520)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 249520)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "foo.cpp", directory: "foo")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, variables: !2)
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
!5 = !DISubroutineType(types: !6)
!6 = !{null}
!7 = !{i32 2, !"Dwarf Version", i32 4}
diff --git a/test/CodeGen/X86/deopt-bundles.ll b/test/CodeGen/X86/deopt-bundles.ll
new file mode 100644
index 000000000000..1fb73ea252ee
--- /dev/null
+++ b/test/CodeGen/X86/deopt-bundles.ll
@@ -0,0 +1,161 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -O3 -debug-only=stackmaps < %s 2>&1 | FileCheck -check-prefix=STACKMAPS %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+
+; STACKMAPS: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 4242
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 4243
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 16 [encoding: .byte 4, .byte 8, .short 0, .int 16]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 2 [encoding: .byte 4, .byte 8, .short 0, .int 2]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 3 [encoding: .byte 4, .byte 8, .short 0, .int 3]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 4243
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 55 [encoding: .byte 4, .byte 8, .short 0, .int 55]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+
+
+declare i32 @callee_0()
+declare i32 @callee_1(i32)
+declare i32 @callee_vararg(...)
+
+define i32 @caller_0() {
+; CHECK-LABEL: _caller_0
+entry:
+ %v = call i32 @callee_0() [ "deopt"(i32 0) ]
+ %v2 = add i32 %v, 1
+ ret i32 %v2
+; CHECK: callq _callee_0
+; CHECK: incl %eax
+; CHECK: retq
+}
+
+define i32 @caller_1() {
+; CHECK-LABEL: _caller_1
+entry:
+ %v = call i32 @callee_1(i32 42) "statepoint-id"="4242" [ "deopt"(i32 1) ]
+ ret i32 %v
+; CHECK: callq _callee_1
+; CHECK: popq %rcx
+; CHECK: retq
+}
+
+define i32 @caller_vararg() {
+; CHECK-LABEL: _caller_vararg
+entry:
+; CHECK: movb $1, %al
+; CHECK: callq _callee_vararg
+ %v = call i32(...) @callee_vararg(i32 42, double 500.0) "statepoint-id"="4243" [ "deopt"(i32 16) ]
+ ret i32 %v
+}
+
+define i32 @invoker_0() personality i8 0 {
+; CHECK-LABEL: _invoker_0
+entry:
+ %v = invoke i32 @callee_0() [ "deopt"(i32 2) ]
+ to label %normal unwind label %uw
+
+normal:
+ ret i32 %v
+
+uw:
+ %ehvals = landingpad { i8*, i32 }
+ cleanup
+ ret i32 1
+; CHECK: callq _callee_0
+; CHECK: popq %rcx
+; CHECK: retq
+; CHECK: movl $1, %eax
+; CHECK: popq %rcx
+; CHECK: retq
+}
+
+define i32 @invoker_1() personality i8 0 {
+; CHECK-LABEL: _invoker_1
+entry:
+ %v = invoke i32 @callee_1(i32 45) "statepoint-num-patch-bytes"="9" [ "deopt"(i32 3) ]
+ to label %normal unwind label %uw
+
+normal:
+ ret i32 %v
+
+uw:
+ %ehvals = landingpad { i8*, i32 }
+ cleanup
+ ret i32 1
+; CHECK: movl $45, %edi
+; CHECK: nopw 512(%rax,%rax)
+; CHECK: popq %rcx
+; CHECK: retq
+; CHECK: movl $1, %eax
+; CHECK: popq %rcx
+; CHECK: retq
+}
+
+define i32 @invoker_2() personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %val = invoke i32 @callee_1(i32 1)
+ to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:
+ %cs1 = catchswitch within none [label %catch] unwind to caller
+
+catch:
+ %cp1 = catchpad within %cs1 [i8* null, i32 64, i8* null]
+ %val2 = call i32 @callee_1(i32 100) "statepoint-id"="4243" [ "funclet"(token %cp1), "deopt"(i32 55) ]
+ catchret from %cp1 to label %try.cont
+
+try.cont:
+ ret i32 0
+}
+
+declare i32 @__CxxFrameHandler3(...)
+
+define void @f_0(i64 %n) {
+ ; CHECK-LABEL: _f_0
+ %s = alloca i64
+ %vl = alloca i64, i64 %n
+ ; Check that the stackmap does not reference %s through
+ ; SP since the offset is not static because of %vl.
+ ; STACKMAPS: Loc 3: Direct 6
+ call void @g_0(i64* %vl) [ "deopt"(i64* %s) ]
+ ret void
+}
+
+declare void @g_0(i64* %vl)
diff --git a/test/CodeGen/X86/deopt-intrinsic-cconv.ll b/test/CodeGen/X86/deopt-intrinsic-cconv.ll
new file mode 100644
index 000000000000..8e240f8901d8
--- /dev/null
+++ b/test/CodeGen/X86/deopt-intrinsic-cconv.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -debug-only=stackmaps < %s 2>&1 | FileCheck --check-prefix=STACKMAPS %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare webkit_jscc i64 @llvm.experimental.deoptimize.i64(...)
+
+define i64 @caller_1() {
+; CHECK-LABEL: _caller_1:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: ##{{.+}}
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: movl $1140457472, (%rsp) ## imm = 0x43FA0000
+; CHECK-NEXT: movl $42, %eax
+; CHECK-NEXT: callq ___llvm_deoptimize
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+
+entry:
+ %v = call webkit_jscc i64(...) @llvm.experimental.deoptimize.i64(i32 42, float 500.0) [ "deopt"(i32 3) ]
+ ret i64 %v
+}
+
+; STACKMAPS: Stack Maps: callsites:
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 12 [encoding: .byte 4, .byte 8, .short 0, .int 12]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 3 [encoding: .byte 4, .byte 8, .short 0, .int 3]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
diff --git a/test/CodeGen/X86/deopt-intrinsic.ll b/test/CodeGen/X86/deopt-intrinsic.ll
new file mode 100644
index 000000000000..ceed2d248821
--- /dev/null
+++ b/test/CodeGen/X86/deopt-intrinsic.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -debug-only=stackmaps < %s 2>&1 | FileCheck --check-prefix=STACKMAPS %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare i32 @llvm.experimental.deoptimize.i32(...)
+declare i8 @llvm.experimental.deoptimize.i8(...)
+
+define i32 @caller_0() {
+; CHECK-LABEL: _caller_0:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: ##{{.+}}
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: callq ___llvm_deoptimize
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+entry:
+ %v = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 0) ]
+ ret i32 %v
+}
+
+define i8 @caller_1() {
+; CHECK-LABEL: _caller_1:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: ##{{.+}}
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: movss {{[a-zA-Z0-9_]+}}(%rip), %xmm0 ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movl $42, %edi
+; CHECK-NEXT: callq ___llvm_deoptimize
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+
+entry:
+ %v = call i8(...) @llvm.experimental.deoptimize.i8(i32 42, float 500.0) [ "deopt"(i32 1) ]
+ ret i8 %v
+}
+
+; STACKMAPS: Stack Maps: callsites:
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index 58e25f923971..e45f3ba91495 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
+; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck -check-prefix=CHECK -check-prefix=MINGW %s
+; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck -check-prefix=NOTEXPORTED %s
+; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck -check-prefix=NOTEXPORTED %s
; CHECK: .text
@@ -50,11 +52,19 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl WeakVar2
@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
+; CHECK: .bss
+; CHECK: .globl WeakVar3
+@WeakVar3 = weak_odr dllexport global i32 0, align 4
+
; CHECK: .globl alias
; CHECK: alias = notExported
@alias = dllexport alias void(), void()* @notExported
+; CHECK: .globl aliasNotExported
+; CHECK: aliasNotExported = f1
+@aliasNotExported = alias void(), void()* @f1
+
; CHECK: .globl alias2
; CHECK: alias2 = f1
@alias2 = dllexport alias void(), void()* @f1
@@ -70,6 +80,23 @@ define weak_odr dllexport void @weak1() {
@blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
@blob_alias = dllexport alias i32 (), bitcast ([6 x i8]* @blob to i32 ()*)
+@exportedButNotDefinedVariable = external dllexport global i32
+declare dllexport void @exportedButNotDefinedFunction()
+define void @foo() {
+entry:
+ store i32 4, i32* @exportedButNotDefinedVariable, align 4
+ call void @exportedButNotDefinedFunction()
+ ret void
+}
+
+; Verify items that should not be exported do not appear in the export table.
+; We use a separate check prefix to avoid confusion between -NOT and -SAME.
+; NOTEXPORTED: .section .drectve
+; NOTEXPORTED-NOT: notExported
+; NOTEXPORTED-NOT: aliasNotExported
+; NOTEXPORTED-NOT: exportedButNotDefinedVariable
+; NOTEXPORTED-NOT: exportedButNotDefinedFunction
+
; CHECK: .section .drectve
; WIN32: /EXPORT:f1
; WIN32-SAME: /EXPORT:f2
@@ -81,6 +108,7 @@ define weak_odr dllexport void @weak1() {
; WIN32-SAME: /EXPORT:Var3,DATA
; WIN32-SAME: /EXPORT:WeakVar1,DATA
; WIN32-SAME: /EXPORT:WeakVar2,DATA
+; WIN32-SAME: /EXPORT:WeakVar3,DATA
; WIN32-SAME: /EXPORT:alias
; WIN32-SAME: /EXPORT:alias2
; WIN32-SAME: /EXPORT:alias3
@@ -96,6 +124,7 @@ define weak_odr dllexport void @weak1() {
; MINGW-SAME: -export:Var3,data
; MINGW-SAME: -export:WeakVar1,data
; MINGW-SAME: -export:WeakVar2,data
+; MINGW-SAME: -export:WeakVar3,data
; MINGW-SAME: -export:alias
; MINGW-SAME: -export:alias2
; MINGW-SAME: -export:alias3
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index cde0955410b7..d833f3c22ffc 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -4,6 +4,12 @@
; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-GCC %s
; RUN: llc -mtriple i686-pc-cygwin %s -o - \
; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-GCC %s
+; RUN: llc -mtriple i386-pc-win32 < %s \
+; RUN: | FileCheck -check-prefix NOTEXPORTED %s
+; RUN: llc -mtriple i386-pc-mingw32 < %s \
+; RUN: | FileCheck -check-prefix NOTEXPORTED %s
+; RUN: llc -mtriple i686-pc-cygwin %s -o - \
+; RUN: | FileCheck -check-prefix NOTEXPORTED %s
; CHECK: .text
@@ -21,7 +27,7 @@ define dllexport void @f2() unnamed_addr {
ret void
}
-declare dllexport void @not_defined()
+declare dllexport void @notDefined()
; CHECK: .globl _stdfun@0
define dllexport x86_stdcallcc void @stdfun() nounwind {
@@ -88,8 +94,13 @@ define weak_odr dllexport void @weak1() {
; CHECK: _weak_alias = _f1
@weak_alias = weak_odr dllexport alias void(), void()* @f1
+; Verify items that should not be exported do not appear in the export table.
+; We use a separate check prefix to avoid confusion between -NOT and -SAME.
+; NOTEXPORTED: .section .drectve
+; NOTEXPORTED-NOT: notExported
+; NOTEXPORTED-NOT: notDefined
+
; CHECK: .section .drectve
-; CHECK-CL-NOT: not_exported
; CHECK-CL: /EXPORT:_f1
; CHECK-CL-SAME: /EXPORT:_f2
; CHECK-CL-SAME: /EXPORT:_stdfun@0
@@ -107,8 +118,6 @@ define weak_odr dllexport void @weak1() {
; CHECK-CL-SAME: /EXPORT:_alias2
; CHECK-CL-SAME: /EXPORT:_alias3
; CHECK-CL-SAME: /EXPORT:_weak_alias"
-; CHECK-CL-NOT: not_exported
-; CHECK-GCC-NOT: not_exported
; CHECK-GCC: -export:f1
; CHECK-GCC-SAME: -export:f2
; CHECK-GCC-SAME: -export:stdfun@0
@@ -126,4 +135,3 @@ define weak_odr dllexport void @weak1() {
; CHECK-GCC-SAME: -export:alias2
; CHECK-GCC-SAME: -export:alias3
; CHECK-GCC-SAME: -export:weak_alias"
-; CHECK-GCC-NOT: not_exported
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index 31d2724aade3..b744a70288e5 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: 0, file: !4, enums: !2, retainedTypes: !7, subprograms: !2, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !7, globals: !2)
!2 = !{}
!3 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
!4 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
diff --git a/test/CodeGen/X86/dynamic-alloca-in-entry.ll b/test/CodeGen/X86/dynamic-alloca-in-entry.ll
index 7ed471c2f502..2b5721d7fcf1 100644
--- a/test/CodeGen/X86/dynamic-alloca-in-entry.ll
+++ b/test/CodeGen/X86/dynamic-alloca-in-entry.ll
@@ -15,5 +15,5 @@ define void @bar() {
ret void
}
; CHECK-LABEL: _bar:
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
; CHECK: retl
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
index b0334d6a63ef..71e589275ede 100644
--- a/test/CodeGen/X86/dynamic-allocas-VLAs.ll
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
+; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
; rdar://11496434
; no VLAs or dynamic alignment
@@ -60,12 +60,10 @@ entry:
; CHECK: _t3
; CHECK: pushq %rbp
; CHECK: movq %rsp, %rbp
-; CHECK: pushq %rbx
; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
; CHECK: subq ${{[0-9]+}}, %rsp
;
-; CHECK: leaq -{{[0-9]+}}(%rbp), %rsp
-; CHECK: popq %rbx
+; CHECK: movq %rbp, %rsp
; CHECK: popq %rbp
}
@@ -85,7 +83,6 @@ entry:
; CHECK: _t4
; CHECK: pushq %rbp
; CHECK: movq %rsp, %rbp
-; CHECK: pushq %r14
; CHECK: pushq %rbx
; CHECK: andq $-32, %rsp
; CHECK: subq ${{[0-9]+}}, %rsp
@@ -95,9 +92,8 @@ entry:
; CHECK: leaq {{[0-9]*}}(%rbx), %rdx
; CHECK: callq _t4_helper
;
-; CHECK: leaq -16(%rbp), %rsp
+; CHECK: leaq -{{[0-9]+}}(%rbp), %rsp
; CHECK: popq %rbx
-; CHECK: popq %r14
; CHECK: popq %rbp
}
diff --git a/test/CodeGen/X86/eflags-copy-expansion.mir b/test/CodeGen/X86/eflags-copy-expansion.mir
new file mode 100644
index 000000000000..bf2d0be67c12
--- /dev/null
+++ b/test/CodeGen/X86/eflags-copy-expansion.mir
@@ -0,0 +1,67 @@
+# RUN: llc -run-pass postrapseudos -mtriple=i386-apple-macosx -o - %s | FileCheck %s
+
+# Verify that we correctly save and restore eax when copying eflags,
+# even when only a smaller alias of eax is used. We used to check only
+# eax and not its aliases.
+# PR27624.
+
+--- |
+ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+ define void @foo() {
+ entry:
+ br label %false
+ false:
+ ret void
+ }
+
+...
+
+---
+name: foo
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0.entry:
+ liveins: %edi
+ successors: %bb.1.false
+ NOOP implicit-def %al
+
+  ; The bug was triggered only when LivePhysRegs is used, which
+  ; happens only when the heuristic for the liveness computation
+  ; fails. The liveness computation heuristic looks at 10 instructions
+ ; before and after the copy. Make sure we do not reach the definition of
+ ; AL in 10 instructions, otherwise the heuristic will see that it is live.
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ ; Save AL.
+ ; CHECK: PUSH32r killed %eax
+
+ ; Copy EDI into EFLAGS
+ ; CHECK-NEXT: %eax = MOV32rr %edi
+ ; CHECK-NEXT: %al = ADD8ri %al, 127, implicit-def %eflags
+ ; CHECK-NEXT: SAHF implicit-def %eflags, implicit %ah
+ %eflags = COPY %edi
+
+ ; Restore AL.
+ ; CHECK-NEXT: %eax = POP32r
+ bb.1.false:
+ liveins: %al
+ NOOP implicit %al
+ RETQ
+
+...
diff --git a/test/CodeGen/X86/emutls-pic.ll b/test/CodeGen/X86/emutls-pic.ll
index 11676aff1892..50dc72653aea 100644
--- a/test/CodeGen/X86/emutls-pic.ll
+++ b/test/CodeGen/X86/emutls-pic.ll
@@ -82,28 +82,29 @@ entry:
}
; X32-LABEL: f5:
-; X32: movl __emutls_v.j@GOT(%ebx), %eax
+; X32: leal __emutls_v.j@GOTOFF(%ebx), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: movl (%eax), %esi
-; X32-NEXT: movl __emutls_v.k@GOT(%ebx), %eax
+; X32-NEXT: leal __emutls_v.k@GOTOFF(%ebx), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: addl (%eax), %esi
; X32-NEXT: movl %esi, %eax
; X64-LABEL: f5:
-; X64: movq __emutls_v.j@GOTPCREL(%rip), %rdi
+; X64: leaq __emutls_v.j(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: movl (%rax), %ebx
-; X64-NEXT: movq __emutls_v.k@GOTPCREL(%rip), %rdi
+; X64-NEXT: leaq __emutls_v.k(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: addl (%rax), %ebx
; X64-NEXT: movl %ebx, %eax
;;;;; 32-bit targets
-; X32: .data
+; X32: .data{{$}}
+; X32: .globl __emutls_v.i
; X32-LABEL: __emutls_v.i:
; X32-NEXT: .long 4
; X32-NEXT: .long 4
@@ -114,7 +115,8 @@ entry:
; X32-LABEL: __emutls_t.i:
; X32-NEXT: .long 15
-; X32: .data
+; X32: .data{{$}}
+; X32-NOT: .globl
; X32-LABEL: __emutls_v.j:
; X32-NEXT: .long 4
; X32-NEXT: .long 4
@@ -125,7 +127,8 @@ entry:
; X32-LABEL: __emutls_t.j:
; X32-NEXT: .long 42
-; X32: .data
+; X32: .data{{$}}
+; X32-NOT: .globl
; X32-LABEL: __emutls_v.k:
; X32-NEXT: .long 4
; X32-NEXT: .long 8
@@ -136,7 +139,8 @@ entry:
;;;;; 64-bit targets
-; X64: .data
+; X64: .data{{$}}
+; X64: .globl __emutls_v.i
; X64-LABEL: __emutls_v.i:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 4
@@ -147,7 +151,8 @@ entry:
; X64-LABEL: __emutls_t.i:
; X64-NEXT: .long 15
-; X64: .data
+; X64: .data{{$}}
+; X64-NOT: .globl
; X64-LABEL: __emutls_v.j:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 4
@@ -158,7 +163,8 @@ entry:
; X64-LABEL: __emutls_t.j:
; X64-NEXT: .long 42
-; X64: .data
+; X64: .data{{$}}
+; X64-NOT: .globl
; X64-LABEL: __emutls_v.k:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 8
diff --git a/test/CodeGen/X86/emutls-pie.ll b/test/CodeGen/X86/emutls-pie.ll
index 45e5c38c0d8a..5db8c888a4e4 100644
--- a/test/CodeGen/X86/emutls-pie.ll
+++ b/test/CodeGen/X86/emutls-pie.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
-; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-android -relocation-model=pic -enable-pie \
+; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-android -relocation-model=pic -enable-pie \
+; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
; Use my_emutls_get_address like __emutls_get_address.
@@ -39,7 +39,7 @@ entry:
define i32 @f1() {
; X32-LABEL: f1:
-; X32: movl __emutls_v.i@GOT(%ebx), %eax
+; X32: leal __emutls_v.i@GOTOFF(%ebx), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: movl (%eax), %eax
@@ -47,7 +47,7 @@ define i32 @f1() {
; X32-NEXT: popl %ebx
; X32-NEXT: retl
; X64-LABEL: f1:
-; X64: movq __emutls_v.i@GOTPCREL(%rip), %rdi
+; X64: leaq __emutls_v.i(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
@@ -60,11 +60,11 @@ entry:
define i32* @f2() {
; X32-LABEL: f2:
-; X32: movl __emutls_v.i@GOT(%ebx), %eax
+; X32: leal __emutls_v.i@GOTOFF(%ebx), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll __emutls_get_address@PLT
; X64-LABEL: f2:
-; X64: movq __emutls_v.i@GOTPCREL(%rip), %rdi
+; X64: leaq __emutls_v.i(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
entry:
@@ -129,3 +129,8 @@ entry:
; X64-NOT: __emutls_v.i2
; X64-NOT: __emutls_t.i2
+
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"PIC Level", i32 1}
+!1 = !{i32 1, !"PIE Level", i32 1}
diff --git a/test/CodeGen/X86/emutls_generic.ll b/test/CodeGen/X86/emutls_generic.ll
index b99a195426c2..16d90001426f 100644
--- a/test/CodeGen/X86/emutls_generic.ll
+++ b/test/CodeGen/X86/emutls_generic.ll
@@ -45,18 +45,19 @@ entry:
; CHECK: __emutls_t.internal_y
; X86_32-LABEL: get_external_x:
-; X86_32: movl __emutls_v.external_x
+; X86_32: movl __emutls_v.external_x@GOT(%ebx)
; X86_32: calll __emutls_get_address
; X86_32-LABEL: get_external_y:
-; X86_32: movl __emutls_v.external_y
+; X86_32: movl __emutls_v.external_y@GOT(%ebx)
; X86_32: calll __emutls_get_address
; X86_32-LABEL: get_internal_y:
-; X86_32: movl __emutls_v.internal_y
-; X86_32: calll __emutls_get_address
-; X86_32-NOT: __emutls_t.external_x
-; X86_32-NOT: __emutls_v.external_x:
-; X86_32: .data
-; X86_32: .align 4
+; X86_32: leal __emutls_v.internal_y@GOTOFF(%ebx)
+; X86_32: calll __emutls_get_address
+; X86_32-NOT: __emutls_t.external_x
+; X86_32-NOT: __emutls_v.external_x:
+; X86_32: .data{{$}}
+; X86_32: .globl __emutls_v.external_y
+; X86_32: .p2align 2
; X86_32-LABEL: __emutls_v.external_y:
; X86_32-NEXT: .long 1
; X86_32-NEXT: .long 2
@@ -65,8 +66,9 @@ entry:
; X86_32: .section .rodata,
; X86_32-LABEL: __emutls_t.external_y:
; X86_32-NEXT: .byte 7
-; X86_32: .data
-; X86_32: .align 4
+; X86_32: .data{{$}}
+; X86_32-NOT: .globl
+; X86_32: .p2align 2
; X86_32-LABEL: __emutls_v.internal_y:
; X86_32-NEXT: .long 8
; X86_32-NEXT: .long 16
@@ -75,17 +77,18 @@ entry:
; X86_32-LABEL: __emutls_t.internal_y:
; X86_32-NEXT: .quad 9
; X86_64-LABEL: get_external_x:
-; X86_64: __emutls_v.external_x
-; X86_64: __emutls_get_address
+; X86_64: __emutls_v.external_x@GOTPCREL(%rip)
+; X86_64: __emutls_get_address
; X86_64-LABEL: get_external_y:
-; X86_64: __emutls_v.external_y
-; X86_64: __emutls_get_address
+; X86_64: __emutls_v.external_y@GOTPCREL(%rip)
+; X86_64: __emutls_get_address
; X86_64-LABEL: get_internal_y:
-; X86_64: __emutls_v.internal_y
-; X86_64: __emutls_get_address
-; X86_64-NOT: __emutls_t.external_x
-; X86_64-NOT: __emutls_v.external_x:
-; X86_64: .align 8
+; X86_64: __emutls_v.internal_y(%rip)
+; X86_64: __emutls_get_address
+; X86_64-NOT: __emutls_t.external_x
+; X86_64-NOT: __emutls_v.external_x:
+; X86_64: .globl __emutls_v.external_y
+; X86_64: .p2align 3
; X86_64-LABEL: __emutls_v.external_y:
; X86_64-NEXT: .quad 1
; X86_64-NEXT: .quad 2
@@ -95,8 +98,9 @@ entry:
; X86_64: .section .rodata,
; X86_64-LABEL: __emutls_t.external_y:
; X86_64-NEXT: .byte 7
-; X86_64: .data
-; X86_64: .align 8
+; X86_64: .data{{$}}
+; X86_64-NOT: .globl
+; X86_64: .p2align 3
; X86_64-LABEL: __emutls_v.internal_y:
; X86_64-NEXT: .quad 8
; X86_64-NEXT: .quad 16
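The updated checks separate two addressing patterns for emulated TLS under PIC: the __emutls_v control variable of an externally visible TLS variable is loaded through the GOT, while an internal one can be formed with a plain lea (GOT-relative on x86-32, RIP-relative on x86-64). A small sketch with hypothetical variable names, assuming it is built with -emulated-tls and -relocation-model=pic as in the RUN lines shown earlier:

@ext_var = thread_local global i32 1, align 4             ; expect __emutls_v.ext_var reached via the GOT
@int_var = internal thread_local global i32 2, align 4    ; expect a lea of __emutls_v.int_var

define i32 @sum_both() {
entry:
  %a = load i32, i32* @ext_var, align 4
  %b = load i32, i32* @int_var, align 4
  %s = add i32 %a, %b
  ret i32 %s
}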
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
index ab92fe0d1d0c..992b3a395e7b 100644
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -1,13 +1,16 @@
-; RUN: llc -O3 -mtriple=x86_64-apple-macosx -o - < %s -mattr=+avx2 -enable-unsafe-fp-math -mcpu=core2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 -enable-unsafe-fp-math | FileCheck %s
+
; Check that the ExeDepsFix pass correctly fixes the domain for broadcast instructions.
; <rdar://problem/16354675>
-; CHECK-LABEL: ExeDepsFix_broadcastss
-; CHECK: broadcastss
-; CHECK: vandps
-; CHECK: vmaxps
-; CHECK: ret
define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2) {
+; CHECK-LABEL: ExeDepsFix_broadcastss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%bitcast = bitcast <4 x float> %arg to <4 x i32>
%and = and <4 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%floatcast = bitcast <4 x i32> %and to <4 x float>
@@ -16,12 +19,13 @@ define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2)
ret <4 x float> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastss256
-; CHECK: broadcastss
-; CHECK: vandps
-; CHECK: vmaxps
-; CHECK: ret
define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg2) {
+; CHECK-LABEL: ExeDepsFix_broadcastss256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bitcast = bitcast <8 x float> %arg to <8 x i32>
%and = and <8 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%floatcast = bitcast <8 x i32> %and to <8 x float>
@@ -30,13 +34,14 @@ define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg
ret <8 x float> %max
}
-
-; CHECK-LABEL: ExeDepsFix_broadcastss_inreg
-; CHECK: broadcastss
-; CHECK: vandps
-; CHECK: vmaxps
-; CHECK: ret
define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %arg2, i32 %broadcastvalue) {
+; CHECK-LABEL: ExeDepsFix_broadcastss_inreg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm2
+; CHECK-NEXT: vbroadcastss %xmm2, %xmm2
+; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%bitcast = bitcast <4 x float> %arg to <4 x i32>
%in = insertelement <4 x i32> undef, i32 %broadcastvalue, i32 0
%mask = shufflevector <4 x i32> %in, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -47,12 +52,14 @@ define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %
ret <4 x float> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg
-; CHECK: broadcastss
-; CHECK: vandps
-; CHECK: vmaxps
-; CHECK: ret
define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float> %arg2, i32 %broadcastvalue) {
+; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm2
+; CHECK-NEXT: vbroadcastss %xmm2, %ymm2
+; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bitcast = bitcast <8 x float> %arg to <8 x i32>
%in = insertelement <8 x i32> undef, i32 %broadcastvalue, i32 0
%mask = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -63,12 +70,13 @@ define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float
ret <8 x float> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastsd
; In that case the broadcast is directly folded into vandpd.
-; CHECK: vandpd
-; CHECK: vmaxpd
-; CHECK:ret
define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg2) {
+; CHECK-LABEL: ExeDepsFix_broadcastsd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%bitcast = bitcast <2 x double> %arg to <2 x i64>
%and = and <2 x i64> %bitcast, <i64 2147483647, i64 2147483647>
%floatcast = bitcast <2 x i64> %and to <2 x double>
@@ -77,12 +85,13 @@ define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg
ret <2 x double> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastsd256
-; CHECK: broadcastsd
-; CHECK: vandpd
-; CHECK: vmaxpd
-; CHECK: ret
define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %arg2) {
+; CHECK-LABEL: ExeDepsFix_broadcastsd256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
+; CHECK-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bitcast = bitcast <4 x double> %arg to <4 x i64>
%and = and <4 x i64> %bitcast, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
%floatcast = bitcast <4 x i64> %and to <4 x double>
@@ -91,16 +100,16 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
ret <4 x double> %max
}
-
-; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
; vpand and there is nothing more you can do to match vmaxpd.
-; CHECK: vmovq
-; CHECK: vpbroadcastq
-; CHECK: vpand
-; CHECK: vmaxpd
-; CHECK: ret
define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
+; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %rdi, %xmm2
+; CHECK-NEXT: vpbroadcastq %xmm2, %xmm2
+; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%bitcast = bitcast <2 x double> %arg to <2 x i64>
%in = insertelement <2 x i64> undef, i64 %broadcastvalue, i32 0
%mask = shufflevector <2 x i64> %in, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -111,12 +120,14 @@ define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double
ret <2 x double> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg
-; CHECK: broadcastsd
-; CHECK: vandpd
-; CHECK: vmaxpd
-; CHECK: ret
define <4 x double> @ExeDepsFix_broadcastsd256_inreg(<4 x double> %arg, <4 x double> %arg2, i64 %broadcastvalue) {
+; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %rdi, %xmm2
+; CHECK-NEXT: vbroadcastsd %xmm2, %ymm2
+; CHECK-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bitcast = bitcast <4 x double> %arg to <4 x i64>
%in = insertelement <4 x i64> undef, i64 %broadcastvalue, i32 0
%mask = shufflevector <4 x i64> %in, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -126,4 +137,3 @@ define <4 x double> @ExeDepsFix_broadcastsd256_inreg(<4 x double> %arg, <4 x dou
%max = select <4 x i1> %max_is_x, <4 x double> %floatcast, <4 x double> %arg2
ret <4 x double> %max
}
-
diff --git a/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/test/CodeGen/X86/expand-vr64-gr64-copy.mir
index 8ce1c7eaae70..3598c045ad53 100644
--- a/test/CodeGen/X86/expand-vr64-gr64-copy.mir
+++ b/test/CodeGen/X86/expand-vr64-gr64-copy.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass postrapseudos -mtriple=x86_64-unknown-unknown -mattr=+3dnow -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass postrapseudos -mtriple=x86_64-unknown-unknown -mattr=+3dnow -o - %s | FileCheck %s
# This test verifies that the ExpandPostRA pass expands the GR64 <-> VR64
# copies into appropriate MMX_MOV instructions.
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index ab3ff8ed435e..eb7cdb6b57be 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -1,51 +1,636 @@
-; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s -check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+;
+; ExtractElement - Constant Index
+;
-; CHECK-LABEL: extractelement_index_1:
-define i8 @extractelement_index_1(<32 x i8> %a) nounwind {
- ; X64: movaps
- ; AVX: vpextrb $1
+define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v16i8_1:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v16i8_1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $1, %xmm0, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i8_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <16 x i8> %a, i256 1
+ ret i8 %b
+}
+
+define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v16i8_11:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v16i8_11:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $11, %xmm0, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i8_11:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <16 x i8> %a, i256 11
+ ret i8 %b
+}
+
+define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v16i8_14:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v16i8_14:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $14, %xmm0, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i8_14:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <16 x i8> %a, i256 14
+ ret i8 %b
+}
+
+define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v32i8_1:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v32i8_1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $1, %xmm0, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v32i8_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%b = extractelement <32 x i8> %a, i256 1
ret i8 %b
}
-; CHECK-LABEL: extractelement_index_2:
-define i32 @extractelement_index_2(<8 x i32> %a) nounwind {
- ; X64: pshufd
- ; AVX: vextractf128 $1
- ; AVX-NEXT: vpextrd $3
- %b = extractelement <8 x i32> %a, i64 7
+define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v32i8_17:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v32i8_17:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $1, %xmm1, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v32i8_17:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v32i8_17:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <32 x i8> %a, i256 17
+ ret i8 %b
+}
+
+define i16 @extractelement_v8i16_0(<8 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v8i16_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i16_0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <8 x i16> %a, i256 0
+ ret i16 %b
+}
+
+define i16 @extractelement_v8i16_3(<8 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v8i16_3:
+; SSE: # BB#0:
+; SSE-NEXT: pextrw $3, %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i16_3:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrw $3, %xmm0, %eax
+; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <8 x i16> %a, i256 3
+ ret i16 %b
+}
+
+define i16 @extractelement_v16i16_0(<16 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i16_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i16_0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <16 x i16> %a, i256 0
+ ret i16 %b
+}
+
+define i16 @extractelement_v16i16_13(<16 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i16_13:
+; SSE: # BB#0:
+; SSE-NEXT: pextrw $5, %xmm1, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v16i16_13:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrw $5, %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v16i16_13:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrw $5, %xmm0, %eax
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <16 x i16> %a, i256 13
+ ret i16 %b
+}
+
+define i32 @extractelement_v4i32_0(<4 x i32> %a) nounwind {
+; SSE-LABEL: extractelement_v4i32_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i32_0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
+ %b = extractelement <4 x i32> %a, i256 0
ret i32 %b
}
-; CHECK-LABEL: extractelement_index_3:
-define i32 @extractelement_index_3(<8 x i32> %a) nounwind {
- ; CHECK-NOT: pextr
- %b = extractelement <8 x i32> %a, i64 15
+define i32 @extractelement_v4i32_3(<4 x i32> %a) nounwind {
+; SSE2-LABEL: extractelement_v4i32_3:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v4i32_3:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i32_3:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: retq
+ %b = extractelement <4 x i32> %a, i256 3
+ ret i32 %b
+}
+
+define i32 @extractelement_v8i32_0(<8 x i32> %a) nounwind {
+; SSE-LABEL: extractelement_v8i32_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i32_0:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <8 x i32> %a, i256 4
ret i32 %b
}
-; CHECK-LABEL: extractelement_index_4:
-define i32 @extractelement_index_4(<8 x i32> %a) nounwind {
- ; X64: movd
- ; AVX: vextractf128 $1
- ; AVX-NEXT: vmovd
+define i32 @extractelement_v8i32_4(<8 x i32> %a) nounwind {
+; SSE-LABEL: extractelement_v8i32_4:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i32_4:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%b = extractelement <8 x i32> %a, i256 4
ret i32 %b
}
-; CHECK-LABEL: extractelement_index_5:
-define i8 @extractelement_index_5(<32 x i8> %a, i256 %i) nounwind {
- ; X64: movaps
- ; AVX: vmovaps
+define i32 @extractelement_v8i32_7(<8 x i32> %a) nounwind {
+; SSE2-LABEL: extractelement_v8i32_7:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v8i32_7:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $3, %xmm1, %eax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v8i32_7:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v8i32_7:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrd $3, %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <8 x i32> %a, i64 7
+ ret i32 %b
+}
+
+define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v2i64_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v2i64_0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
+ %b = extractelement <2 x i64> %a, i256 0
+ ret i64 %b
+}
+
+define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
+; SSE2-LABEL: extractelement_v2i64_1:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v2i64_1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v2i64_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: retq
+ %b = extractelement <2 x i64> %a, i256 1
+ ret i64 %b
+}
+
+define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
+; SSE2-LABEL: extractelement_v4i64_1:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v4i64_1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i64_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <4 x i64> %a, i256 1
+ ret i64 %b
+}
+
+define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
+; SSE2-LABEL: extractelement_v4i64_3:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v4i64_3:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v4i64_3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v4i64_3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <4 x i64> %a, i256 3
+ ret i64 %b
+}
+
+;
+; ExtractElement - Variable Index
+;
+
+define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i8_var:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movb (%rdi,%rax), %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i8_var:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movb (%rdi,%rax), %al
+; AVX-NEXT: retq
+ %b = extractelement <16 x i8> %a, i256 %i
+ ret i8 %b
+}
+
+define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v32i8_var:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: leaq (%rsp), %rax
+; SSE-NEXT: movb (%rdi,%rax), %al
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v32i8_var:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: leaq (%rsp), %rax
+; AVX-NEXT: movb (%rdi,%rax), %al
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%b = extractelement <32 x i8> %a, i256 %i
ret i8 %b
}
-; CHECK-LABEL: extractelement_index_6:
-define i8 @extractelement_index_6(<32 x i8> %a) nounwind {
- ; CHECK-NOT: pextr
+define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v8i16_var:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i16_var:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; AVX-NEXT: retq
+ %b = extractelement <8 x i16> %a, i256 %i
+ ret i16 %b
+}
+
+define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i16_var:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i16_var:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: movzwl (%rsp,%rdi,2), %eax
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <16 x i16> %a, i256 %i
+ ret i16 %b
+}
+
+define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v4i32_var:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -24(%rsp,%rdi,4), %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i32_var:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movl -24(%rsp,%rdi,4), %eax
+; AVX-NEXT: retq
+ %b = extractelement <4 x i32> %a, i256 %i
+ ret i32 %b
+}
+
+define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v8i32_var:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movl (%rsp,%rdi,4), %eax
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v8i32_var:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: movl (%rsp,%rdi,4), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v8i32_var:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <8 x i32> %a, i256 %i
+ ret i32 %b
+}
+
+define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v2i64_var:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -24(%rsp,%rdi,8), %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v2i64_var:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax
+; AVX-NEXT: retq
+ %b = extractelement <2 x i64> %a, i256 %i
+ ret i64 %b
+}
+
+define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v4i64_var:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movq (%rsp,%rdi,8), %rax
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i64_var:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: movq (%rsp,%rdi,8), %rax
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <4 x i64> %a, i256 %i
+ ret i64 %b
+}
+
+;
+; ExtractElement - Constant (Out Of Range) Index
+;
+
+define i8 @extractelement_32i8_m1(<32 x i8> %a) nounwind {
+; SSE-LABEL: extractelement_32i8_m1:
+; SSE: # BB#0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_32i8_m1:
+; AVX: # BB#0:
+; AVX-NEXT: retq
%b = extractelement <32 x i8> %a, i256 -1
ret i8 %b
-}
\ No newline at end of file
+}
+
+define i16 @extractelement_v16i16_m4(<16 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i16_m4:
+; SSE: # BB#0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i16_m4:
+; AVX: # BB#0:
+; AVX-NEXT: retq
+ %b = extractelement <16 x i16> %a, i256 -4
+ ret i16 %b
+}
+
+define i32 @extractelement_v8i32_15(<8 x i32> %a) nounwind {
+; SSE-LABEL: extractelement_v8i32_15:
+; SSE: # BB#0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i32_15:
+; AVX: # BB#0:
+; AVX-NEXT: retq
+ %b = extractelement <8 x i32> %a, i64 15
+ ret i32 %b
+}
+
+define i64 @extractelement_v4i64_4(<4 x i64> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v4i64_4:
+; SSE: # BB#0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i64_4:
+; AVX: # BB#0:
+; AVX-NEXT: retq
+ %b = extractelement <4 x i64> %a, i256 4
+ ret i64 %b
+}
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index e50d353797a0..5855303e1278 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -1,28 +1,48 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=+avx -mcpu=btver2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=X64-SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i32 @t(<2 x i64>* %val) nounwind {
-; CHECK-LABEL: t:
-; CHECK-NOT: movd
-; CHECK: movl 8(
-; CHECK-NEXT: ret
- %tmp2 = load <2 x i64>, <2 x i64>* %val, align 16 ; <<2 x i64>> [#uses=1]
- %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; <i32> [#uses=1]
- ret i32 %tmp4
+; X32-SSE2-LABEL: t:
+; X32-SSE2: # BB#0:
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT: movl 8(%eax), %eax
+; X32-SSE2-NEXT: retl
+;
+; X64-SSSE3-LABEL: t:
+; X64-SSSE3: # BB#0:
+; X64-SSSE3-NEXT: movl 8(%rdi), %eax
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: t:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: movl 8(%rdi), %eax
+; X64-AVX-NEXT: retq
+ %tmp2 = load <2 x i64>, <2 x i64>* %val, align 16 ; <<2 x i64>> [#uses=1]
+ %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1]
+ %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; <i32> [#uses=1]
+ ret i32 %tmp4
}
; Case where extractelement of load ends up as undef.
; (Making sure this doesn't crash.)
define i32 @t2(<8 x i32>* %xp) {
-; CHECK-LABEL: t2:
-; CHECK: ret
+; X32-SSE2-LABEL: t2:
+; X32-SSE2: # BB#0:
+; X32-SSE2-NEXT: retl
+;
+; X64-SSSE3-LABEL: t2:
+; X64-SSSE3: # BB#0:
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: t2:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: retq
%x = load <8 x i32>, <8 x i32>* %xp
- %Shuff68 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32
-undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
+ %Shuff68 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
%y = extractelement <8 x i32> %Shuff68, i32 0
ret i32 %y
}
@@ -36,10 +56,20 @@ undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
; need to special-case the checks.
define void @t3() {
-; CHECK-LABEL: t3:
-; CHECK: movupd
-; CHECK: movhpd
-
+; X32-SSE2-LABEL: t3:
+; X32-SSE2: # BB#0: # %bb
+; X32-SSE2-NEXT: movupd (%eax), %xmm0
+; X32-SSE2-NEXT: movhpd %xmm0, (%eax)
+;
+; X64-SSSE3-LABEL: t3:
+; X64-SSSE3: # BB#0: # %bb
+; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-SSSE3-NEXT: movlpd %xmm0, (%rax)
+;
+; X64-AVX-LABEL: t3:
+; X64-AVX: # BB#0: # %bb
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT: vmovlpd %xmm0, (%rax)
bb:
%tmp13 = load <2 x double>, <2 x double>* undef, align 1
%.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
@@ -52,9 +82,26 @@ bb:
; This is testing for an assertion - the extraction was assuming that the undef
; second shuffle operand was a post-bitcast type instead of a pre-bitcast type.
define i64 @t4(<2 x double>* %a) {
-; CHECK-LABEL: t4:
-; CHECK: mov
-; CHECK: ret
+; X32-SSE2-LABEL: t4:
+; X32-SSE2: # BB#0:
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT: movapd (%eax), %xmm0
+; X32-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: movd %xmm1, %eax
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-SSE2-NEXT: movd %xmm0, %edx
+; X32-SSE2-NEXT: retl
+;
+; X64-SSSE3-LABEL: t4:
+; X64-SSSE3: # BB#0:
+; X64-SSSE3-NEXT: movq (%rdi), %rax
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: t4:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: movq (%rdi), %rax
+; X64-AVX-NEXT: retq
%b = load <2 x double>, <2 x double>* %a, align 16
%c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
%d = bitcast <2 x double> %c to <2 x i64>
diff --git a/test/CodeGen/X86/extractps.ll b/test/CodeGen/X86/extractps.ll
index fecd2faed321..7d4c2cf619a1 100644
--- a/test/CodeGen/X86/extractps.ll
+++ b/test/CodeGen/X86/extractps.ll
@@ -4,7 +4,7 @@
; RUN: grep "extractps \$1, %xmm0, " %t | count 1
; PR2647
-external global float, align 16 ; <float*>:0 [#uses=2]
+@0 = external global float, align 16 ; <float*>:0 [#uses=2]
define internal void @""() nounwind {
load float, float* @0, align 16 ; <float>:1 [#uses=1]
diff --git a/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..6b7d39548385
--- /dev/null
+++ b/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse3-builtins.c
+
+define float @test_cvtsh_ss(i16 %a0) nounwind {
+; X32-LABEL: test_cvtsh_ss:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: vcvtph2ps %xmm0, %xmm0
+; X32-NEXT: vmovss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_cvtsh_ss:
+; X64: # BB#0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vcvtph2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+ %ins1 = insertelement <8 x i16> %ins0, i16 0, i32 1
+ %ins2 = insertelement <8 x i16> %ins1, i16 0, i32 2
+ %ins3 = insertelement <8 x i16> %ins2, i16 0, i32 3
+ %ins4 = insertelement <8 x i16> %ins3, i16 0, i32 4
+ %ins5 = insertelement <8 x i16> %ins4, i16 0, i32 5
+ %ins6 = insertelement <8 x i16> %ins5, i16 0, i32 6
+ %ins7 = insertelement <8 x i16> %ins6, i16 0, i32 7
+ %cvt = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %ins7)
+ %res = extractelement <4 x float> %cvt, i32 0
+ ret float %res
+}
+
+define i16 @test_cvtss_sh(float %a0) nounwind {
+; X32-LABEL: test_cvtss_sh:
+; X32: # BB#0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; X32-NEXT: vmovd %xmm0, %eax
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: test_cvtss_sh:
+; X64: # BB#0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+ %ins0 = insertelement <4 x float> undef, float %a0, i32 0
+ %ins1 = insertelement <4 x float> %ins0, float 0.000000e+00, i32 1
+ %ins2 = insertelement <4 x float> %ins1, float 0.000000e+00, i32 2
+ %ins3 = insertelement <4 x float> %ins2, float 0.000000e+00, i32 3
+ %cvt = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %ins3, i32 0)
+ %res = extractelement <8 x i16> %cvt, i32 0
+ ret i16 %res
+}
+
+define <4 x float> @test_mm_cvtph_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtph_ps:
+; X32: # BB#0:
+; X32-NEXT: vcvtph2ps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtph_ps:
+; X64: # BB#0:
+; X64-NEXT: vcvtph2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %arg0)
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_mm256_cvtph_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtph_ps:
+; X32: # BB#0:
+; X32-NEXT: vcvtph2ps %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtph_ps:
+; X64: # BB#0:
+; X64-NEXT: vcvtph2ps %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %arg0)
+ ret <8 x float> %res
+}
+
+define <2 x i64> @test_mm_cvtps_ph(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtps_ph:
+; X32: # BB#0:
+; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtps_ph:
+; X64: # BB#0:
+; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %cvt = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
+ %res = bitcast <8 x i16> %cvt to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm256_cvtps_ph(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtps_ph:
+; X32: # BB#0:
+; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtps_ph:
+; X64: # BB#0:
+; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %cvt = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
+ %res = bitcast <8 x i16> %cvt to <2 x i64>
+ ret <2 x i64> %res
+}
+
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
+
+declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
diff --git a/test/CodeGen/X86/fast-isel-call.ll b/test/CodeGen/X86/fast-isel-call.ll
index 9fd07b521ab2..ee70404bcedf 100644
--- a/test/CodeGen/X86/fast-isel-call.ll
+++ b/test/CodeGen/X86/fast-isel-call.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -O0 -fast-isel-abort=1 -march=x86 | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort=1 -march=x86 -mtriple=i686-apple-darwin8 2>/dev/null | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort=1 -march=x86 -mtriple=i686-apple-darwin8 2>&1 >/dev/null | FileCheck -check-prefix=STDERR -allow-empty %s
%struct.s = type {i32, i32, i32}
@@ -22,12 +23,12 @@ define void @test2(%struct.s* %d) nounwind {
call void @foo2(%struct.s* byval %d )
ret void
; CHECK-LABEL: test2:
-; CHECK: movl (%eax)
-; CHECK: movl {{.*}}, (%esp)
-; CHECK: movl 4(%eax)
-; CHECK: movl {{.*}}, 4(%esp)
-; CHECK: movl 8(%eax)
-; CHECK: movl {{.*}}, 8(%esp)
+; CHECK: movl (%eax), %ecx
+; CHECK: movl %ecx, (%esp)
+; CHECK: movl 4(%eax), %ecx
+; CHECK: movl %ecx, 4(%esp)
+; CHECK: movl 8(%eax), %eax
+; CHECK: movl %eax, 8(%esp)
}
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
@@ -53,3 +54,32 @@ define void @test4(i8* %a, i8* %b) {
; CHECK: movl $100, 8(%esp)
; CHECK: calll {{.*}}memcpy
}
+
+; STDERR-NOT: FastISel missed call: call x86_thiscallcc void @thiscallfun
+%struct.S = type { i8 }
+define void @test5() {
+entry:
+ %s = alloca %struct.S, align 1
+; CHECK-LABEL: test5:
+; CHECK: subl $12, %esp
+; CHECK: leal 8(%esp), %ecx
+; CHECK: movl $43, (%esp)
+; CHECK: calll {{.*}}thiscallfun
+; CHECK: addl $8, %esp
+ call x86_thiscallcc void @thiscallfun(%struct.S* %s, i32 43)
+ ret void
+}
+declare x86_thiscallcc void @thiscallfun(%struct.S*, i32) #1
+
+; STDERR-NOT: FastISel missed call: call x86_stdcallcc void @stdcallfun
+define void @test6() {
+entry:
+; CHECK-LABEL: test6:
+; CHECK: subl $12, %esp
+; CHECK: movl $43, (%esp)
+; CHECK: calll {{.*}}stdcallfun
+; CHECK: addl $8, %esp
+ call x86_stdcallcc void @stdcallfun(i32 43)
+ ret void
+}
+declare x86_stdcallcc void @stdcallfun(i32) #1
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch2.ll b/test/CodeGen/X86/fast-isel-cmp-branch2.ll
index 04dbac07690a..475d8fcf7f35 100644
--- a/test/CodeGen/X86/fast-isel-cmp-branch2.ll
+++ b/test/CodeGen/X86/fast-isel-cmp-branch2.ll
@@ -5,7 +5,7 @@ define i32 @fcmp_oeq(float %x, float %y) {
; CHECK-LABEL: fcmp_oeq
; CHECK: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne {{LBB.+_1}}
-; CHECK-NEXT: jnp {{LBB.+_2}}
+; CHECK-NEXT: jp {{LBB.+_1}}
%1 = fcmp oeq float %x, %y
br i1 %1, label %bb1, label %bb2
bb2:
@@ -162,8 +162,7 @@ define i32 @fcmp_une(float %x, float %y) {
; CHECK-LABEL: fcmp_une
; CHECK: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne {{LBB.+_2}}
-; CHECK-NEXT: jp {{LBB.+_2}}
-; CHECK-NEXT: jmp {{LBB.+_1}}
+; CHECK-NEXT: jnp {{LBB.+_1}}
%1 = fcmp une float %x, %y
br i1 %1, label %bb1, label %bb2
bb2:
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
index e54d0ca40078..8f09b2e38356 100644
--- a/test/CodeGen/X86/fast-isel-cmp-branch3.ll
+++ b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
@@ -17,7 +17,7 @@ define i32 @fcmp_oeq2(float %x) {
; CHECK: xorps %xmm1, %xmm1
; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne {{LBB.+_1}}
-; CHECK-NEXT: jnp {{LBB.+_2}}
+; CHECK-NEXT: jp {{LBB.+_1}}
%1 = fcmp oeq float %x, 0.000000e+00
br i1 %1, label %bb1, label %bb2
bb2:
@@ -338,8 +338,7 @@ define i32 @fcmp_une2(float %x) {
; CHECK: xorps %xmm1, %xmm1
; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne {{LBB.+_2}}
-; CHECK-NEXT: jp {{LBB.+_2}}
-; CHECK-NEXT: jmp {{LBB.+_1}}
+; CHECK-NEXT: jnp {{LBB.+_1}}
%1 = fcmp une float %x, 0.000000e+00
br i1 %1, label %bb1, label %bb2
bb2:
diff --git a/test/CodeGen/X86/fast-isel-float-half-convertion.ll b/test/CodeGen/X86/fast-isel-float-half-convertion.ll
index 707a325bf41d..acb85fd171f5 100644
--- a/test/CodeGen/X86/fast-isel-float-half-convertion.ll
+++ b/test/CodeGen/X86/fast-isel-float-half-convertion.ll
@@ -4,7 +4,7 @@
define i16 @test_fp32_to_fp16(float %a) {
; CHECK-LABEL: test_fp32_to_fp16:
-; CHECK: vcvtps2ph $0, %xmm0, %xmm0
+; CHECK: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/fast-isel-nontemporal.ll b/test/CodeGen/X86/fast-isel-nontemporal.ll
index 6a174dbf5a8a..2fc08fb4135d 100644
--- a/test/CodeGen/X86/fast-isel-nontemporal.ll
+++ b/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -1,7 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4a -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE4A
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse2 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse4a -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE4A
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx2 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; Scalar Stores
+;
define void @test_nti32(i32* nocapture %ptr, i32 %X) {
; ALL-LABEL: test_nti32:
@@ -34,10 +42,20 @@ define void @test_ntfloat(float* nocapture %ptr, float %X) {
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
+; SSE41-LABEL: test_ntfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movss %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
; AVX-LABEL: test_ntfloat:
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_ntfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovss %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store float %X, float* %ptr, align 4, !nontemporal !1
ret void
@@ -54,15 +72,29 @@ define void @test_ntdouble(double* nocapture %ptr, double %X) {
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
+; SSE41-LABEL: test_ntdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movsd %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
; AVX-LABEL: test_ntdouble:
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_ntdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovsd %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store double %X, double* %ptr, align 8, !nontemporal !1
ret void
}
+;
+; 128-bit Vector Stores
+;
+
define void @test_nt4xfloat(<4 x float>* nocapture %ptr, <4 x float> %X) {
; SSE-LABEL: test_nt4xfloat:
; SSE: # BB#0: # %entry
@@ -73,6 +105,11 @@ define void @test_nt4xfloat(<4 x float>* nocapture %ptr, <4 x float> %X) {
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt4xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store <4 x float> %X, <4 x float>* %ptr, align 16, !nontemporal !1
ret void
@@ -88,11 +125,76 @@ define void @test_nt2xdouble(<2 x double>* nocapture %ptr, <2 x double> %X) {
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovntpd %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt2xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntpd %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store <2 x double> %X, <2 x double>* %ptr, align 16, !nontemporal !1
ret void
}
+define void @test_nt16xi8(<16 x i8>* nocapture %ptr, <16 x i8> %X) {
+; SSE-LABEL: test_nt16xi8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt16xi8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt16xi8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <16 x i8> %X, <16 x i8>* %ptr, align 16, !nontemporal !1
+ ret void
+}
+
+define void @test_nt8xi16(<8 x i16>* nocapture %ptr, <8 x i16> %X) {
+; SSE-LABEL: test_nt8xi16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xi16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x i16> %X, <8 x i16>* %ptr, align 16, !nontemporal !1
+ ret void
+}
+
+define void @test_nt4xi32(<4 x i32>* nocapture %ptr, <4 x i32> %X) {
+; SSE-LABEL: test_nt4xi32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt4xi32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt4xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <4 x i32> %X, <4 x i32>* %ptr, align 16, !nontemporal !1
+ ret void
+}
+
define void @test_nt2xi64(<2 x i64>* nocapture %ptr, <2 x i64> %X) {
; SSE-LABEL: test_nt2xi64:
; SSE: # BB#0: # %entry
@@ -103,9 +205,984 @@ define void @test_nt2xi64(<2 x i64>* nocapture %ptr, <2 x i64> %X) {
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt2xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store <2 x i64> %X, <2 x i64>* %ptr, align 16, !nontemporal !1
ret void
}
+;
+; 128-bit Vector Loads
+;
+
+define <4 x float> @test_load_nt4xfloat(<4 x float>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt4xfloat:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt4xfloat:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt4xfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_load_nt4xfloat:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt4xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <4 x float>, <4 x float>* %ptr, align 16, !nontemporal !1
+ ret <4 x float> %0
+}
+
+define <2 x double> @test_load_nt2xdouble(<2 x double>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt2xdouble:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt2xdouble:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movapd (%rdi), %xmm0
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt2xdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_load_nt2xdouble:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt2xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <2 x double>, <2 x double>* %ptr, align 16, !nontemporal !1
+ ret <2 x double> %0
+}
+
+define <16 x i8> @test_load_nt16xi8(<16 x i8>* nocapture %ptr) {
+; SSE-LABEL: test_load_nt16xi8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdqa (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_load_nt16xi8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt16xi8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <16 x i8>, <16 x i8>* %ptr, align 16, !nontemporal !1
+ ret <16 x i8> %0
+}
+
+define <8 x i16> @test_load_nt8xi16(<8 x i16>* nocapture %ptr) {
+; SSE-LABEL: test_load_nt8xi16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdqa (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_load_nt8xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xi16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x i16>, <8 x i16>* %ptr, align 16, !nontemporal !1
+ ret <8 x i16> %0
+}
+
+define <4 x i32> @test_load_nt4xi32(<4 x i32>* nocapture %ptr) {
+; SSE-LABEL: test_load_nt4xi32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdqa (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_load_nt4xi32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt4xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %ptr, align 16, !nontemporal !1
+ ret <4 x i32> %0
+}
+
+define <2 x i64> @test_load_nt2xi64(<2 x i64>* nocapture %ptr) {
+; SSE-LABEL: test_load_nt2xi64:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdqa (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_load_nt2xi64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt2xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <2 x i64>, <2 x i64>* %ptr, align 16, !nontemporal !1
+ ret <2 x i64> %0
+}
+
+;
+; 256-bit Vector Stores
+;
+
+define void @test_nt8xfloat(<8 x float>* nocapture %ptr, <8 x float> %X) {
+; SSE-LABEL: test_nt8xfloat:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xfloat:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x float> %X, <8 x float>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt4xdouble(<4 x double>* nocapture %ptr, <4 x double> %X) {
+; SSE-LABEL: test_nt4xdouble:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: movntpd %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt4xdouble:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt4xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <4 x double> %X, <4 x double>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt32xi8(<32 x i8>* nocapture %ptr, <32 x i8> %X) {
+; SSE-LABEL: test_nt32xi8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt32xi8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt32xi8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <32 x i8> %X, <32 x i8>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt16xi16(<16 x i16>* nocapture %ptr, <16 x i16> %X) {
+; SSE-LABEL: test_nt16xi16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt16xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt16xi16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <16 x i16> %X, <16 x i16>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt8xi32(<8 x i32>* nocapture %ptr, <8 x i32> %X) {
+; SSE-LABEL: test_nt8xi32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xi32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x i32> %X, <8 x i32>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt4xi64(<4 x i64>* nocapture %ptr, <4 x i64> %X) {
+; SSE-LABEL: test_nt4xi64:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt4xi64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt4xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <4 x i64> %X, <4 x i64>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+;
+; 256-bit Vector Loads
+;
+
+define <8 x float> @test_load_nt8xfloat(<8 x float>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt8xfloat:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt8xfloat:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt8xfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt8xfloat:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt8xfloat:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x float>, <8 x float>* %ptr, align 32, !nontemporal !1
+ ret <8 x float> %0
+}
+
+define <4 x double> @test_load_nt4xdouble(<4 x double>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt4xdouble:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: movapd 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt4xdouble:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movapd (%rdi), %xmm0
+; SSE4A-NEXT: movapd 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt4xdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt4xdouble:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovapd (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt4xdouble:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt4xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <4 x double>, <4 x double>* %ptr, align 32, !nontemporal !1
+ ret <4 x double> %0
+}
+
+define <32 x i8> @test_load_nt32xi8(<32 x i8>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt32xi8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt32xi8:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt32xi8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt32xi8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt32xi8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt32xi8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <32 x i8>, <32 x i8>* %ptr, align 32, !nontemporal !1
+ ret <32 x i8> %0
+}
+
+define <16 x i16> @test_load_nt16xi16(<16 x i16>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt16xi16:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt16xi16:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt16xi16:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt16xi16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt16xi16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt16xi16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <16 x i16>, <16 x i16>* %ptr, align 32, !nontemporal !1
+ ret <16 x i16> %0
+}
+
+define <8 x i32> @test_load_nt8xi32(<8 x i32>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt8xi32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt8xi32:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt8xi32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt8xi32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt8xi32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x i32>, <8 x i32>* %ptr, align 32, !nontemporal !1
+ ret <8 x i32> %0
+}
+
+define <4 x i64> @test_load_nt4xi64(<4 x i64>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt4xi64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt4xi64:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt4xi64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt4xi64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt4xi64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt4xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <4 x i64>, <4 x i64>* %ptr, align 32, !nontemporal !1
+ ret <4 x i64> %0
+}
+
+;
+; 512-bit Vector Stores
+;
+
+define void @test_nt16xfloat(<16 x float>* nocapture %ptr, <16 x float> %X) {
+; SSE-LABEL: test_nt16xfloat:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm2, 32(%rdi)
+; SSE-NEXT: movntps %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt16xfloat:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt16xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntps %zmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <16 x float> %X, <16 x float>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt8xdouble(<8 x double>* nocapture %ptr, <8 x double> %X) {
+; SSE-LABEL: test_nt8xdouble:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: movntpd %xmm1, 16(%rdi)
+; SSE-NEXT: movntpd %xmm2, 32(%rdi)
+; SSE-NEXT: movntpd %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xdouble:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX-NEXT: vmovntpd %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntpd %zmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x double> %X, <8 x double>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt64xi8(<64 x i8>* nocapture %ptr, <64 x i8> %X) {
+; SSE-LABEL: test_nt64xi8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm2, 32(%rdi)
+; SSE-NEXT: movntdq %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt64xi8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_nt64xi8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_nt64xi8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT: retq
+entry:
+ store <64 x i8> %X, <64 x i8>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt32xi16(<32 x i16>* nocapture %ptr, <32 x i16> %X) {
+; SSE-LABEL: test_nt32xi16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm2, 32(%rdi)
+; SSE-NEXT: movntdq %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt32xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_nt32xi16:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_nt32xi16:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT: retq
+entry:
+ store <32 x i16> %X, <32 x i16>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt16xi32(<16 x i32>* nocapture %ptr, <16 x i32> %X) {
+; SSE-LABEL: test_nt16xi32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm2, 32(%rdi)
+; SSE-NEXT: movntdq %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt16xi32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt16xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <16 x i32> %X, <16 x i32>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt8xi64(<8 x i64>* nocapture %ptr, <8 x i64> %X) {
+; SSE-LABEL: test_nt8xi64:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm2, 32(%rdi)
+; SSE-NEXT: movntdq %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xi64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x i64> %X, <8 x i64>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+;
+; 512-bit Vector Loads
+;
+
+define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt16xfloat:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt16xfloat:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt16xfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt16xfloat:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt16xfloat:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt16xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <16 x float>, <16 x float>* %ptr, align 64, !nontemporal !1
+ ret <16 x float> %0
+}
+
+define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt8xdouble:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: movapd 16(%rdi), %xmm1
+; SSE2-NEXT: movapd 32(%rdi), %xmm2
+; SSE2-NEXT: movapd 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt8xdouble:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movapd (%rdi), %xmm0
+; SSE4A-NEXT: movapd 16(%rdi), %xmm1
+; SSE4A-NEXT: movapd 32(%rdi), %xmm2
+; SSE4A-NEXT: movapd 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt8xdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt8xdouble:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovapd (%rdi), %ymm0
+; AVX1-NEXT: vmovapd 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt8xdouble:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x double>, <8 x double>* %ptr, align 64, !nontemporal !1
+ ret <8 x double> %0
+}
+
+define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt64xi8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt64xi8:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt64xi8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt64xi8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt64xi8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_load_nt64xi8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_load_nt64xi8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+entry:
+ %0 = load <64 x i8>, <64 x i8>* %ptr, align 64, !nontemporal !1
+ ret <64 x i8> %0
+}
+
+define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt32xi16:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt32xi16:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt32xi16:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt32xi16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt32xi16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_load_nt32xi16:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_load_nt32xi16:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+entry:
+ %0 = load <32 x i16>, <32 x i16>* %ptr, align 64, !nontemporal !1
+ ret <32 x i16> %0
+}
+
+define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt16xi32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt16xi32:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt16xi32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt16xi32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt16xi32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt16xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <16 x i32>, <16 x i32>* %ptr, align 64, !nontemporal !1
+ ret <16 x i32> %0
+}
+
+define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt8xi64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt8xi64:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt8xi64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt8xi64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt8xi64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x i64>, <8 x i64>* %ptr, align 64, !nontemporal !1
+ ret <8 x i64> %0
+}
+
!1 = !{i32 1}
diff --git a/test/CodeGen/X86/fast-isel-stackcheck.ll b/test/CodeGen/X86/fast-isel-stackcheck.ll
index 3b7318fa77d9..1398b3006699 100644
--- a/test/CodeGen/X86/fast-isel-stackcheck.ll
+++ b/test/CodeGen/X86/fast-isel-stackcheck.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx"
; CHECK-LABEL: foo:
; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
-; CHECK-NOT: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
+; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
define void @foo() #0 {
entry:
%_tags = alloca [3 x i32], align 4
@@ -16,8 +16,10 @@ entry:
}
; CHECK-LABEL: bar:
-; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
-; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
+; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %{{r.x}}
+; CHECK-DAG: movq ___stack_chk_guard@GOTPCREL(%rip), %[[GUARD:r.x]]
+; CHECK-DAG: movq {{[0-9]+}}(%rsp), %[[CANARY:r.x]]
+; CHECK: subq %[[CANARY]], %[[GUARD]]
define void @bar() #1 {
entry:
%vt = alloca [2 x double], align 16
diff --git a/test/CodeGen/X86/fast-isel-vecload.ll b/test/CodeGen/X86/fast-isel-vecload.ll
index 48eebf526f19..c5323f1c14f6 100644
--- a/test/CodeGen/X86/fast-isel-vecload.ll
+++ b/test/CodeGen/X86/fast-isel-vecload.ll
@@ -1,5 +1,6 @@
; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE --check-prefix=ALL
; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=ALL
+; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL
; Verify that fast-isel knows how to select aligned/unaligned vector loads.
; Also verify that the selected load instruction is in the correct domain.
@@ -183,3 +184,23 @@ entry:
%0 = load <2 x double>, <2 x double>* %V
ret <2 x double> %0
}
+
+define <8 x i64> @test_v8i64_alignment(<8 x i64>* %V) {
+; KNL-LABEL: test_v8i64_alignment:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa64 (%rdi), %zmm0
+; KNL-NEXT: retq
+entry:
+ %0 = load <8 x i64>, <8 x i64>* %V, align 64
+ ret <8 x i64> %0
+}
+
+define <8 x i64> @test_v8i64(<8 x i64>* %V) {
+; KNL-LABEL: test_v8i64:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu64 (%rdi), %zmm0
+; KNL-NEXT: retq
+entry:
+ %0 = load <8 x i64>, <8 x i64>* %V, align 4
+ ret <8 x i64> %0
+}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d748cba2f8f8..ad0f11f4dc00 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort=1 | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-verbose 2>&1 >/dev/null | FileCheck %s --check-prefix=STDERR --allow-empty
; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort=1 | FileCheck %s --check-prefix=AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -312,3 +313,10 @@ define void @allocamaterialize() {
call void @takesi32ptr(i32* %a)
ret void
}
+
+; STDERR-NOT: FastISel missed terminator: ret void
+; CHECK-LABEL: win64ccfun
+define x86_64_win64cc void @win64ccfun(i32 %i) {
+; CHECK: ret
+ ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index 8049c72ec018..8cddee5a7cd0 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -1,4 +1,5 @@
; RUN: llc -fast-isel -O0 -mcpu=generic -mtriple=i386-apple-darwin10 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -fast-isel -O0 -mcpu=generic -mtriple=i386-apple-darwin10 -relocation-model=pic < %s -fast-isel-verbose 2>&1 >/dev/null | FileCheck -check-prefix=STDERR -allow-empty %s
; This should use flds to set the return value.
; CHECK-LABEL: test0:
@@ -18,11 +19,38 @@ define void @test1({i32, i32, i32, i32}* sret %p) nounwind {
ret void
}
+; This should pop 8 bytes on return.
+; CHECK-LABEL: thiscallfun:
+; CHECK: retl $8
+define x86_thiscallcc i32 @thiscallfun(i32* %this, i32 %a, i32 %b) nounwind {
+; STDERR-NOT: FastISel missed terminator: ret i32 12345
+ ret i32 12345
+}
+
+; Here, the callee pop doesn't fit the 16-bit immediate -- see x86-big-ret.ll
+; This checks that -fast-isel doesn't miscompile this.
+; CHECK-LABEL: thiscall_large:
+; CHECK: popl %ecx
+; CHECK-NEXT: addl $65536, %esp
+; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: retl
+define x86_thiscallcc void @thiscall_large(i32* %this, [65533 x i8]* byval %b) nounwind {
+ ret void
+}
+
+; This should pop 4 bytes on return.
+; CHECK-LABEL: stdcallfun:
+; CHECK: retl $4
+define x86_stdcallcc i32 @stdcallfun(i32 %a) nounwind {
+; STDERR-NOT: FastISel missed terminator: ret i32 54321
+ ret i32 54321
+}
+
; Properly initialize the pic base.
; CHECK-LABEL: test2:
; CHECK-NOT: HHH
-; CHECK: call{{.*}}L2$pb
-; CHECK-NEXT: L2$pb:
+; CHECK: call{{.*}}L5$pb
+; CHECK-NEXT: L5$pb:
; CHECK-NEXT: pop
; CHECK: HHH
; CHECK: retl
@@ -75,6 +103,7 @@ entry:
; SDag-ISel's arg push:
; CHECK: movl %esp, [[REGISTER:%[a-z]+]]
; CHECK: movl $42, ([[REGISTER]])
-; CHECK: movl __imp__test5dllimport
+; CHECK: movl L_test5dllimport$non_lazy_ptr-L8$pb(%eax), %eax
+
}
declare dllimport i32 @test5dllimport(i32)
diff --git a/test/CodeGen/X86/fastmath-float-half-conversion.ll b/test/CodeGen/X86/fastmath-float-half-conversion.ll
index 29308735cca2..637fcc215958 100644
--- a/test/CodeGen/X86/fastmath-float-half-conversion.ll
+++ b/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -5,7 +5,7 @@ define zeroext i16 @test1_fast(double %d) #0 {
; ALL-LABEL: test1_fast:
; F16C-NOT: callq {{_+}}truncdfhf2
; F16C: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX: callq {{_+}}truncdfhf2
; ALL: ret
entry:
@@ -19,7 +19,7 @@ define zeroext i16 @test2_fast(x86_fp80 %d) #0 {
; F16C: fldt
; F16C-NEXT: fstps
; F16C-NEXT: vmovss
-; F16C-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX: callq {{_+}}truncxfhf2
; ALL: ret
entry:
diff --git a/test/CodeGen/X86/fixup-bw-copy.ll b/test/CodeGen/X86/fixup-bw-copy.ll
new file mode 100644
index 000000000000..9067dfd29c17
--- /dev/null
+++ b/test/CodeGen/X86/fixup-bw-copy.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=1 -mtriple=x86_64-- < %s | FileCheck --check-prefix=X64 --check-prefix=BWON64 %s
+; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=0 -mtriple=x86_64-- < %s | FileCheck --check-prefix=X64 --check-prefix=BWOFF64 %s
+; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=1 -mtriple=i386-- < %s | FileCheck --check-prefix=X32 --check-prefix=BWON32 %s
+; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=0 -mtriple=i386-- < %s | FileCheck --check-prefix=X32 --check-prefix=BWOFF32 %s
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+
+define i8 @test_movb(i8 %a0) {
+; BWON64-LABEL: test_movb:
+; BWON64: # BB#0:
+; BWON64-NEXT: movl %edi, %eax
+; BWON64-NEXT: retq
+;
+; BWOFF64-LABEL: test_movb:
+; BWOFF64: # BB#0:
+; BWOFF64-NEXT: movb %dil, %al
+; BWOFF64-NEXT: retq
+;
+; X32-LABEL: test_movb:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: retl
+ ret i8 %a0
+}
+
+define i16 @test_movw(i16 %a0) {
+; BWON64-LABEL: test_movw:
+; BWON64: # BB#0:
+; BWON64-NEXT: movl %edi, %eax
+; BWON64-NEXT: retq
+;
+; BWOFF64-LABEL: test_movw:
+; BWOFF64: # BB#0:
+; BWOFF64-NEXT: movw %di, %ax
+; BWOFF64-NEXT: retq
+;
+; BWON32-LABEL: test_movw:
+; BWON32: # BB#0:
+; BWON32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; BWON32-NEXT: retl
+;
+; BWOFF32-LABEL: test_movw:
+; BWOFF32: # BB#0:
+; BWOFF32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; BWOFF32-NEXT: retl
+ ret i16 %a0
+}
+
+; Verify we don't mess with H-reg copies (only generated in 32-bit mode).
+define i8 @test_movb_hreg(i16 %a0) {
+; X64-LABEL: test_movb_hreg:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl $8, %eax
+; X64-NEXT: addb %dil, %al
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-LABEL: test_movb_hreg:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addb %al, %ah
+; X32-NEXT: movb %ah, %al
+; X32-NEXT: retl
+ %tmp0 = trunc i16 %a0 to i8
+ %tmp1 = lshr i16 %a0, 8
+ %tmp2 = trunc i16 %tmp1 to i8
+ %tmp3 = add i8 %tmp0, %tmp2
+ ret i8 %tmp3
+}
diff --git a/test/CodeGen/X86/fixup-bw-copy.mir b/test/CodeGen/X86/fixup-bw-copy.mir
new file mode 100644
index 000000000000..beff513cdbf5
--- /dev/null
+++ b/test/CodeGen/X86/fixup-bw-copy.mir
@@ -0,0 +1,156 @@
+# RUN: llc -run-pass x86-fixup-bw-insts -mtriple=x86_64-- -o - %s | FileCheck %s
+
+# Verify that we correctly deal with the flag edge cases when replacing
+# copies by bigger copies, which is a pretty unusual transform.
+
+--- |
+ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+ define i8 @test_movb_killed(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i8 @test_movb_impuse(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i8 @test_movb_impdef_gr64(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i8 @test_movb_impdef_gr32(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i8 @test_movb_impdef_gr16(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i16 @test_movw_impdef_gr32(i16 %a0) {
+ ret i16 %a0
+ }
+
+ define i16 @test_movw_impdef_gr64(i16 %a0) {
+ ret i16 %a0
+ }
+
+...
+
+---
+name: test_movb_killed
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil
+ %al = MOV8rr killed %dil
+ RETQ killed %al
+
+...
+
+---
+name: test_movb_impuse
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil
+ %al = MOV8rr %dil, implicit %edi
+ RETQ killed %al
+
+...
+
+---
+name: test_movb_impdef_gr64
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil, implicit-def %rax
+ %al = MOV8rr %dil, implicit-def %rax
+ RETQ killed %al
+
+...
+
+---
+name: test_movb_impdef_gr32
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil
+ %al = MOV8rr %dil, implicit-def %eax
+ RETQ killed %al
+
+...
+
+---
+name: test_movb_impdef_gr16
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil
+ %al = MOV8rr %dil, implicit-def %ax
+ RETQ killed %al
+
+...
+
+---
+name: test_movw_impdef_gr32
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %di
+ %ax = MOV16rr %di, implicit-def %eax
+ RETQ killed %ax
+
+...
+
+---
+name: test_movw_impdef_gr64
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %di, implicit-def %rax
+ %ax = MOV16rr %di, implicit-def %rax
+ RETQ killed %ax
+
+...
diff --git a/test/CodeGen/X86/fixup-bw-inst.ll b/test/CodeGen/X86/fixup-bw-inst.ll
new file mode 100644
index 000000000000..6f83e6362d56
--- /dev/null
+++ b/test/CodeGen/X86/fixup-bw-inst.ll
@@ -0,0 +1,126 @@
+; RUN: llc -fixup-byte-word-insts=1 -march=x86-64 < %s | \
+; RUN: FileCheck -check-prefix CHECK -check-prefix BWON %s
+; RUN: llc -fixup-byte-word-insts=0 -march=x86-64 < %s | \
+; RUN: FileCheck -check-prefix CHECK -check-prefix BWOFF %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+
+; This has byte loads interspersed with byte stores, in a single
+; basic-block loop. The upper portion should be dead, so the movb loads
+; should have been changed into movzbl instead.
+; CHECK-LABEL: foo1
+; load:
+; BWON: movzbl
+; BWOFF: movb
+; store:
+; CHECK: movb
+; load:
+; BWON: movzbl
+; BWOFF: movb
+; store:
+; CHECK: movb
+; CHECK: ret
+define void @foo1(i32 %count,
+ %struct.A* noalias nocapture %q,
+ %struct.A* noalias nocapture %p)
+ nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
+ br label %a4
+
+a4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
+ %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
+ %a5 = load i8, i8* %2, align 1
+ %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
+ store i8 %a5, i8* %a7, align 1
+ %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
+ %a6 = load i8, i8* %3, align 1
+ store i8 %a6, i8* %a8, align 1
+ %a9 = add nsw i32 %i.02, 1
+ %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %a9, %count
+ br i1 %exitcond, label %._crit_edge, label %a4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+%struct.B = type { i16, i16, i16, i16, i16, i16, i16, i16 }
+
+; This has word loads interspersed with word stores.
+; The upper portion should be dead, so the movw loads should have
+; been changed into movzwl instead.
+; CHECK-LABEL: foo2
+; load:
+; BWON: movzwl
+; BWOFF: movw
+; store:
+; CHECK: movw
+; load:
+; BWON: movzwl
+; BWOFF: movw
+; store:
+; CHECK: movw
+; CHECK: ret
+define void @foo2(i32 %count,
+ %struct.B* noalias nocapture %q,
+ %struct.B* noalias nocapture %p)
+ nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
+ br label %a4
+
+a4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
+ %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %a10, %a4 ]
+ %a5 = load i16, i16* %2, align 2
+ %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
+ store i16 %a5, i16* %a7, align 2
+ %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
+ %a6 = load i16, i16* %3, align 2
+ store i16 %a6, i16* %a8, align 2
+ %a9 = add nsw i32 %i.02, 1
+ %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %a9, %count
+ br i1 %exitcond, label %._crit_edge, label %a4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+; This test contains nothing but a simple byte load and store. Since
+; movb encodes smaller, we do not want to use movzbl unless in a tight loop.
+; So this test checks that movb is used.
+; CHECK-LABEL: foo3:
+; CHECK: movb
+; CHECK: movb
+define void @foo3(i8 *%dst, i8 *%src) {
+ %t0 = load i8, i8 *%src, align 1
+ store i8 %t0, i8 *%dst, align 1
+ ret void
+}
+
+; This test contains nothing but a simple word load and store. Since
+; movw and movzwl are the same size, we should always choose to use
+; movzwl instead.
+; CHECK-LABEL: foo4:
+; BWON: movzwl
+; BWOFF: movw
+; CHECK: movw
+define void @foo4(i16 *%dst, i16 *%src) {
+ %t0 = load i16, i16 *%src, align 2
+ store i16 %t0, i16 *%dst, align 2
+ ret void
+}
diff --git a/test/CodeGen/X86/float-conv-elim.ll b/test/CodeGen/X86/float-conv-elim.ll
index 3feff851d91a..7ccad2b80c8b 100644
--- a/test/CodeGen/X86/float-conv-elim.ll
+++ b/test/CodeGen/X86/float-conv-elim.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mcpu=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-linux-gnu -march=x86-64 -mcpu=x86-64 < %s | FileCheck %s
; Make sure the float conversion is folded away as it should be.
; CHECK-LABEL: foo
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 76a4acf00f90..62d1b826b545 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
@@ -22,7 +23,7 @@ define float @test_f32_fmadd(float %a0, float %a1, float %a2) {
; AVX512-LABEL: test_f32_fmadd:
; AVX512: # BB#0:
; AVX512-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
%res = fadd float %x, %a2
@@ -83,7 +84,7 @@ define double @test_f64_fmadd(double %a0, double %a1, double %a2) {
; AVX512-LABEL: test_f64_fmadd:
; AVX512: # BB#0:
; AVX512-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
%res = fadd double %x, %a2
@@ -148,7 +149,7 @@ define float @test_f32_fmsub(float %a0, float %a1, float %a2) {
; AVX512-LABEL: test_f32_fmsub:
; AVX512: # BB#0:
; AVX512-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
%res = fsub float %x, %a2
@@ -209,7 +210,7 @@ define double @test_f64_fmsub(double %a0, double %a1, double %a2) {
; AVX512-LABEL: test_f64_fmsub:
; AVX512: # BB#0:
; AVX512-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
@@ -274,7 +275,7 @@ define float @test_f32_fnmadd(float %a0, float %a1, float %a2) {
; AVX512-LABEL: test_f32_fnmadd:
; AVX512: # BB#0:
; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
%res = fsub float %a2, %x
@@ -335,7 +336,7 @@ define double @test_f64_fnmadd(double %a0, double %a1, double %a2) {
; AVX512-LABEL: test_f64_fnmadd:
; AVX512: # BB#0:
; AVX512-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
%res = fsub double %a2, %x
@@ -400,7 +401,7 @@ define float @test_f32_fnmsub(float %a0, float %a1, float %a2) {
; AVX512-LABEL: test_f32_fnmsub:
; AVX512: # BB#0:
; AVX512-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
%y = fsub float -0.000000e+00, %x
@@ -464,7 +465,7 @@ define double @test_f64_fnmsub(double %a0, double %a1, double %a2) {
; AVX512-LABEL: test_f64_fnmsub:
; AVX512: # BB#0:
; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
%y = fsub double -0.000000e+00, %x
@@ -533,7 +534,7 @@ define <4 x float> @test_4f32_fmadd_load(<4 x float>* %a0, <4 x float> %a1, <4 x
; AVX512: # BB#0:
; AVX512-NEXT: vmovaps (%rdi), %xmm2
; AVX512-NEXT: vfmadd213ps %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%x = load <4 x float>, <4 x float>* %a0
%y = fmul <4 x float> %x, %a1
@@ -556,7 +557,7 @@ define <2 x double> @test_2f64_fmsub_load(<2 x double>* %a0, <2 x double> %a1, <
; AVX512: # BB#0:
; AVX512-NEXT: vmovapd (%rdi), %xmm2
; AVX512-NEXT: vfmsub213pd %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%x = load <2 x double>, <2 x double>* %a0
%y = fmul <2 x double> %x, %a1
@@ -829,7 +830,7 @@ define float @test_f32_interp(float %x, float %y, float %t) {
; AVX512: # BB#0:
; AVX512-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vfmadd213ss %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%t1 = fsub float 1.0, %t
%tx = fmul float %x, %t
@@ -853,7 +854,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
;
; AVX512-LABEL: test_v4f32_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovaps %xmm2, %xmm3
; AVX512-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm3
; AVX512-NEXT: vfmadd213ps %xmm3, %xmm2, %xmm0
; AVX512-NEXT: retq
@@ -879,7 +880,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
;
; AVX512-LABEL: test_v8f32_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovaps %ymm2, %ymm3
; AVX512-NEXT: vfnmadd213ps %ymm1, %ymm1, %ymm3
; AVX512-NEXT: vfmadd213ps %ymm3, %ymm2, %ymm0
; AVX512-NEXT: retq
@@ -907,7 +908,7 @@ define double @test_f64_interp(double %x, double %y, double %t) {
; AVX512: # BB#0:
; AVX512-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vfmadd213sd %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%t1 = fsub double 1.0, %t
%tx = fmul double %x, %t
@@ -931,7 +932,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
;
; AVX512-LABEL: test_v2f64_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovaps %xmm2, %xmm3
; AVX512-NEXT: vfnmadd213pd %xmm1, %xmm1, %xmm3
; AVX512-NEXT: vfmadd213pd %xmm3, %xmm2, %xmm0
; AVX512-NEXT: retq
@@ -957,7 +958,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
;
; AVX512-LABEL: test_v4f64_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovaps %ymm2, %ymm3
; AVX512-NEXT: vfnmadd213pd %ymm1, %ymm1, %ymm3
; AVX512-NEXT: vfmadd213pd %ymm3, %ymm2, %ymm0
; AVX512-NEXT: retq
@@ -1101,7 +1102,7 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y
; AVX512-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
; AVX512: # BB#0:
; AVX512-NEXT: vfmadd231ps {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
%m1 = fmul <4 x float> %m0, <float 4.0, float 3.0, float 2.0, float 1.0>
@@ -1128,7 +1129,7 @@ define double @test_f64_fneg_fmul(double %x, double %y) #0 {
; AVX512: # BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%m = fmul nsz double %x, %y
%n = fsub double -0.0, %m
@@ -1150,7 +1151,7 @@ define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
;
; AVX512-LABEL: test_v4f32_fneg_fmul:
; AVX512: # BB#0:
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpxord %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%m = fmul nsz <4 x float> %x, %y
@@ -1173,7 +1174,7 @@ define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
;
; AVX512-LABEL: test_v4f64_fneg_fmul:
; AVX512: # BB#0:
-; AVX512-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; AVX512-NEXT: vpxord %ymm2, %ymm2, %ymm2
; AVX512-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%m = fmul nsz <4 x double> %x, %y
diff --git a/test/CodeGen/X86/fold-push.ll b/test/CodeGen/X86/fold-push.ll
index eaf91351021f..9d3afd1c449b 100644
--- a/test/CodeGen/X86/fold-push.ll
+++ b/test/CodeGen/X86/fold-push.ll
@@ -14,7 +14,7 @@ define void @test(i32 %a, i32 %b) optsize nounwind {
; SLM: movl (%esp), [[RELOAD:%e..]]
; SLM-NEXT: pushl [[RELOAD]]
; CHECK: calll
-; CHECK-NEXT: addl $4, %esp
+; CHECK-NEXT: addl $8, %esp
%c = add i32 %a, %b
call void @foo(i32 %c)
call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"()
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
index 62fed4219387..5c481197c3b4 100644
--- a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -6,10 +6,7 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386--netbsd"
; CHECK-LABEL: fn1
-; CHECK: shldl {{.*#+}} 4-byte Folded Spill
-; CHECK: orl {{.*#+}} 4-byte Folded Reload
-; CHECK: shldl {{.*#+}} 4-byte Folded Spill
-; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: imull {{.*#+}} 4-byte Folded Reload
; CHECK: orl {{.*#+}} 4-byte Folded Reload
diff --git a/test/CodeGen/X86/fold-vector-sext-zext.ll b/test/CodeGen/X86/fold-vector-sext-zext.ll
index aeaab4479085..6299280eb98d 100644
--- a/test/CodeGen/X86/fold-vector-sext-zext.ll
+++ b/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; Verify that the backend correctly folds a sign/zero extend of a vector where
@@ -6,8 +7,11 @@
; simple loads from constant pool of the result. That is because the resulting
; vector should be known at static time.
-
define <4 x i16> @test1() {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -15,11 +19,12 @@ define <4 x i16> @test1() {
%5 = sext <4 x i8> %4 to <4 x i16>
ret <4 x i16> %5
}
-; CHECK-LABEL: test1
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i16> @test2() {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 undef, i32 2
@@ -27,11 +32,12 @@ define <4 x i16> @test2() {
%5 = sext <4 x i8> %4 to <4 x i16>
ret <4 x i16> %5
}
-; CHECK-LABEL: test2
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i32> @test3() {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -39,11 +45,12 @@ define <4 x i32> @test3() {
%5 = sext <4 x i8> %4 to <4 x i32>
ret <4 x i32> %5
}
-; CHECK-LABEL: test3
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i32> @test4() {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 undef, i32 2
@@ -51,12 +58,12 @@ define <4 x i32> @test4() {
%5 = sext <4 x i8> %4 to <4 x i32>
ret <4 x i32> %5
}
-; CHECK-LABEL: test4
-; CHECK: vmovaps
-; CHECK-NEXT: ret
-
define <4 x i64> @test5() {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [0,18446744073709551615,2,18446744073709551613]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -64,12 +71,12 @@ define <4 x i64> @test5() {
%5 = sext <4 x i8> %4 to <4 x i64>
ret <4 x i64> %5
}
-; CHECK-LABEL: test5
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i64> @test6() {
+; CHECK-LABEL: test6:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <u,18446744073709551615,u,18446744073709551613>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 undef, i32 2
@@ -77,12 +84,12 @@ define <4 x i64> @test6() {
%5 = sext <4 x i8> %4 to <4 x i64>
ret <4 x i64> %5
}
-; CHECK-LABEL: test6
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i16> @test7() {
+; CHECK-LABEL: test7:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -94,11 +101,12 @@ define <8 x i16> @test7() {
%9 = sext <8 x i8> %4 to <8 x i16>
ret <8 x i16> %9
}
-; CHECK-LABEL: test7
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i32> @test8() {
+; CHECK-LABEL: test8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <0,4294967295,2,4294967293,u,u,u,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -110,12 +118,12 @@ define <8 x i32> @test8() {
%9 = sext <8 x i8> %4 to <8 x i32>
ret <8 x i32> %9
}
-; CHECK-LABEL: test8
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i16> @test9() {
+; CHECK-LABEL: test9:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,u,u,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 undef, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 undef, i32 2
@@ -127,11 +135,12 @@ define <8 x i16> @test9() {
%9 = sext <8 x i8> %4 to <8 x i16>
ret <8 x i16> %9
}
-; CHECK-LABEL: test9
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i32> @test10() {
+; CHECK-LABEL: test10:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,u,u,u,u,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 undef, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -143,13 +152,12 @@ define <8 x i32> @test10() {
%9 = sext <8 x i8> %4 to <8 x i32>
ret <8 x i32> %9
}
-; CHECK-LABEL: test10
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
-
define <4 x i16> @test11() {
+; CHECK-LABEL: test11:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -157,11 +165,12 @@ define <4 x i16> @test11() {
%5 = zext <4 x i8> %4 to <4 x i16>
ret <4 x i16> %5
}
-; CHECK-LABEL: test11
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i32> @test12() {
+; CHECK-LABEL: test12:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -169,11 +178,12 @@ define <4 x i32> @test12() {
%5 = zext <4 x i8> %4 to <4 x i32>
ret <4 x i32> %5
}
-; CHECK-LABEL: test12
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i64> @test13() {
+; CHECK-LABEL: test13:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -181,12 +191,12 @@ define <4 x i64> @test13() {
%5 = zext <4 x i8> %4 to <4 x i64>
ret <4 x i64> %5
}
-; CHECK-LABEL: test13
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i16> @test14() {
+; CHECK-LABEL: test14:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,255,u,253>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 undef, i32 2
@@ -194,11 +204,12 @@ define <4 x i16> @test14() {
%5 = zext <4 x i8> %4 to <4 x i16>
ret <4 x i16> %5
}
-; CHECK-LABEL: test14
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i32> @test15() {
+; CHECK-LABEL: test15:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,u,2,u>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 undef, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -206,11 +217,12 @@ define <4 x i32> @test15() {
%5 = zext <4 x i8> %4 to <4 x i32>
ret <4 x i32> %5
}
-; CHECK-LABEL: test15
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i64> @test16() {
+; CHECK-LABEL: test16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <u,255,2,u>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -218,12 +230,12 @@ define <4 x i64> @test16() {
%5 = zext <4 x i8> %4 to <4 x i64>
ret <4 x i64> %5
}
-; CHECK-LABEL: test16
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i16> @test17() {
+; CHECK-LABEL: test17:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253,4,251,6,249]
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -235,11 +247,12 @@ define <8 x i16> @test17() {
%9 = zext <8 x i8> %8 to <8 x i16>
ret <8 x i16> %9
}
-; CHECK-LABEL: test17
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i32> @test18() {
+; CHECK-LABEL: test18:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253,4,251,6,249]
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -251,12 +264,12 @@ define <8 x i32> @test18() {
%9 = zext <8 x i8> %8 to <8 x i32>
ret <8 x i32> %9
}
-; CHECK-LABEL: test18
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i16> @test19() {
+; CHECK-LABEL: test19:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,255,u,253,u,251,u,249>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 undef, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 undef, i32 2
@@ -268,11 +281,12 @@ define <8 x i16> @test19() {
%9 = zext <8 x i8> %8 to <8 x i16>
ret <8 x i16> %9
}
-; CHECK-LABEL: test19
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i32> @test20() {
+; CHECK-LABEL: test20:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,253,4,u,6,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 undef, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -284,8 +298,3 @@ define <8 x i32> @test20() {
%9 = zext <8 x i8> %8 to <8 x i32>
ret <8 x i32> %9
}
-; CHECK-LABEL: test20
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
-
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll
index d0cf34170081..8d42680e199b 100644
--- a/test/CodeGen/X86/force-align-stack-alloca.ll
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -32,15 +32,21 @@ define i64 @g(i32 %i) nounwind {
; CHECK: movl %{{...}}, %esp
; CHECK-NOT: {{[^ ,]*}}, %esp
;
-; Next we set up the memset call, and then undo it.
+; Next we set up the memset call.
; CHECK: subl $20, %esp
; CHECK-NOT: {{[^ ,]*}}, %esp
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
; CHECK: calll memset
-; CHECK-NEXT: addl $32, %esp
+;
+; Deallocating 32 bytes of outgoing call frame for memset and
+; allocating 28 bytes for calling f yields a 4-byte adjustment:
+; CHECK-NEXT: addl $4, %esp
; CHECK-NOT: {{[^ ,]*}}, %esp
;
-; Next we set up the call to 'f'.
-; CHECK: subl $28, %esp
+; And move on to call 'f', and then restore the stack.
+; CHECK: pushl
; CHECK-NOT: {{[^ ,]*}}, %esp
; CHECK: calll f
; CHECK-NEXT: addl $32, %esp
diff --git a/test/CodeGen/X86/fp-logic.ll b/test/CodeGen/X86/fp-logic.ll
index 64c3f6b79a23..9ab6751d6548 100644
--- a/test/CodeGen/X86/fp-logic.ll
+++ b/test/CodeGen/X86/fp-logic.ll
@@ -262,3 +262,51 @@ define float @movmsk(float %x) {
ret float %bc2
}
+define double @bitcast_fabs(double %x) {
+; CHECK-LABEL: bitcast_fabs:
+; CHECK: # BB#0:
+; CHECK-NEXT: andpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %bc1 = bitcast double %x to i64
+ %and = and i64 %bc1, 9223372036854775807
+ %bc2 = bitcast i64 %and to double
+ ret double %bc2
+}
+
+define float @bitcast_fneg(float %x) {
+; CHECK-LABEL: bitcast_fneg:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %bc1 = bitcast float %x to i32
+ %xor = xor i32 %bc1, 2147483648
+ %bc2 = bitcast i32 %xor to float
+ ret float %bc2
+}
+
+define <2 x double> @bitcast_fabs_vec(<2 x double> %x) {
+; CHECK-LABEL: bitcast_fabs_vec:
+; CHECK: # BB#0:
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %bc1 = bitcast <2 x double> %x to <2 x i64>
+ %and = and <2 x i64> %bc1, <i64 9223372036854775807, i64 9223372036854775807>
+ %bc2 = bitcast <2 x i64> %and to <2 x double>
+ ret <2 x double> %bc2
+}
+
+define <4 x float> @bitcast_fneg_vec(<4 x float> %x) {
+; CHECK-LABEL: bitcast_fneg_vec:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %bc1 = bitcast <4 x float> %x to <4 x i32>
+ %xor = xor <4 x i32> %bc1, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
+ %bc2 = bitcast <4 x i32> %xor to <4 x float>
+ ret <4 x float> %bc2
+}
+
diff --git a/test/CodeGen/X86/fp-une-cmp.ll b/test/CodeGen/X86/fp-une-cmp.ll
index 7f772d11da9a..653040053c27 100644
--- a/test/CodeGen/X86/fp-une-cmp.ll
+++ b/test/CodeGen/X86/fp-une-cmp.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
; <rdar://problem/7859988>
; Make sure we don't generate more jumps than we need to. We used to generate
@@ -19,25 +21,115 @@
; addsd ...
; LBB0_2:
-; CHECK: func
-; CHECK: jne [[LABEL:.*]]
-; CHECK-NEXT: jp [[LABEL]]
-; CHECK-NOT: jmp
+define double @rdar_7859988(double %x, double %y) nounwind readnone optsize ssp {
+; CHECK-LABEL: rdar_7859988:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: mulsd %xmm1, %xmm0
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: jp .LBB0_2
+; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: .LBB0_2: # %bb2
+; CHECK-NEXT: retq
-define float @func(float %x, float %y) nounwind readnone optsize ssp {
entry:
- %0 = fpext float %x to double
- %1 = fpext float %y to double
- %2 = fmul double %0, %1
- %3 = fcmp une double %2, 0.000000e+00
- br i1 %3, label %bb2, label %bb1
+ %mul = fmul double %x, %y
+ %cmp = fcmp une double %mul, 0.000000e+00
+ br i1 %cmp, label %bb2, label %bb1
bb1:
- %4 = fadd double %2, -1.000000e+00
+ %add = fadd double %mul, -1.000000e+00
br label %bb2
bb2:
- %.0.in = phi double [ %4, %bb1 ], [ %2, %entry ]
- %.0 = fptrunc double %.0.in to float
- ret float %.0
+ %phi = phi double [ %add, %bb1 ], [ %mul, %entry ]
+ ret double %phi
}
+
+define double @profile_metadata(double %x, double %y) {
+; CHECK-LABEL: profile_metadata:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: mulsd %xmm1, %xmm0
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: jne .LBB1_1
+; CHECK-NEXT: jp .LBB1_1
+; CHECK-NEXT: .LBB1_2: # %bb2
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_1: # %bb1
+; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: jmp .LBB1_2
+
+entry:
+ %mul = fmul double %x, %y
+ %cmp = fcmp une double %mul, 0.000000e+00
+ br i1 %cmp, label %bb1, label %bb2, !prof !1
+
+bb1:
+ %add = fadd double %mul, -1.000000e+00
+ br label %bb2
+
+bb2:
+ %phi = phi double [ %add, %bb1 ], [ %mul, %entry ]
+ ret double %phi
+}
+
+; Test that the negation of the non-equality check between floating points is
+; translated to jnp followed by jne.
+
+define void @foo(float %f) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jne .LBB2_2
+; CHECK-NEXT: jnp .LBB2_1
+; CHECK-NEXT: .LBB2_2: # %if.then
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: .LBB2_1: # %if.end
+; CHECK-NEXT: retq
+entry:
+ %cmp = fcmp une float %f, 0.000000e+00
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @a()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; Test that an FP oeq/une conditional branch can be inverted successfully even
+; when the true and false targets are the same (PR27750).
+;
+; CHECK-LABEL: pr27750
+; CHECK: ucomiss
+; CHECK-NEXT: jne [[TARGET:.*]]
+; CHECK-NEXT: jp [[TARGET]]
+define void @pr27750(i32* %b, float %x, i1 %y) {
+entry:
+ br label %for.cond
+
+for.cond:
+ br label %for.cond1
+
+for.cond1:
+ br i1 %y, label %for.body3.lr.ph, label %for.end
+
+for.body3.lr.ph:
+ store i32 0, i32* %b, align 4
+ br label %for.end
+
+for.end:
+; After block %for.cond gets eliminated, the two target blocks of this
+; conditional block are the same.
+ %tobool = fcmp une float %x, 0.000000e+00
+ br i1 %tobool, label %for.cond, label %for.cond1
+}
+
+declare void @a()
+
+!1 = !{!"branch_weights", i32 1, i32 1000}
diff --git a/test/CodeGen/X86/fp128-cast.ll b/test/CodeGen/X86/fp128-cast.ll
index 73878e31d0ef..2d872498dfc7 100644
--- a/test/CodeGen/X86/fp128-cast.ll
+++ b/test/CodeGen/X86/fp128-cast.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -O2 -mtriple=i686-linux-gnu -mattr=+mmx | FileCheck %s --check-prefix=X32
; Check soft floating point conversion function calls.
@@ -17,11 +18,17 @@ entry:
%conv = fpext float %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestFPExtF32_F128:
-; CHECK: movss vf32(%rip), %xmm0
-; CHECK-NEXT: callq __extendsftf2
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPExtF32_F128:
+; X32: flds vf32
+; X32: fstps
+; X32: calll __extendsftf2
+; X32: retl
+;
+; X64-LABEL: TestFPExtF32_F128:
+; X64: movss vf32(%rip), %xmm0
+; X64-NEXT: callq __extendsftf2
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define void @TestFPExtF64_F128() {
@@ -30,11 +37,17 @@ entry:
%conv = fpext double %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestFPExtF64_F128:
-; CHECK: movsd vf64(%rip), %xmm0
-; CHECK-NEXT: callq __extenddftf2
-; CHECK-NEXT: movapd %xmm0, vf128(%rip)
-; CHECK: ret
+; X32-LABEL: TestFPExtF64_F128:
+; X32: fldl vf64
+; X32: fstpl
+; X32: calll __extenddftf2
+; X32: retl
+;
+; X64-LABEL: TestFPExtF64_F128:
+; X64: movsd vf64(%rip), %xmm0
+; X64-NEXT: callq __extenddftf2
+; X64-NEXT: movapd %xmm0, vf128(%rip)
+; X64: ret
}
define void @TestFPToSIF128_I32() {
@@ -43,11 +56,15 @@ entry:
%conv = fptosi fp128 %0 to i32
store i32 %conv, i32* @vi32, align 4
ret void
-; CHECK-LABEL: TestFPToSIF128_I32:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __fixtfsi
-; CHECK-NEXT: movl %eax, vi32(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPToSIF128_I32:
+; X32: calll __fixtfsi
+; X32: retl
+;
+; X64-LABEL: TestFPToSIF128_I32:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __fixtfsi
+; X64-NEXT: movl %eax, vi32(%rip)
+; X64: retq
}
define void @TestFPToUIF128_U32() {
@@ -56,11 +73,15 @@ entry:
%conv = fptoui fp128 %0 to i32
store i32 %conv, i32* @vu32, align 4
ret void
-; CHECK-LABEL: TestFPToUIF128_U32:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __fixunstfsi
-; CHECK-NEXT: movl %eax, vu32(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPToUIF128_U32:
+; X32: calll __fixunstfsi
+; X32: retl
+;
+; X64-LABEL: TestFPToUIF128_U32:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __fixunstfsi
+; X64-NEXT: movl %eax, vu32(%rip)
+; X64: retq
}
define void @TestFPToSIF128_I64() {
@@ -70,12 +91,16 @@ entry:
%conv1 = sext i32 %conv to i64
store i64 %conv1, i64* @vi64, align 8
ret void
-; CHECK-LABEL: TestFPToSIF128_I64:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __fixtfsi
-; CHECK-NEXT: cltq
-; CHECK-NEXT: movq %rax, vi64(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPToSIF128_I64:
+; X32: calll __fixtfsi
+; X32: retl
+;
+; X64-LABEL: TestFPToSIF128_I64:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __fixtfsi
+; X64-NEXT: cltq
+; X64-NEXT: movq %rax, vi64(%rip)
+; X64: retq
}
define void @TestFPToUIF128_U64() {
@@ -85,12 +110,16 @@ entry:
%conv1 = zext i32 %conv to i64
store i64 %conv1, i64* @vu64, align 8
ret void
-; CHECK-LABEL: TestFPToUIF128_U64:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __fixunstfsi
-; CHECK-NEXT: movl %eax, %eax
-; CHECK-NEXT: movq %rax, vu64(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPToUIF128_U64:
+; X32: calll __fixunstfsi
+; X32: retl
+;
+; X64-LABEL: TestFPToUIF128_U64:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __fixunstfsi
+; X64-NEXT: movl %eax, %eax
+; X64-NEXT: movq %rax, vu64(%rip)
+; X64: retq
}
define void @TestFPTruncF128_F32() {
@@ -99,11 +128,16 @@ entry:
%conv = fptrunc fp128 %0 to float
store float %conv, float* @vf32, align 4
ret void
-; CHECK-LABEL: TestFPTruncF128_F32:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __trunctfsf2
-; CHECK-NEXT: movss %xmm0, vf32(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPTruncF128_F32:
+; X32: calll __trunctfsf2
+; X32: fstps vf32
+; X32: retl
+;
+; X64-LABEL: TestFPTruncF128_F32:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __trunctfsf2
+; X64-NEXT: movss %xmm0, vf32(%rip)
+; X64: retq
}
define void @TestFPTruncF128_F64() {
@@ -112,11 +146,16 @@ entry:
%conv = fptrunc fp128 %0 to double
store double %conv, double* @vf64, align 8
ret void
-; CHECK-LABEL: TestFPTruncF128_F64:
-; CHECK: movapd vf128(%rip), %xmm0
-; CHECK-NEXT: callq __trunctfdf2
-; CHECK-NEXT: movsd %xmm0, vf64(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPTruncF128_F64:
+; X32: calll __trunctfdf2
+; X32: fstpl vf64
+; X32: retl
+;
+; X64-LABEL: TestFPTruncF128_F64:
+; X64: movapd vf128(%rip), %xmm0
+; X64-NEXT: callq __trunctfdf2
+; X64-NEXT: movsd %xmm0, vf64(%rip)
+; X64: retq
}
define void @TestSIToFPI32_F128() {
@@ -125,11 +164,15 @@ entry:
%conv = sitofp i32 %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestSIToFPI32_F128:
-; CHECK: movl vi32(%rip), %edi
-; CHECK-NEXT: callq __floatsitf
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestSIToFPI32_F128:
+; X32: calll __floatsitf
+; X32: retl
+;
+; X64-LABEL: TestSIToFPI32_F128:
+; X64: movl vi32(%rip), %edi
+; X64-NEXT: callq __floatsitf
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define void @TestUIToFPU32_F128() #2 {
@@ -138,11 +181,15 @@ entry:
%conv = uitofp i32 %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestUIToFPU32_F128:
-; CHECK: movl vu32(%rip), %edi
-; CHECK-NEXT: callq __floatunsitf
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestUIToFPU32_F128:
+; X32: calll __floatunsitf
+; X32: retl
+;
+; X64-LABEL: TestUIToFPU32_F128:
+; X64: movl vu32(%rip), %edi
+; X64-NEXT: callq __floatunsitf
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define void @TestSIToFPI64_F128(){
@@ -151,11 +198,15 @@ entry:
%conv = sitofp i64 %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestSIToFPI64_F128:
-; CHECK: movq vi64(%rip), %rdi
-; CHECK-NEXT: callq __floatditf
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestSIToFPI64_F128:
+; X32: calll __floatditf
+; X32: retl
+;
+; X64-LABEL: TestSIToFPI64_F128:
+; X64: movq vi64(%rip), %rdi
+; X64-NEXT: callq __floatditf
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define void @TestUIToFPU64_F128() #2 {
@@ -164,11 +215,15 @@ entry:
%conv = uitofp i64 %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestUIToFPU64_F128:
-; CHECK: movq vu64(%rip), %rdi
-; CHECK-NEXT: callq __floatunditf
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestUIToFPU64_F128:
+; X32: calll __floatunditf
+; X32: retl
+;
+; X64-LABEL: TestUIToFPU64_F128:
+; X64: movq vu64(%rip), %rdi
+; X64-NEXT: callq __floatunditf
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define i32 @TestConst128(fp128 %v) {
@@ -176,11 +231,16 @@ entry:
%cmp = fcmp ogt fp128 %v, 0xL00000000000000003FFF000000000000
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: TestConst128:
-; CHECK: movaps {{.*}}, %xmm1
-; CHECK-NEXT: callq __gttf2
-; CHECK-NEXT: test
-; CHECK: retq
+; X32-LABEL: TestConst128:
+; X32: calll __gttf2
+; X32: retl
+;
+; X64-LABEL: TestConst128:
+; X64: movaps {{.*}}, %xmm1
+; X64-NEXT: callq __gttf2
+; X64-NEXT: xorl
+; X64-NEXT: test
+; X64: retq
}
; C code:
@@ -207,17 +267,21 @@ entry:
%cmp = icmp eq i32 %or, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: TestBits128:
-; CHECK: movaps %xmm0, %xmm1
-; CHECK-NEXT: callq __multf3
-; CHECK-NEXT: movaps %xmm0, (%rsp)
-; CHECK-NEXT: movq (%rsp),
-; CHECK-NEXT: movq %
-; CHECK-NEXT: shrq $32,
-; CHECK: orl
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK: retq
+; X32-LABEL: TestBits128:
+; X32: calll __multf3
+; X32: retl
+;
+; X64-LABEL: TestBits128:
+; X64: movaps %xmm0, %xmm1
+; X64-NEXT: callq __multf3
+; X64-NEXT: movaps %xmm0, (%rsp)
+; X64-NEXT: movq (%rsp),
+; X64-NEXT: movq %
+; X64-NEXT: shrq $32,
+; X64: xorl %eax, %eax
+; X64-NEXT: orl
+; X64-NEXT: sete %al
+; X64: retq
;
; If TestBits128 fails due to any llvm or clang change,
; please make sure the original simplified C code will
@@ -244,12 +308,19 @@ entry:
%add = add i128 %or, 3
%0 = bitcast i128 %add to fp128
ret fp128 %0
-; CHECK-LABEL: TestPair128:
-; CHECK: addq $3, %rsi
-; CHECK: movq %rsi, -24(%rsp)
-; CHECK: movq %rdi, -16(%rsp)
-; CHECK: movaps -24(%rsp), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: TestPair128:
+; X32: addl
+; X32-NEXT: adcl
+; X32-NEXT: adcl
+; X32-NEXT: adcl
+; X32: retl
+;
+; X64-LABEL: TestPair128:
+; X64: addq $3, %rsi
+; X64: movq %rsi, -24(%rsp)
+; X64: movq %rdi, -16(%rsp)
+; X64: movaps -24(%rsp), %xmm0
+; X64-NEXT: retq
}
define fp128 @TestTruncCopysign(fp128 %x, i32 %n) {
@@ -266,12 +337,24 @@ if.then: ; preds = %entry
cleanup: ; preds = %entry, %if.then
%retval.0 = phi fp128 [ %conv1, %if.then ], [ %x, %entry ]
ret fp128 %retval.0
-; CHECK-LABEL: TestTruncCopysign:
-; CHECK: callq __trunctfdf2
-; CHECK-NEXT: andpd {{.*}}, %xmm0
-; CHECK-NEXT: orpd {{.*}}, %xmm0
-; CHECK-NEXT: callq __extenddftf2
-; CHECK: retq
+; X32-LABEL: TestTruncCopysign:
+; X32: calll __trunctfdf2
+; X32: fstpl
+; X32: flds
+; X32: flds
+; X32: fstp
+; X32: fldz
+; X32: fstp
+; X32: fstpl
+; X32: calll __extenddftf2
+; X32: retl
+;
+; X64-LABEL: TestTruncCopysign:
+; X64: callq __trunctfdf2
+; X64-NEXT: andpd {{.*}}, %xmm0
+; X64-NEXT: orpd {{.*}}, %xmm0
+; X64-NEXT: callq __extenddftf2
+; X64: retq
}
declare double @copysign(double, double) #1
diff --git a/test/CodeGen/X86/fp128-compare.ll b/test/CodeGen/X86/fp128-compare.ll
index b5d4fbe1b74e..6ad3b74aeafa 100644
--- a/test/CodeGen/X86/fp128-compare.ll
+++ b/test/CodeGen/X86/fp128-compare.ll
@@ -8,8 +8,9 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128GT:
; CHECK: callq __gttf2
-; CHECK: setg %al
-; CHECK: movzbl %al, %eax
+; CHECK: xorl %ecx, %ecx
+; CHECK: setg %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -20,9 +21,10 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128GE:
; CHECK: callq __getf2
+; CHECK: xorl %ecx, %ecx
; CHECK: testl %eax, %eax
-; CHECK: setns %al
-; CHECK: movzbl %al, %eax
+; CHECK: setns %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -48,9 +50,10 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128LE:
; CHECK: callq __letf2
-; CHECK-NEXT: testl %eax, %eax
-; CHECK: setle %al
-; CHECK: movzbl %al, %eax
+; CHECK: xorl %ecx, %ecx
+; CHECK: testl %eax, %eax
+; CHECK: setle %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -61,9 +64,10 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128EQ:
; CHECK: callq __eqtf2
-; CHECK-NEXT: testl %eax, %eax
-; CHECK: sete %al
-; CHECK: movzbl %al, %eax
+; CHECK: xorl %ecx, %ecx
+; CHECK: testl %eax, %eax
+; CHECK: sete %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -74,9 +78,10 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128NE:
; CHECK: callq __netf2
-; CHECK-NEXT: testl %eax, %eax
-; CHECK: setne %al
-; CHECK: movzbl %al, %eax
+; CHECK: xorl %ecx, %ecx
+; CHECK: testl %eax, %eax
+; CHECK: setne %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -86,8 +91,8 @@ entry:
%cond = select i1 %cmp, fp128 %x, fp128 %y
ret fp128 %cond
; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm1
; CHECK: movaps %xmm0
+; CHECK: movaps %xmm1
; CHECK: callq __gttf2
; CHECK: movaps {{.*}}, %xmm0
; CHECK: testl %eax, %eax
diff --git a/test/CodeGen/X86/fp128-select.ll b/test/CodeGen/X86/fp128-select.ll
new file mode 100644
index 000000000000..dc41d5095a71
--- /dev/null
+++ b/test/CodeGen/X86/fp128-select.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s --check-prefix=MMX
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s --check-prefix=MMX
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define void @test_select(fp128* %p, fp128* %q, i1 zeroext %c) {
+; MMX-LABEL: test_select:
+; MMX: # BB#0:
+; MMX-NEXT: testb %dl, %dl
+; MMX-NEXT: jne .LBB0_1
+; MMX-NEXT: # BB#2:
+; MMX-NEXT: movaps {{.*}}(%rip), %xmm0
+; MMX-NEXT: movaps %xmm0, (%rsi)
+; MMX-NEXT: retq
+; MMX-NEXT: .LBB0_1:
+; MMX-NEXT: movaps (%rdi), %xmm0
+; MMX-NEXT: movaps %xmm0, (%rsi)
+; MMX-NEXT: retq
+;
+; CHECK-LABEL: test_select:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovneq (%rdi), %rax
+; CHECK-NEXT: movabsq $9223231299366420480, %rcx # imm = 0x7FFF800000000000
+; CHECK-NEXT: cmovneq 8(%rdi), %rcx
+; CHECK-NEXT: movq %rcx, 8(%rsi)
+; CHECK-NEXT: movq %rax, (%rsi)
+; CHECK-NEXT: retq
+ %a = load fp128, fp128* %p, align 2
+ %r = select i1 %c, fp128 %a, fp128 0xL00000000000000007FFF800000000000
+ store fp128 %r, fp128* %q
+ ret void
+}
diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
index 2ee67dc190bd..874cc7ce7f3f 100644
--- a/test/CodeGen/X86/fpstack-debuginstr-kill.ll
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
@@ -43,11 +43,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!24, !25}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !21, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !21, imports: !2)
!1 = !DIFile(filename: "fpu_ieee.cpp", directory: "x87stackifier")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "fpuop_arithmetic", linkageName: "_Z16fpuop_arithmeticjj", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 13, file: !5, scope: !6, type: !7, variables: !10)
+!4 = distinct !DISubprogram(name: "fpuop_arithmetic", linkageName: "_Z16fpuop_arithmeticjj", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 13, file: !5, scope: !6, type: !7, variables: !10)
!5 = !DIFile(filename: "f1.cpp", directory: "x87stackifier")
!6 = !DIFile(filename: "f1.cpp", directory: "x87stackifier")
!7 = !DISubroutineType(types: !8)
@@ -60,7 +59,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!14 = !DIDerivedType(tag: DW_TAG_typedef, name: "fpu_extended", line: 3, file: !5, baseType: !15)
!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "fpu_register", line: 2, file: !5, baseType: !16)
!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "uae_f64", line: 1, file: !5, baseType: !17)
-!17 = !DIBasicType(tag: DW_TAG_base_type, name: "double", size: 64, align: 64, encoding: DW_ATE_float)
+!17 = !DIBasicType(tag: DW_TAG_base_type, name: "long double", size: 128, align: 128, encoding: DW_ATE_float)
!18 = !DILocalVariable(name: "a", line: 15, scope: !4, file: !6, type: !19)
!19 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!20 = !DILocalVariable(name: "value", line: 16, scope: !4, file: !6, type: !14)
diff --git a/test/CodeGen/X86/frame-order.ll b/test/CodeGen/X86/frame-order.ll
new file mode 100644
index 000000000000..33aaee2951c3
--- /dev/null
+++ b/test/CodeGen/X86/frame-order.ll
@@ -0,0 +1,122 @@
+; RUN: llc -mtriple=x86_64-linux-gnueabi -disable-debug-info-print < %s | FileCheck %s
+; RUN: opt -strip -S < %s | llc -mtriple=x86_64-linux-gnueabi -disable-debug-info-print | FileCheck %s
+
+; This test checks if the code is generated correctly with and without debug info.
+
+; This LL file was generated by running 'clang -g -gcodeview' on the
+; following code:
+; 1: extern "C" volatile int x;
+; 2: extern "C" void capture(int *p);
+; 3: static __forceinline inline void will_be_inlined() {
+; 4: int v = 3;
+; 5: capture(&v);
+; 6: }
+; 7: extern "C" void f(int param) {
+; 8: if (param) {
+; 9: int a = 42;
+; 10: will_be_inlined();
+; 11: capture(&a);
+; 12: } else {
+; 13: int b = 42;
+; 14: will_be_inlined();
+; 15: capture(&b);
+; 16: }
+; 17: }
+
+; ModuleID = 't.cpp'
+
+; Function Attrs: nounwind uwtable
+define void @f(i32 %param) #0 !dbg !4 {
+entry:
+ %v.i1 = alloca i32, align 4
+ call void @llvm.dbg.declare(metadata i32* %v.i1, metadata !15, metadata !16), !dbg !17
+ %v.i = alloca i32, align 4
+ call void @llvm.dbg.declare(metadata i32* %v.i, metadata !15, metadata !16), !dbg !21
+ %param.addr = alloca i32, align 4
+ %a = alloca i32, align 4
+ %b = alloca i32, align 4
+ store i32 %param, i32* %param.addr, align 4
+ call void @llvm.dbg.declare(metadata i32* %param.addr, metadata !24, metadata !16), !dbg !25
+ %0 = load i32, i32* %param.addr, align 4, !dbg !26
+ %tobool = icmp ne i32 %0, 0, !dbg !26
+ br i1 %tobool, label %if.then, label %if.else, !dbg !27
+
+;CHECK: movl [[REG:.*]], 20(%rsp)
+;CHECK: je [[LABEL:.*]]
+
+if.then: ; preds = %entry
+ call void @llvm.dbg.declare(metadata i32* %a, metadata !28, metadata !16), !dbg !29
+ store i32 42, i32* %a, align 4, !dbg !29
+ store i32 3, i32* %v.i, align 4, !dbg !21
+ call void @capture(i32* %v.i) #3, !dbg !30
+ call void @capture(i32* %a), !dbg !31
+ br label %if.end, !dbg !32
+
+;CHECK: movl $3, 12(%rsp)
+
+if.else: ; preds = %entry
+ call void @llvm.dbg.declare(metadata i32* %b, metadata !33, metadata !16), !dbg !34
+ store i32 42, i32* %b, align 4, !dbg !34
+ store i32 3, i32* %v.i1, align 4, !dbg !17
+ call void @capture(i32* %v.i1) #3, !dbg !35
+ call void @capture(i32* %b), !dbg !36
+ br label %if.end
+
+;CHECK: [[LABEL]]:
+;CHECK: movl $3, 16(%rsp)
+
+if.end: ; preds = %if.else, %if.then
+ ret void, !dbg !37
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @capture(i32*) #2
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !12, !13}
+!llvm.ident = !{!14}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.9.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "t.cpp", directory: "D:\5Csrc\5Cllvm\5Cbuild")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 7, type: !5, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7}
+!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = distinct !DISubprogram(name: "will_be_inlined", linkageName: "\01?will_be_inlined@@YAXXZ", scope: !1, file: !1, line: 3, type: !9, isLocal: true, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null}
+!11 = !{i32 2, !"CodeView", i32 1}
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !{i32 1, !"PIC Level", i32 2}
+!14 = !{!"clang version 3.9.0 "}
+!15 = !DILocalVariable(name: "v", scope: !8, file: !1, line: 4, type: !7)
+!16 = !DIExpression()
+!17 = !DILocation(line: 4, column: 7, scope: !8, inlinedAt: !18)
+!18 = distinct !DILocation(line: 14, column: 5, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 12, column: 10)
+!20 = distinct !DILexicalBlock(scope: !4, file: !1, line: 8, column: 7)
+!21 = !DILocation(line: 4, column: 7, scope: !8, inlinedAt: !22)
+!22 = distinct !DILocation(line: 10, column: 5, scope: !23)
+!23 = distinct !DILexicalBlock(scope: !20, file: !1, line: 8, column: 14)
+!24 = !DILocalVariable(name: "param", arg: 1, scope: !4, file: !1, line: 7, type: !7)
+!25 = !DILocation(line: 7, column: 23, scope: !4)
+!26 = !DILocation(line: 8, column: 7, scope: !20)
+!27 = !DILocation(line: 8, column: 7, scope: !4)
+!28 = !DILocalVariable(name: "a", scope: !23, file: !1, line: 9, type: !7)
+!29 = !DILocation(line: 9, column: 9, scope: !23)
+!30 = !DILocation(line: 5, column: 3, scope: !8, inlinedAt: !22)
+!31 = !DILocation(line: 11, column: 5, scope: !23)
+!32 = !DILocation(line: 12, column: 3, scope: !23)
+!33 = !DILocalVariable(name: "b", scope: !19, file: !1, line: 13, type: !7)
+!34 = !DILocation(line: 13, column: 9, scope: !19)
+!35 = !DILocation(line: 5, column: 3, scope: !8, inlinedAt: !18)
+!36 = !DILocation(line: 15, column: 5, scope: !19)
+!37 = !DILocation(line: 17, column: 1, scope: !4)
diff --git a/test/CodeGen/X86/ga-offset.ll b/test/CodeGen/X86/ga-offset.ll
index 934c14921e99..3613cf8bf598 100644
--- a/test/CodeGen/X86/ga-offset.ll
+++ b/test/CodeGen/X86/ga-offset.ll
@@ -1,18 +1,11 @@
-; RUN: llc < %s -march=x86 > %t
-; RUN: not grep lea %t
-; RUN: not grep add %t
-; RUN: grep mov %t | count 1
-; RUN: llc < %s -mtriple=x86_64-linux -relocation-model=static > %t
-; RUN: not grep lea %t
-; RUN: not grep add %t
-; RUN: grep mov %t | count 1
-
-; This store should fold to a single mov instruction.
+; RUN: llc < %s -mtriple=x86_64-linux -relocation-model=static | FileCheck %s
@ptr = global i32* null
@dst = global [131072 x i32] zeroinitializer
define void @foo() nounwind {
+; This store should fold to a single mov instruction.
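+; (The expected immediate is $dst+64 because the GEP selects element 16 of an
+; i32 array, i.e. a byte offset of 16 * 4 = 64 folded into the relocation.)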
+; CHECK: movq $dst+64, ptr(%rip)
store i32* getelementptr ([131072 x i32], [131072 x i32]* @dst, i32 0, i32 16), i32** @ptr
ret void
}
diff --git a/test/CodeGen/X86/ga-offset2.ll b/test/CodeGen/X86/ga-offset2.ll
new file mode 100644
index 000000000000..bc4a3493ff6f
--- /dev/null
+++ b/test/CodeGen/X86/ga-offset2.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
+
+@var = external hidden global i32
+@p = external hidden global i32*
+
+define void @f() {
+; CHECK: movl $_var+40, _p
+ store i32* getelementptr (i32, i32* @var, i64 10), i32** @p
+ ret void
+}
diff --git a/test/CodeGen/X86/global-access-pie.ll b/test/CodeGen/X86/global-access-pie.ll
new file mode 100644
index 000000000000..0e29d605476d
--- /dev/null
+++ b/test/CodeGen/X86/global-access-pie.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
+; RUN: | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
+; RUN: | FileCheck -check-prefix=X32 %s
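+; (Under PIE, globals defined in this module, including the weak and internal
+; definitions below, are assumed to end up in the executable itself, so they
+; are expected to be addressed directly: RIP-relative on x86-64 and @GOTOFF on
+; i386. Only external declarations such as @d and @foo still go through the
+; GOT, since their definitions may live in a shared library.)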
+
+; External Linkage
+@a = global i32 0, align 4
+
+define i32 @my_access_global_a() #0 {
+; X32-LABEL: my_access_global_a:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl a@GOTOFF(%eax), %eax
+; X64-LABEL: my_access_global_a:
+; X64: movl a(%rip), %eax
+
+entry:
+ %0 = load i32, i32* @a, align 4
+ ret i32 %0
+}
+
+; WeakAny Linkage
+@b = weak global i32 0, align 4
+
+define i32 @my_access_global_b() #0 {
+; X32-LABEL: my_access_global_b:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl b@GOTOFF(%eax), %eax
+; X64-LABEL: my_access_global_b:
+; X64: movl b(%rip), %eax
+
+entry:
+ %0 = load i32, i32* @b, align 4
+ ret i32 %0
+}
+
+; Internal Linkage
+@c = internal global i32 0, align 4
+
+define i32 @my_access_global_c() #0 {
+; X32-LABEL: my_access_global_c:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl c@GOTOFF(%eax), %eax
+; X64-LABEL: my_access_global_c:
+; X64: movl c(%rip), %eax
+
+entry:
+ %0 = load i32, i32* @c, align 4
+ ret i32 %0
+}
+
+; External Linkage, only declaration.
+@d = external global i32, align 4
+
+define i32 @my_access_global_load_d() #0 {
+; X32-LABEL: my_access_global_load_d:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl d@GOT(%eax), %eax
+; X32-NEXT: movl (%eax), %eax
+; X64-LABEL: my_access_global_load_d:
+; X64: movq d@GOTPCREL(%rip), %rax
+; X64-NEXT: movl (%rax), %eax
+
+entry:
+ %0 = load i32, i32* @d, align 4
+ ret i32 %0
+}
+
+; External Linkage, only declaration, store a value.
+
+define i32 @my_access_global_store_d() #0 {
+; X32-LABEL: my_access_global_store_d:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl d@GOT(%eax), %eax
+; X32-NEXT: movl $2, (%eax)
+; X64-LABEL: my_access_global_store_d:
+; X64: movq d@GOTPCREL(%rip), %rax
+; X64-NEXT: movl $2, (%rax)
+
+entry:
+ store i32 2, i32* @d, align 4
+ ret i32 0
+}
+
+; External Linkage, function pointer access.
+declare i32 @access_fp(i32 ()*)
+declare i32 @foo()
+
+define i32 @my_access_fp_foo() #0 {
+; X32-LABEL: my_access_fp_foo:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %ebx
+; X32-NEXT: movl foo@GOT(%ebx), %eax
+; X64-LABEL: my_access_fp_foo:
+; X64: movq foo@GOTPCREL(%rip), %rdi
+
+entry:
+ %call = call i32 @access_fp(i32 ()* @foo)
+ ret i32 %call
+}
+
+; LinkOnceODR Linkage, function pointer access.
+
+$bar = comdat any
+
+define linkonce_odr i32 @bar() comdat {
+entry:
+ ret i32 0
+}
+
+define i32 @my_access_fp_bar() #0 {
+; X32-LABEL: my_access_fp_bar:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %ebx
+; X32-NEXT: leal bar@GOTOFF(%ebx), %eax
+; X64-LABEL: my_access_fp_bar:
+; X64: leaq bar(%rip), %rdi
+
+entry:
+ %call = call i32 @access_fp(i32 ()* @bar)
+ ret i32 %call
+}
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"PIC Level", i32 1}
+!1 = !{i32 1, !"PIE Level", i32 1}
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index 92440f2b3316..ea6df468ceb2 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -93,11 +93,13 @@ bb7:
; DARWIN64: Lfunc_end
; DARWIN64-NEXT: .cfi_endproc
; DARWIN64-NOT: .section
+; DARWIN64: .data_region jt32
; DARWIN64: LJTI{{.*}}:
; DARWIN64-NEXT: .long
; DARWIN64-NEXT: .long
; DARWIN64-NEXT: .long
; DARWIN64-NEXT: .long
+; DARWIN64-NEXT: .end_data_region
; DARWIN64-NEXT: .section __TEXT,__gcc_except_tab
; int G1;
@@ -241,13 +243,13 @@ bb7:
; DARWIN: .section __DATA,__data{{$}}
; DARWIN: .globl _G10
; DARWIN: .weak_definition _G10
-; DARWIN: .align 5
+; DARWIN: .p2align 5
; DARWIN: _G10:
; DARWIN: .space 400
; LINUX: .bss
; LINUX: .weak G10
-; LINUX: .align 32
+; LINUX: .p2align 5
; LINUX: G10:
; LINUX: .zero 400
@@ -298,3 +300,32 @@ bb7:
; WIN32-SECTIONS: .section .rdata,"dr",one_only,_G15
; WIN32-SECTIONS: _G15:
+
+@G16 = unnamed_addr constant i256 0
+
+; LINUX: .section .rodata.cst32,"aM",@progbits,32
+; LINUX: G16:
+
+; LINUX-SECTIONS: .section .rodata.cst32,"aM",@progbits,32
+; LINUX-SECTIONS: G16:
+
+; WIN32-SECTIONS: .section .rdata,"dr",one_only,_G16
+; WIN32-SECTIONS: _G16:
+
+; PR26570
+
+@G17 = internal global i8 0
+; LINUX: .type G17,@object
+; LINUX: .local G17
+; LINUX: .comm G17,1,1
+
+; DARWIN: .zerofill __DATA,__bss,_G17,1,0
+
+; LINUX-SECTIONS: .type G17,@object
+; LINUX-SECTIONS: .section .bss.G17,"aw",@nobits
+; LINUX-SECTIONS: .byte 0
+; LINUX-SECTIONS: .size G17, 1
+
+; WIN32-SECTIONS: .section .bss,"bw",one_only,_G17
+; WIN32-SECTIONS: _G17:
; WIN32-SECTIONS: .byte 0
diff --git a/test/CodeGen/X86/h-registers-3.ll b/test/CodeGen/X86/h-registers-3.ll
index 58b02b7df21f..819f21625abf 100644
--- a/test/CodeGen/X86/h-registers-3.ll
+++ b/test/CodeGen/X86/h-registers-3.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | grep mov | count 1
-; RUN: llc < %s -march=x86-64 | grep mov | count 1
-; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | grep mov | count 1
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s -check-prefix=X32
define zeroext i8 @foo() nounwind ssp {
entry:
@@ -8,6 +8,28 @@ entry:
%1 = lshr i16 %0, 8
%2 = trunc i16 %1 to i8
ret i8 %2
+
+; X86-LABEL: foo
+; X86: calll
+; X86-NEXT: movb %ah, %al
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+
+; X64-LABEL: foo
+; X64: callq
+; X64-NEXT: # kill
+; X64-NEXT: shrl $8, %eax
+; X64-NEXT: # kill
+; X64-NEXT: popq
+; X64-NEXT: retq
+
+; X32-LABEL: foo
+; X32: callq
+; X32-NEXT: # kill
+; X32-NEXT: shrl $8, %eax
+; X32-NEXT: # kill
+; X32-NEXT: popq
+; X32-NEXT: retq
}
declare zeroext i16 @bar(...)
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
index c6bac5858807..517a663bc815 100644
--- a/test/CodeGen/X86/haddsub-2.ll
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -1,11 +1,19 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE3
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse3,+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSSE3
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -march=x86-64 -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
-
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: hadd_ps_test1:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hadd_ps_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
%vecext1 = extractelement <4 x float> %A, i32 1
%add = fadd float %vecext, %vecext1
@@ -24,12 +32,17 @@ define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: hadd_ps_test1
-; CHECK: haddps
-; CHECK-NEXT: ret
-
define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: hadd_ps_test2:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hadd_ps_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%add = fadd float %vecext, %vecext1
@@ -48,12 +61,17 @@ define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: hadd_ps_test2
-; CHECK: haddps
-; CHECK-NEXT: ret
-
define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: hsub_ps_test1:
+; SSE: # BB#0:
+; SSE-NEXT: hsubps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hsub_ps_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
%vecext1 = extractelement <4 x float> %A, i32 1
%sub = fsub float %vecext, %vecext1
@@ -72,12 +90,17 @@ define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: hsub_ps_test1
-; CHECK: hsubps
-; CHECK-NEXT: ret
-
define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: hsub_ps_test2:
+; SSE: # BB#0:
+; SSE-NEXT: hsubps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hsub_ps_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%sub = fsub float %vecext, %vecext1
@@ -96,12 +119,46 @@ define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: hsub_ps_test2
-; CHECK: hsubps
-; CHECK-NEXT: ret
-
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE3-LABEL: phadd_d_test1:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edi
+; SSE3-NEXT: addl %eax, %edi
+; SSE3-NEXT: movd %edi, %xmm0
+; SSE3-NEXT: movd %edx, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: movd %esi, %xmm2
+; SSE3-NEXT: movd %ecx, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: phadd_d_test1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: phadd_d_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%add = add i32 %vecext, %vecext1
@@ -120,15 +177,46 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: phadd_d_test1
-; SSE3-NOT: phaddd
-; SSSE3: phaddd
-; AVX: vphaddd
-; AVX2 vphaddd
-; CHECK: ret
-
define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE3-LABEL: phadd_d_test2:
+; SSE3: # BB#0:
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %esi, %xmm0
+; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: movd %xmm1, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movd %ecx, %xmm1
+; SSE3-NEXT: movd %edx, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: phadd_d_test2:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: phadd_d_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 2
%vecext1 = extractelement <4 x i32> %A, i32 3
%add = add i32 %vecext, %vecext1
@@ -147,15 +235,46 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: phadd_d_test2
-; SSE3-NOT: phaddd
-; SSSE3: phaddd
-; AVX: vphaddd
-; AVX2 vphaddd
-; CHECK: ret
-
define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE3-LABEL: phsub_d_test1:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: subl %ecx, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: subl %edx, %ecx
+; SSE3-NEXT: movd %xmm1, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: subl %esi, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edi
+; SSE3-NEXT: subl %edi, %esi
+; SSE3-NEXT: movd %esi, %xmm0
+; SSE3-NEXT: movd %ecx, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: movd %edx, %xmm2
+; SSE3-NEXT: movd %eax, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: phsub_d_test1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: phsub_d_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%sub = sub i32 %vecext, %vecext1
@@ -174,15 +293,46 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: phsub_d_test1
-; SSE3-NOT: phsubd
-; SSSE3: phsubd
-; AVX: vphsubd
-; AVX2 vphsubd
-; CHECK: ret
-
define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE3-LABEL: phsub_d_test2:
+; SSE3: # BB#0:
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: subl %ecx, %eax
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: subl %edx, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: subl %esi, %edx
+; SSE3-NEXT: movd %edx, %xmm0
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: subl %edx, %eax
+; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: movd %ecx, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: phsub_d_test2:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: phsub_d_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 2
%vecext1 = extractelement <4 x i32> %A, i32 3
%sub = sub i32 %vecext, %vecext1
@@ -201,15 +351,17 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: phsub_d_test2
-; SSE3-NOT: phsubd
-; SSSE3: phsubd
-; AVX: vphsubd
-; AVX2 vphsubd
-; CHECK: ret
-
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: hadd_pd_test1:
+; SSE: # BB#0:
+; SSE-NEXT: haddpd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hadd_pd_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 0
%vecext1 = extractelement <2 x double> %A, i32 1
%add = fadd double %vecext, %vecext1
@@ -220,12 +372,17 @@ define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: hadd_pd_test1
-; CHECK: haddpd
-; CHECK-NEXT: ret
-
define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: hadd_pd_test2:
+; SSE: # BB#0:
+; SSE-NEXT: haddpd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hadd_pd_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 1
%vecext1 = extractelement <2 x double> %A, i32 0
%add = fadd double %vecext, %vecext1
@@ -236,12 +393,17 @@ define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: hadd_pd_test2
-; CHECK: haddpd
-; CHECK-NEXT: ret
-
define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: hsub_pd_test1:
+; SSE: # BB#0:
+; SSE-NEXT: hsubpd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hsub_pd_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 0
%vecext1 = extractelement <2 x double> %A, i32 1
%sub = fsub double %vecext, %vecext1
@@ -252,12 +414,17 @@ define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: hsub_pd_test1
-; CHECK: hsubpd
-; CHECK-NEXT: ret
-
define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: hsub_pd_test2:
+; SSE: # BB#0:
+; SSE-NEXT: hsubpd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hsub_pd_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %B, i32 0
%vecext1 = extractelement <2 x double> %B, i32 1
%sub = fsub double %vecext, %vecext1
@@ -268,12 +435,23 @@ define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: hsub_pd_test2
-; CHECK: hsubpd
-; CHECK-NEXT: ret
-
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
+; SSE-LABEL: avx_vhadd_pd_test:
+; SSE: # BB#0:
+; SSE-NEXT: haddpd %xmm1, %xmm0
+; SSE-NEXT: haddpd %xmm3, %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_vhadd_pd_test:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %A, i32 0
%vecext1 = extractelement <4 x double> %A, i32 1
%add = fadd double %vecext, %vecext1
@@ -292,19 +470,23 @@ define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
%vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
ret <4 x double> %vecinit13
}
-; CHECK-LABEL: avx_vhadd_pd_test
-; SSE3: haddpd
-; SSE3-NEXT: haddpd
-; SSSE3: haddpd
-; SSSE3: haddpd
-; AVX: vhaddpd
-; AVX: vhaddpd
-; AVX2: vhaddpd
-; AVX2: vhaddpd
-; CHECK: ret
-
define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
+; SSE-LABEL: avx_vhsub_pd_test:
+; SSE: # BB#0:
+; SSE-NEXT: hsubpd %xmm1, %xmm0
+; SSE-NEXT: hsubpd %xmm3, %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_vhsub_pd_test:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vhsubpd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %A, i32 0
%vecext1 = extractelement <4 x double> %A, i32 1
%sub = fsub double %vecext, %vecext1
@@ -323,19 +505,86 @@ define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
%vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
ret <4 x double> %vecinit13
}
-; CHECK-LABEL: avx_vhsub_pd_test
-; SSE3: hsubpd
-; SSE3-NEXT: hsubpd
-; SSSE3: hsubpd
-; SSSE3-NEXT: hsubpd
-; AVX: vhsubpd
-; AVX: vhsubpd
-; AVX2: vhsubpd
-; AVX2: vhsubpd
-; CHECK: ret
-
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
+; SSE3-LABEL: avx2_vphadd_d_test:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm4, %r8d
+; SSE3-NEXT: addl %ecx, %r8d
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm4, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r9d
+; SSE3-NEXT: addl %edx, %r9d
+; SSE3-NEXT: movd %xmm1, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r10d
+; SSE3-NEXT: addl %esi, %r10d
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edi
+; SSE3-NEXT: addl %esi, %edi
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r11d
+; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %edi, %xmm0
+; SSE3-NEXT: movd %r9d, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r8d, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: movd %esi, %xmm1
+; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT: movd %edx, %xmm3
+; SSE3-NEXT: movd %r11d, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: avx2_vphadd_d_test:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: phaddd %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: avx2_vphadd_d_test:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avx2_vphadd_d_test:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vphaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vphaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %A, i32 0
%vecext1 = extractelement <8 x i32> %A, i32 1
%add = add i32 %vecext, %vecext1
@@ -370,17 +619,154 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
%vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
ret <8 x i32> %vecinit29
}
-; CHECK-LABEL: avx2_vphadd_d_test
-; SSE3-NOT: phaddd
-; SSSE3: phaddd
-; SSSE3-NEXT: phaddd
-; AVX: vphaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; AVX2: vphaddd
-; CHECK: ret
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
+; SSE3-LABEL: avx2_vphadd_w_test:
+; SSE3: # BB#0:
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: .Ltmp0:
+; SSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE3-NEXT: pushq %r15
+; SSE3-NEXT: .Ltmp1:
+; SSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE3-NEXT: pushq %r14
+; SSE3-NEXT: .Ltmp2:
+; SSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE3-NEXT: pushq %r13
+; SSE3-NEXT: .Ltmp3:
+; SSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE3-NEXT: pushq %r12
+; SSE3-NEXT: .Ltmp4:
+; SSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE3-NEXT: pushq %rbx
+; SSE3-NEXT: .Ltmp5:
+; SSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE3-NEXT: .Ltmp6:
+; SSE3-NEXT: .cfi_offset %rbx, -56
+; SSE3-NEXT: .Ltmp7:
+; SSE3-NEXT: .cfi_offset %r12, -48
+; SSE3-NEXT: .Ltmp8:
+; SSE3-NEXT: .cfi_offset %r13, -40
+; SSE3-NEXT: .Ltmp9:
+; SSE3-NEXT: .cfi_offset %r14, -32
+; SSE3-NEXT: .Ltmp10:
+; SSE3-NEXT: .cfi_offset %r15, -24
+; SSE3-NEXT: .Ltmp11:
+; SSE3-NEXT: .cfi_offset %rbp, -16
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pextrw $1, %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm0, %eax
+; SSE3-NEXT: pextrw $3, %xmm0, %r11d
+; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pextrw $4, %xmm0, %eax
+; SSE3-NEXT: pextrw $5, %xmm0, %r10d
+; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: pextrw $6, %xmm0, %eax
+; SSE3-NEXT: pextrw $7, %xmm0, %r13d
+; SSE3-NEXT: addl %eax, %r13d
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pextrw $1, %xmm1, %r14d
+; SSE3-NEXT: addl %eax, %r14d
+; SSE3-NEXT: pextrw $2, %xmm1, %eax
+; SSE3-NEXT: pextrw $3, %xmm1, %ebp
+; SSE3-NEXT: addl %eax, %ebp
+; SSE3-NEXT: pextrw $4, %xmm1, %eax
+; SSE3-NEXT: pextrw $5, %xmm1, %ebx
+; SSE3-NEXT: addl %eax, %ebx
+; SSE3-NEXT: pextrw $6, %xmm1, %eax
+; SSE3-NEXT: pextrw $7, %xmm1, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pextrw $1, %xmm2, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm2, %eax
+; SSE3-NEXT: pextrw $3, %xmm2, %r12d
+; SSE3-NEXT: addl %eax, %r12d
+; SSE3-NEXT: pextrw $4, %xmm2, %eax
+; SSE3-NEXT: pextrw $5, %xmm2, %r15d
+; SSE3-NEXT: addl %eax, %r15d
+; SSE3-NEXT: pextrw $6, %xmm2, %eax
+; SSE3-NEXT: pextrw $7, %xmm2, %r8d
+; SSE3-NEXT: addl %eax, %r8d
+; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: pextrw $1, %xmm3, %r9d
+; SSE3-NEXT: addl %eax, %r9d
+; SSE3-NEXT: pextrw $2, %xmm3, %eax
+; SSE3-NEXT: pextrw $3, %xmm3, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: pextrw $4, %xmm3, %eax
+; SSE3-NEXT: pextrw $5, %xmm3, %edi
+; SSE3-NEXT: addl %eax, %edi
+; SSE3-NEXT: pextrw $6, %xmm3, %ecx
+; SSE3-NEXT: pextrw $7, %xmm3, %eax
+; SSE3-NEXT: addl %ecx, %eax
+; SSE3-NEXT: movd %edx, %xmm8
+; SSE3-NEXT: movd %r13d, %xmm3
+; SSE3-NEXT: movd %ebp, %xmm9
+; SSE3-NEXT: movd %r11d, %xmm4
+; SSE3-NEXT: movd %ebx, %xmm10
+; SSE3-NEXT: movd %r10d, %xmm7
+; SSE3-NEXT: movd %r14d, %xmm11
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movd %eax, %xmm12
+; SSE3-NEXT: movd %r8d, %xmm6
+; SSE3-NEXT: movd %esi, %xmm13
+; SSE3-NEXT: movd %r12d, %xmm5
+; SSE3-NEXT: movd %edi, %xmm14
+; SSE3-NEXT: movd %r15d, %xmm2
+; SSE3-NEXT: movd %r9d, %xmm15
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: popq %r12
+; SSE3-NEXT: popq %r13
+; SSE3-NEXT: popq %r14
+; SSE3-NEXT: popq %r15
+; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: avx2_vphadd_w_test:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddw %xmm1, %xmm0
+; SSSE3-NEXT: phaddw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: avx2_vphadd_w_test:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vphaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avx2_vphadd_w_test:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <16 x i16> %a, i32 0
%vecext1 = extractelement <16 x i16> %a, i32 1
%add = add i16 %vecext, %vecext1
@@ -447,20 +833,58 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
%vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
ret <16 x i16> %vecinit108
}
-; CHECK-LABEL: avx2_vphadd_w_test
-; SSE3-NOT: phaddw
-; SSSE3: phaddw
-; SSSE3-NEXT: phaddw
-; AVX: vphaddw
-; AVX: vphaddw
-; AVX2: vphaddw
-; AVX2: vphaddw
-; CHECK: ret
-
; Verify that we don't select horizontal subs in the following functions.
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: not_a_hsub_1:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE-NEXT: movd %xmm2, %ecx
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %ecx
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: movd %xmm1, %esi
+; SSE-NEXT: subl %esi, %edx
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE-NEXT: movd %xmm0, %esi
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm0, %edi
+; SSE-NEXT: subl %edi, %esi
+; SSE-NEXT: movd %esi, %xmm0
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movd %edx, %xmm2
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: not_a_hsub_1:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpextrd $1, %xmm0, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpextrd $2, %xmm0, %ecx
+; AVX-NEXT: vpextrd $3, %xmm0, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vpextrd $1, %xmm1, %edx
+; AVX-NEXT: vmovd %xmm1, %esi
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: vpextrd $3, %xmm1, %esi
+; AVX-NEXT: vpextrd $2, %xmm1, %edi
+; AVX-NEXT: subl %edi, %esi
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%sub = sub i32 %vecext, %vecext1
@@ -479,12 +903,45 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: not_a_hsub_1
-; CHECK-NOT: phsubd
-; CHECK: ret
-
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: not_a_hsub_2:
+; SSE: # BB#0:
+; SSE-NEXT: movapd %xmm0, %xmm2
+; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: movapd %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE-NEXT: subss %xmm3, %xmm2
+; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT: subss %xmm3, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1,0]
+; SSE-NEXT: subss %xmm4, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT: subss %xmm3, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: not_a_hsub_2:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%sub = fsub float %vecext, %vecext1
@@ -503,12 +960,28 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: not_a_hsub_2
-; CHECK-NOT: hsubps
-; CHECK: ret
-
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: not_a_hsub_3:
+; SSE: # BB#0:
+; SSE-NEXT: movapd %xmm1, %xmm2
+; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: subsd %xmm2, %xmm1
+; SSE-NEXT: movapd %xmm0, %xmm2
+; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: subsd %xmm0, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: not_a_hsub_3:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX-NEXT: vsubsd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %B, i32 0
%vecext1 = extractelement <2 x double> %B, i32 1
%sub = fsub double %vecext, %vecext1
@@ -519,15 +992,21 @@ define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: not_a_hsub_3
-; CHECK-NOT: hsubpd
-; CHECK: ret
-
; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: avx_vhadd_ps:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm2, %xmm0
+; SSE-NEXT: haddps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_vhadd_ps:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -562,17 +1041,18 @@ define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
%vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
ret <8 x float> %vecinit29
}
-; CHECK-LABEL: avx_vhadd_ps
-; SSE3: haddps
-; SSE3-NEXT: haddps
-; SSSE3: haddps
-; SSSE3-NEXT: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK: ret
-
define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: avx_vhsub_ps:
+; SSE: # BB#0:
+; SSE-NEXT: hsubps %xmm2, %xmm0
+; SSE-NEXT: hsubps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_vhsub_ps:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%sub = fsub float %vecext, %vecext1
@@ -607,17 +1087,18 @@ define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
%vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
ret <8 x float> %vecinit29
}
-; CHECK-LABEL: avx_vhsub_ps
-; SSE3: hsubps
-; SSE3-NEXT: hsubps
-; SSSE3: hsubps
-; SSSE3-NEXT: hsubps
-; AVX: vhsubps
-; AVX2: vhsubps
-; CHECK: ret
-
define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: avx_hadd_pd:
+; SSE: # BB#0:
+; SSE-NEXT: haddpd %xmm2, %xmm0
+; SSE-NEXT: haddpd %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_hadd_pd:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %a, i32 0
%vecext1 = extractelement <4 x double> %a, i32 1
%add = fadd double %vecext, %vecext1
@@ -636,17 +1117,18 @@ define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
%vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
ret <4 x double> %vecinit13
}
-; CHECK-LABEL: avx_hadd_pd
-; SSE3: haddpd
-; SSE3-NEXT: haddpd
-; SSSE3: haddpd
-; SSSE3-NEXT: haddpd
-; AVX: vhaddpd
-; AVX2: vhaddpd
-; CHECK: ret
-
define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: avx_hsub_pd:
+; SSE: # BB#0:
+; SSE-NEXT: hsubpd %xmm2, %xmm0
+; SSE-NEXT: hsubpd %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_hsub_pd:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %a, i32 0
%vecext1 = extractelement <4 x double> %a, i32 1
%sub = fsub double %vecext, %vecext1
@@ -665,19 +1147,83 @@ define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
%vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
ret <4 x double> %vecinit13
}
-; CHECK-LABEL: avx_hsub_pd
-; SSE3: hsubpd
-; SSE3-NEXT: hsubpd
-; SSSE3: hsubpd
-; SSSE3-NEXT: hsubpd
-; AVX: vhsubpd
-; AVX2: vhsubpd
-; CHECK: ret
-
; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
+; SSE3-LABEL: avx2_hadd_d:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm4, %r8d
+; SSE3-NEXT: addl %ecx, %r8d
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm4, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r9d
+; SSE3-NEXT: addl %edx, %r9d
+; SSE3-NEXT: movd %xmm2, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r10d
+; SSE3-NEXT: addl %esi, %r10d
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edi
+; SSE3-NEXT: addl %esi, %edi
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r11d
+; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %edi, %xmm0
+; SSE3-NEXT: movd %r9d, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r8d, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: movd %esi, %xmm1
+; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT: movd %edx, %xmm3
+; SSE3-NEXT: movd %r11d, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: avx2_hadd_d:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm2, %xmm0
+; SSSE3-NEXT: phaddd %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: avx2_hadd_d:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avx2_hadd_d:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
@@ -712,18 +1258,149 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
%vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
ret <8 x i32> %vecinit29
}
-; CHECK-LABEL: avx2_hadd_d
-; SSE3-NOT: phaddd
-; SSSE3: phaddd
-; SSSE3-NEXT: phaddd
-; AVX: vphaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; AVX2-NOT: vphaddd
-; CHECK: ret
-
define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
+; SSE3-LABEL: avx2_hadd_w:
+; SSE3: # BB#0:
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: .Ltmp12:
+; SSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE3-NEXT: pushq %r15
+; SSE3-NEXT: .Ltmp13:
+; SSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE3-NEXT: pushq %r14
+; SSE3-NEXT: .Ltmp14:
+; SSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE3-NEXT: pushq %r13
+; SSE3-NEXT: .Ltmp15:
+; SSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE3-NEXT: pushq %r12
+; SSE3-NEXT: .Ltmp16:
+; SSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE3-NEXT: pushq %rbx
+; SSE3-NEXT: .Ltmp17:
+; SSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE3-NEXT: .Ltmp18:
+; SSE3-NEXT: .cfi_offset %rbx, -56
+; SSE3-NEXT: .Ltmp19:
+; SSE3-NEXT: .cfi_offset %r12, -48
+; SSE3-NEXT: .Ltmp20:
+; SSE3-NEXT: .cfi_offset %r13, -40
+; SSE3-NEXT: .Ltmp21:
+; SSE3-NEXT: .cfi_offset %r14, -32
+; SSE3-NEXT: .Ltmp22:
+; SSE3-NEXT: .cfi_offset %r15, -24
+; SSE3-NEXT: .Ltmp23:
+; SSE3-NEXT: .cfi_offset %rbp, -16
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pextrw $1, %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm0, %eax
+; SSE3-NEXT: pextrw $3, %xmm0, %r15d
+; SSE3-NEXT: addl %eax, %r15d
+; SSE3-NEXT: pextrw $4, %xmm0, %eax
+; SSE3-NEXT: pextrw $5, %xmm0, %r14d
+; SSE3-NEXT: addl %eax, %r14d
+; SSE3-NEXT: pextrw $6, %xmm0, %eax
+; SSE3-NEXT: pextrw $7, %xmm0, %r13d
+; SSE3-NEXT: addl %eax, %r13d
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pextrw $1, %xmm1, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm1, %eax
+; SSE3-NEXT: pextrw $3, %xmm1, %r11d
+; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pextrw $4, %xmm1, %eax
+; SSE3-NEXT: pextrw $5, %xmm1, %r10d
+; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: pextrw $6, %xmm1, %eax
+; SSE3-NEXT: pextrw $7, %xmm1, %r12d
+; SSE3-NEXT: addl %eax, %r12d
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pextrw $1, %xmm2, %ebx
+; SSE3-NEXT: addl %eax, %ebx
+; SSE3-NEXT: pextrw $2, %xmm2, %eax
+; SSE3-NEXT: pextrw $3, %xmm2, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: pextrw $4, %xmm2, %esi
+; SSE3-NEXT: pextrw $5, %xmm2, %r8d
+; SSE3-NEXT: addl %esi, %r8d
+; SSE3-NEXT: pextrw $6, %xmm2, %esi
+; SSE3-NEXT: pextrw $7, %xmm2, %edx
+; SSE3-NEXT: addl %esi, %edx
+; SSE3-NEXT: movd %xmm3, %edi
+; SSE3-NEXT: pextrw $1, %xmm3, %r9d
+; SSE3-NEXT: addl %edi, %r9d
+; SSE3-NEXT: pextrw $2, %xmm3, %ebp
+; SSE3-NEXT: pextrw $3, %xmm3, %edi
+; SSE3-NEXT: addl %ebp, %edi
+; SSE3-NEXT: pextrw $4, %xmm3, %eax
+; SSE3-NEXT: pextrw $5, %xmm3, %ebp
+; SSE3-NEXT: addl %eax, %ebp
+; SSE3-NEXT: pextrw $6, %xmm3, %esi
+; SSE3-NEXT: pextrw $7, %xmm3, %eax
+; SSE3-NEXT: addl %esi, %eax
+; SSE3-NEXT: movd %edx, %xmm8
+; SSE3-NEXT: movd %r13d, %xmm3
+; SSE3-NEXT: movd %ecx, %xmm9
+; SSE3-NEXT: movd %r15d, %xmm4
+; SSE3-NEXT: movd %r8d, %xmm10
+; SSE3-NEXT: movd %r14d, %xmm7
+; SSE3-NEXT: movd %ebx, %xmm11
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movd %eax, %xmm12
+; SSE3-NEXT: movd %r12d, %xmm6
+; SSE3-NEXT: movd %edi, %xmm13
+; SSE3-NEXT: movd %r11d, %xmm5
+; SSE3-NEXT: movd %ebp, %xmm14
+; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r9d, %xmm15
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: popq %r12
+; SSE3-NEXT: popq %r13
+; SSE3-NEXT: popq %r14
+; SSE3-NEXT: popq %r15
+; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: avx2_hadd_w:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddw %xmm2, %xmm0
+; SSSE3-NEXT: phaddw %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: avx2_hadd_w:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avx2_hadd_w:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <16 x i16> %a, i32 0
%vecext1 = extractelement <16 x i16> %a, i32 1
%add = add i16 %vecext, %vecext1
@@ -790,13 +1467,3 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
%vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
ret <16 x i16> %vecinit108
}
-; CHECK-LABEL: avx2_hadd_w
-; SSE3-NOT: phaddw
-; SSSE3: phaddw
-; SSSE3-NEXT: phaddw
-; AVX: vphaddw
-; AVX: vphaddw
-; AVX2: vphaddw
-; AVX2-NOT: vphaddw
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
index dfe5fff72d07..5e2e50893d03 100644
--- a/test/CodeGen/X86/haddsub-undef.ll
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -1,10 +1,20 @@
-; RUN: llc < %s -march=x86-64 -mattr=ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -march=x86-64 -mattr=avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -march=x86-64 -mattr=avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test1_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test1_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -19,14 +29,17 @@ define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
%vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: test1_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -41,14 +54,17 @@ define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: test2_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test3_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test3_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -63,38 +79,57 @@ define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
%vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
ret <4 x float> %vecinit9
}
-; CHECK-LABEL: test3_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test4_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test4_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
%vecinit = insertelement <4 x float> undef, float %add, i32 0
ret <4 x float> %vecinit
}
-; CHECK-LABEL: test4_undef
-; CHECK-NOT: haddps
-; CHECK: ret
-
define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test5_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: addsd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test5_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %a, i32 0
%vecext1 = extractelement <2 x double> %a, i32 1
%add = fadd double %vecext, %vecext1
%vecinit = insertelement <2 x double> undef, double %add, i32 0
ret <2 x double> %vecinit
}
-; CHECK-LABEL: test5_undef
-; CHECK-NOT: haddpd
-; CHECK: ret
-
define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test6_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test6_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -105,14 +140,17 @@ define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
ret <4 x float> %vecinit5
}
-; CHECK-LABEL: test6_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test7_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test7_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %b, i32 0
%vecext1 = extractelement <4 x float> %b, i32 1
%add = fadd float %vecext, %vecext1
@@ -123,14 +161,30 @@ define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
ret <4 x float> %vecinit5
}
-; CHECK-LABEL: test7_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test8_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test8_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -141,12 +195,17 @@ define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
ret <4 x float> %vecinit5
}
-; CHECK-LABEL: test8_undef
-; CHECK-NOT: haddps
-; CHECK: ret
-
define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test9_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test9_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -157,11 +216,17 @@ define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
ret <4 x float> %vecinit5
}
-; CHECK-LABEL: test9_undef
-; CHECK: haddps
-; CHECK-NEXT: ret
define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: test10_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test10_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -172,14 +237,21 @@ define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
%vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
ret <8 x float> %vecinit5
}
-; CHECK-LABEL: test10_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NOT: haddps
-; CHECK: ret
define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: test11_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE-NEXT: addss %xmm3, %xmm1
+; SSE-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test11_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -190,13 +262,17 @@ define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
%vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
ret <8 x float> %vecinit5
}
-; CHECK-LABEL: test11_undef
-; SSE-NOT: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK: ret
define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: test12_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test12_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -207,14 +283,18 @@ define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
%vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
ret <8 x float> %vecinit5
}
-; CHECK-LABEL: test12_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NOT: haddps
-; CHECK: ret
define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: test13_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test13_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add1 = fadd float %vecext, %vecext1
@@ -233,15 +313,22 @@ define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
%vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
ret <8 x float> %vecinit4
}
-; CHECK-LABEL: test13_undef
-; SSE: haddps
-; SSE-NOT: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NOT: haddps
-; CHECK: ret
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: test14_undef:
+; SSE: # BB#0:
+; SSE-NEXT: phaddd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test14_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test14_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
@@ -252,17 +339,45 @@ define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
ret <8 x i32> %vecinit5
}
-; CHECK-LABEL: test14_undef
-; SSE: phaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; CHECK-NOT: phaddd
-; CHECK: ret
; On AVX2, the following sequence can be folded into a single horizontal add.
-; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
+; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: test15_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT: movd %xmm0, %ecx
+; SSE-NEXT: addl %eax, %ecx
+; SSE-NEXT: movd %xmm3, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: addl %eax, %edx
+; SSE-NEXT: movd %ecx, %xmm0
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test15_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
+; AVX1-NEXT: addl %eax, %ecx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; AVX1-NEXT: addl %eax, %edx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test15_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
@@ -273,13 +388,22 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
ret <8 x i32> %vecinit5
}
-; CHECK-LABEL: test15_undef
-; SSE-NOT: phaddd
-; AVX-NOT: vphaddd
-; AVX2: vphaddd
-; CHECK: ret
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: test16_undef:
+; SSE: # BB#0:
+; SSE-NEXT: phaddd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test16_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test16_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
@@ -290,14 +414,24 @@ define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
ret <8 x i32> %vecinit5
}
-; CHECK-LABEL: test16_undef
-; SSE: phaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; CHECK-NOT: haddps
-; CHECK: ret
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: test17_undef:
+; SSE: # BB#0:
+; SSE-NEXT: phaddd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test17_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test17_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add1 = add i32 %vecext, %vecext1
@@ -316,10 +450,3 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
%vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
ret <8 x i32> %vecinit4
}
-; CHECK-LABEL: test17_undef
-; SSE: phaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; CHECK-NOT: haddps
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
index 6e65c6c739ca..8e28433d2ac2 100644
--- a/test/CodeGen/X86/haddsub.ll
+++ b/test/CodeGen/X86/haddsub.ll
@@ -1,293 +1,392 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,-avx | FileCheck %s -check-prefix=SSE3
-; RUN: llc < %s -march=x86-64 -mattr=-sse3,+avx | FileCheck %s -check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
-; SSE3-NOT: vhaddpd
-; SSE3: haddpd
+; SSE3: # BB#0:
+; SSE3-NEXT: haddpd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddpd1:
-; AVX: vhaddpd
-define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
%b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
%r = fadd <2 x double> %a, %b
ret <2 x double> %r
}
+define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
-; SSE3-NOT: vhaddpd
-; SSE3: haddpd
+; SSE3: # BB#0:
+; SSE3-NEXT: haddpd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddpd2:
-; AVX: vhaddpd
-define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
%b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
%r = fadd <2 x double> %a, %b
ret <2 x double> %r
}
+define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-LABEL: haddpd3:
-; SSE3-NOT: vhaddpd
-; SSE3: haddpd
+; SSE3: # BB#0:
+; SSE3-NEXT: haddpd %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddpd3:
-; AVX: vhaddpd
-define <2 x double> @haddpd3(<2 x double> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%r = fadd <2 x double> %a, %b
ret <2 x double> %r
}
+define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps1:
-; AVX: vhaddps
-define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps2:
-; AVX: vhaddps
-define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
%b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps3:
-; AVX: vhaddps
-define <4 x float> @haddps3(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps4:
-; AVX: vhaddps
-define <4 x float> @haddps4(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps5:
-; AVX: vhaddps
-define <4 x float> @haddps5(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-LABEL: haddps6:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps6:
-; AVX: vhaddps
-define <4 x float> @haddps6(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps7:
-; AVX: vhaddps
-define <4 x float> @haddps7(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
-; SSE3-NOT: vhsubpd
-; SSE3: hsubpd
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubpd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubpd1:
-; AVX: vhsubpd
-define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
%b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
%r = fsub <2 x double> %a, %b
ret <2 x double> %r
}
+define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-LABEL: hsubpd2:
-; SSE3-NOT: vhsubpd
-; SSE3: hsubpd
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubpd %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubpd2:
-; AVX: vhsubpd
-define <2 x double> @hsubpd2(<2 x double> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%r = fsub <2 x double> %a, %b
ret <2 x double> %r
}
+define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubps1:
-; AVX: vhsubps
-define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r = fsub <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubps2:
-; AVX: vhsubps
-define <4 x float> @hsubps2(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = fsub <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubps3:
-; AVX: vhsubps
-define <4 x float> @hsubps3(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = fsub <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-LABEL: hsubps4:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubps4:
-; AVX: vhsubps
-define <4 x float> @hsubps4(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = fsub <4 x float> %a, %b
ret <4 x float> %r
}
+define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm2, %xmm0
+; SSE3-NEXT: haddps %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhaddps1:
-; AVX: vhaddps
-define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
%r = fadd <8 x float> %a, %b
ret <8 x float> %r
}
+define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm2, %xmm0
+; SSE3-NEXT: haddps %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhaddps2:
-; AVX: vhaddps
-define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
%b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
%r = fadd <8 x float> %a, %b
ret <8 x float> %r
}
+define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: haddps %xmm1, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhaddps3:
-; AVX: vhaddps
-define <8 x float> @vhaddps3(<8 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = fadd <8 x float> %a, %b
ret <8 x float> %r
}
+define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm2, %xmm0
+; SSE3-NEXT: hsubps %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhsubps1:
-; AVX: vhsubps
-define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
%r = fsub <8 x float> %a, %b
ret <8 x float> %r
}
+define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: hsubps %xmm1, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhsubps3:
-; AVX: vhsubps
-define <8 x float> @vhsubps3(<8 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = fsub <8 x float> %a, %b
ret <8 x float> %r
}
+define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
-; SSE3-NOT: vhaddpd
-; SSE3: haddpd
-; SSE3: haddpd
+; SSE3: # BB#0:
+; SSE3-NEXT: haddpd %xmm2, %xmm0
+; SSE3-NEXT: haddpd %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhaddpd1:
-; AVX: vhaddpd
-define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%r = fadd <4 x double> %a, %b
ret <4 x double> %r
}
+define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
-; SSE3-NOT: vhsubpd
-; SSE3: hsubpd
-; SSE3: hsubpd
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubpd %xmm2, %xmm0
+; SSE3-NEXT: hsubpd %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhsubpd1:
-; AVX: vhsubpd
-define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%r = fsub <4 x double> %a, %b
ret <4 x double> %r
}
-; CHECK-LABEL: haddps_v2f32
-; CHECK: haddps %xmm{{[0-9]+}}, %xmm0
-; CHECK-NEXT: retq
define <2 x float> @haddps_v2f32(<4 x float> %v0) {
+; SSE3-LABEL: haddps_v2f32:
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
+; AVX-LABEL: haddps_v2f32:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%v0.0 = extractelement <4 x float> %v0, i32 0
%v0.1 = extractelement <4 x float> %v0, i32 1
%v0.2 = extractelement <4 x float> %v0, i32 2
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
index 3b2518e28f58..717ddbfa6fdc 100644
--- a/test/CodeGen/X86/half.ll
+++ b/test/CodeGen/X86/half.ll
@@ -1,12 +1,17 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWON
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWOFF
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C -check-prefix=BWON
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -asm-verbose=false -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefix=CHECK-I686
define void @test_load_store(half* %in, half* %out) {
; CHECK-LABEL: test_load_store:
-; CHECK: movw (%rdi), [[TMP:%[a-z0-9]+]]
-; CHECK: movw [[TMP]], (%rsi)
+; BWON: movzwl (%rdi), %eax
+; BWOFF: movw (%rdi), %ax
+; CHECK: movw %ax, (%rsi)
%val = load half, half* %in
store half %val, half* %out
ret void
@@ -14,7 +19,8 @@ define void @test_load_store(half* %in, half* %out) {
define i16 @test_bitcast_from_half(half* %addr) {
; CHECK-LABEL: test_bitcast_from_half:
-; CHECK: movzwl (%rdi), %eax
+; BWON: movzwl (%rdi), %eax
+; BWOFF: movw (%rdi), %ax
%val = load half, half* %addr
%val_int = bitcast half %val to i16
ret i16 %val_int
@@ -102,7 +108,7 @@ define void @test_sitofp_i64(i64 %a, half* %p) #0 {
; CHECK_LIBCALL-NEXT: retq
; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]]
-; CHECK-F16C-NEXT: vcvtps2ph $0, [[REG0]], [[REG0]]
+; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]]
; CHECK-F16C-NEXT: vmovd [[REG0]], %eax
; CHECK-F16C-NEXT: movw %ax, (%rsi)
; CHECK-F16C-NEXT: retq
@@ -175,7 +181,7 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 {
; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
; CHECK-LIBCALL-NEXT: popq [[ADDR]]
-; CHECK-F16C-NEXT: vcvtps2ph $0, [[REG1]], [[REG4:%[a-z0-9]+]]
+; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG1]], [[REG4:%[a-z0-9]+]]
; CHECK-F16C-NEXT: vmovd [[REG4]], %eax
; CHECK-F16C-NEXT: movw %ax, (%rsi)
; CHECK-NEXT: retq
@@ -260,4 +266,51 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) {
ret void
}
+declare float @test_floatret();
+
+; On i686, if SSE2 is available, the return value from test_floatret is loaded
+; to f80 and then rounded to f32. The DAG combiner should not combine this
+; fp_round and the subsequent fptrunc from float to half.
+define half @test_f80trunc_nodagcombine() #0 {
+; CHECK-LABEL: test_f80trunc_nodagcombine:
+; CHECK-I686-NOT: calll __truncxfhf2
+ %1 = call float @test_floatret()
+ %2 = fptrunc float %1 to half
+ ret half %2
+}
+
+; CHECK-LABEL: test_sitofp_fadd_i32:
+
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $16, %rsp
+; CHECK-LIBCALL-NEXT: movl %edi, %ebx
+; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp)
+; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: addss 12(%rsp), %xmm0
+; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+
+; CHECK-F16C-NEXT: movswl (%rsi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm1
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-F16C-NEXT: retq
+
+define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
+ %tmp0 = load half, half* %b
+ %tmp1 = sitofp i32 %a to half
+ %tmp2 = fadd half %tmp0, %tmp1
+ %tmp3 = fpext half %tmp2 to float
+ ret float %tmp3
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll
index e3808e754228..fbc4cd9d4f9c 100644
--- a/test/CodeGen/X86/hipe-cc.ll
+++ b/test/CodeGen/X86/hipe-cc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s
; Check the HiPE calling convention works (x86-32)
@@ -73,5 +73,23 @@ define cc 11 void @baz() nounwind {
ret void
}
+; Sanity-check the tail call sequence. The number of arguments was chosen so as to
+; expose a bug where the tail call sequence clobbered the stack.
+define cc 11 { i32, i32, i32 } @tailcaller(i32 %hp, i32 %p) nounwind {
+ ; CHECK: movl $15, %eax
+ ; CHECK-NEXT: movl $31, %edx
+ ; CHECK-NEXT: movl $47, %ecx
+ ; CHECK-NEXT: popl %edi
+ ; CHECK-NEXT: jmp tailcallee
+ %ret = tail call cc11 { i32, i32, i32 } @tailcallee(i32 %hp, i32 %p, i32 15,
+ i32 31, i32 47, i32 63) nounwind
+ ret { i32, i32, i32 } %ret
+}
+
+!hipe.literals = !{ !0, !1, !2 }
+!0 = !{ !"P_NSP_LIMIT", i32 84 }
+!1 = !{ !"X86_LEAF_WORDS", i32 24 }
+!2 = !{ !"AMD64_LEAF_WORDS", i32 24 }
@clos = external constant i32
declare cc 11 void @bar(i32, i32, i32, i32, i32)
+declare cc 11 { i32, i32, i32 } @tailcallee(i32, i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll
index 28d90399d857..43e2e1409fde 100644
--- a/test/CodeGen/X86/hipe-cc64.ll
+++ b/test/CodeGen/X86/hipe-cc64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s
; Check the HiPE calling convention works (x86-64)
@@ -83,5 +83,24 @@ define cc 11 void @baz() nounwind {
ret void
}
+; Sanity-check the tail call sequence. The number of arguments was chosen so as to
+; expose a bug where the tail call sequence clobbered the stack.
+define cc 11 { i64, i64, i64 } @tailcaller(i64 %hp, i64 %p) #0 {
+ ; CHECK: movl $15, %esi
+ ; CHECK-NEXT: movl $31, %edx
+ ; CHECK-NEXT: movl $47, %ecx
+ ; CHECK-NEXT: movl $63, %r8d
+ ; CHECK-NEXT: popq %rax
+ ; CHECK-NEXT: jmp tailcallee
+ %ret = tail call cc11 { i64, i64, i64 } @tailcallee(i64 %hp, i64 %p, i64 15,
+ i64 31, i64 47, i64 63, i64 79) #1
+ ret { i64, i64, i64 } %ret
+}
+
+!hipe.literals = !{ !0, !1, !2 }
+!0 = !{ !"P_NSP_LIMIT", i32 160 }
+!1 = !{ !"X86_LEAF_WORDS", i32 24 }
+!2 = !{ !"AMD64_LEAF_WORDS", i32 24 }
@clos = external constant i64
declare cc 11 void @bar(i64, i64, i64, i64, i64, i64)
+declare cc 11 { i64, i64, i64 } @tailcallee(i64, i64, i64, i64, i64, i64, i64)
diff --git a/test/CodeGen/X86/hipe-prologue.ll b/test/CodeGen/X86/hipe-prologue.ll
index 2f16423600c9..8588dff9bc63 100644
--- a/test/CodeGen/X86/hipe-prologue.ll
+++ b/test/CodeGen/X86/hipe-prologue.ll
@@ -24,8 +24,8 @@ define {i32, i32} @test_basic(i32 %hp, i32 %p) {
define cc 11 {i32, i32} @test_basic_hipecc(i32 %hp, i32 %p) {
; X32-Linux-LABEL: test_basic_hipecc:
- ; X32-Linux: leal -156(%esp), %ebx
- ; X32-Linux-NEXT: cmpl 76(%ebp), %ebx
+ ; X32-Linux: leal -140(%esp), %ebx
+ ; X32-Linux-NEXT: cmpl 120(%ebp), %ebx
; X32-Linux-NEXT: jb .LBB1_1
; X32-Linux: ret
@@ -34,8 +34,8 @@ define cc 11 {i32, i32} @test_basic_hipecc(i32 %hp, i32 %p) {
; X32-Linux-NEXT: calll inc_stack_0
; X64-Linux-LABEL: test_basic_hipecc:
- ; X64-Linux: leaq -232(%rsp), %r14
- ; X64-Linux-NEXT: cmpq 144(%rbp), %r14
+ ; X64-Linux: leaq -184(%rsp), %r14
+ ; X64-Linux-NEXT: cmpq 120(%rbp), %r14
; X64-Linux-NEXT: jb .LBB1_1
; X64-Linux: ret
@@ -65,3 +65,8 @@ define cc 11 {i32,i32,i32} @test_nocall_hipecc(i32 %hp,i32 %p,i32 %x,i32 %y) {
%6 = insertvalue {i32, i32, i32} %5, i32 %p, 2
ret {i32, i32, i32} %6
}
+
+!hipe.literals = !{ !0, !1, !2 }
+!0 = !{ !"P_NSP_LIMIT", i32 120 }
+!1 = !{ !"X86_LEAF_WORDS", i32 24 }
+!2 = !{ !"AMD64_LEAF_WORDS", i32 18 }
diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll
index 6798c2b30c3b..5ade5b470b54 100644
--- a/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/test/CodeGen/X86/hoist-invariant-load.ll
@@ -1,5 +1,10 @@
; REQUIRES: asserts
-; RUN: llc < %s -stats -O2 2>&1 | grep "1 machine-licm"
+; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machine-licm.*hoisted"
+; For test:
+; 2 invariant loads, 1 for OBJC_SELECTOR_REFERENCES_
+; and 1 for objc_msgSend from the GOT
+; For test_multi_def:
+; 2 invariant loads (full multiply, both loads should be hoisted).
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.2"
@@ -27,4 +32,32 @@ for.end: ; preds = %for.body
declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
+define void @test_multi_def(i64* dereferenceable(8) %x1,
+ i64* dereferenceable(8) %x2,
+ i128* %y, i64 %count) nounwind {
+entry:
+ br label %for.body
+
+for.check:
+ %inc = add nsw i64 %i, 1
+ %done = icmp sge i64 %inc, %count
+ br i1 %done, label %exit, label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %inc, %for.check ]
+ %x1_load = load i64, i64* %x1, align 8, !invariant.load !0
+ %x1_zext = zext i64 %x1_load to i128
+ %x2_load = load i64, i64* %x2, align 8, !invariant.load !0
+ %x2_zext = zext i64 %x2_load to i128
+ %x_prod = mul i128 %x1_zext, %x2_zext
+ %y_elem = getelementptr inbounds i128, i128* %y, i64 %i
+ %y_load = load i128, i128* %y_elem, align 8
+ %y_plus = add i128 %x_prod, %y_load
+ store i128 %y_plus, i128* %y_elem, align 8
+ br label %for.check
+
+exit:
+ ret void
+}
+
!0 = !{}
diff --git a/test/CodeGen/X86/hoist-spill-lpad.ll b/test/CodeGen/X86/hoist-spill-lpad.ll
new file mode 100644
index 000000000000..3171f6f9f6fd
--- /dev/null
+++ b/test/CodeGen/X86/hoist-spill-lpad.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+;
+; PR27612. The following spill is hoisted from two locations: the fall
+; through succ block and the landingpad block of a call which may throw
+; exception. If it is not hoisted before the call, the spill will be
+; missing on the landingpad path.
+;
+; CHECK-LABEL: _Z3foov:
+; CHECK: movq %rbx, (%rsp) # 8-byte Spill
+; CHECK-NEXT: callq _Z3goov
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = global [20 x i64] zeroinitializer, align 16
+@_ZTIi = external constant i8*
+
+; Function Attrs: uwtable
+define void @_Z3foov() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ %tmp = load i64, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 1), align 8
+ invoke void @_Z3goov()
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %tmp1 = landingpad { i8*, i32 }
+ cleanup
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ %tmp2 = extractvalue { i8*, i32 } %tmp1, 1
+ %tmp3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+ %matches = icmp eq i32 %tmp2, %tmp3
+ br i1 %matches, label %catch, label %ehcleanup
+
+catch: ; preds = %lpad
+ %tmp4 = extractvalue { i8*, i32 } %tmp1, 0
+ %tmp5 = tail call i8* @__cxa_begin_catch(i8* %tmp4)
+ store i64 %tmp, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 2), align 16
+ tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{memory},~{dirflag},~{fpsr},~{flags}"()
+ store i64 %tmp, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 3), align 8
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont: ; preds = %catch, %entry
+ store i64 %tmp, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 4), align 16
+ tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{memory},~{dirflag},~{fpsr},~{flags}"()
+ store i64 %tmp, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 5), align 8
+ ret void
+
+ehcleanup: ; preds = %lpad
+ resume { i8*, i32 } %tmp1
+}
+
+declare void @_Z3goov()
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
diff --git a/test/CodeGen/X86/hoist-spill.ll b/test/CodeGen/X86/hoist-spill.ll
new file mode 100644
index 000000000000..db9c4105a020
--- /dev/null
+++ b/test/CodeGen/X86/hoist-spill.ll
@@ -0,0 +1,121 @@
+; RUN: llc < %s | FileCheck %s
+
+; grep 'Spill' |sed 's%.*\(-[0-9]\+(\%rsp)\).*%\1%g' |sort |uniq -d |awk '{if (/rsp/); exit -1}'
+; Check that there are no spills to the same stack slot after hoisting.
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET3:-?[0-9]*]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET3]](%rsp)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global i32*, align 8
+@b = external global i32, align 4
+@d = external global i32*, align 8
+
+; Function Attrs: norecurse noreturn nounwind uwtable
+define void @fn1(i32 %p1) {
+entry:
+ %tmp = load i32*, i32** @d, align 8
+ %tmp1 = load i32*, i32** @a, align 8
+ %tmp2 = sext i32 %p1 to i64
+ br label %for.cond
+
+for.cond: ; preds = %for.inc14, %entry
+ %indvar = phi i32 [ %indvar.next, %for.inc14 ], [ 0, %entry ]
+ %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ]
+ %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ]
+ %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ]
+ %tmp3 = icmp sgt i32 undef, 0
+ %smax52 = select i1 %tmp3, i32 undef, i32 0
+ %tmp4 = zext i32 %smax52 to i64
+ %tmp5 = icmp sgt i64 undef, %tmp4
+ %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4
+ %tmp6 = add nsw i64 %smax53, 1
+ %tmp7 = sub nsw i64 %tmp6, %tmp4
+ %tmp8 = add nsw i64 %tmp7, -8
+ %tmp9 = sub i32 undef, %indvar
+ %tmp10 = icmp sgt i64 %tmp2, 0
+ %smax40 = select i1 %tmp10, i64 %tmp2, i64 0
+ %scevgep41 = getelementptr i32, i32* %tmp1, i64 %smax40
+ %indvars.iv30 = add i32 %indvars.iv30.in, -1
+ %tmp11 = icmp sgt i32 %indvars.iv30, 0
+ %smax = select i1 %tmp11, i32 %indvars.iv30, i32 0
+ %tmp12 = zext i32 %smax to i64
+ %sub = sub nsw i32 %p1, %c.0
+ %cmp = icmp sgt i32 %sub, 0
+ %sub. = select i1 %cmp, i32 %sub, i32 0
+ %cmp326 = icmp sgt i32 %k.0, %p1
+ br i1 %cmp326, label %for.cond4.preheader, label %for.body.preheader
+
+for.body.preheader: ; preds = %for.cond
+ br label %for.body
+
+for.cond4.preheader: ; preds = %for.body, %for.cond
+ %k.1.lcssa = phi i32 [ %k.0, %for.cond ], [ %add, %for.body ]
+ %cmp528 = icmp sgt i32 %sub., %p1
+ br i1 %cmp528, label %for.inc14, label %for.body6.preheader
+
+for.body6.preheader: ; preds = %for.cond4.preheader
+ br i1 undef, label %for.body6, label %min.iters.checked
+
+min.iters.checked: ; preds = %for.body6.preheader
+ br i1 undef, label %for.body6, label %vector.memcheck
+
+vector.memcheck: ; preds = %min.iters.checked
+ %bound1 = icmp ule i32* undef, %scevgep41
+ %memcheck.conflict = and i1 undef, %bound1
+ br i1 %memcheck.conflict, label %for.body6, label %vector.body.preheader
+
+vector.body.preheader: ; preds = %vector.memcheck
+ %lcmp.mod = icmp eq i64 undef, 0
+ br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.prol: ; preds = %vector.body.prol, %vector.body.preheader
+ %prol.iter.cmp = icmp eq i64 undef, 0
+ br i1 %prol.iter.cmp, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.preheader.split: ; preds = %vector.body.prol, %vector.body.preheader
+ %tmp13 = icmp ult i64 %tmp8, 24
+ br i1 %tmp13, label %middle.block, label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.body.preheader.split
+ %index = phi i64 [ %index.next.3, %vector.body ], [ 0, %vector.body.preheader.split ]
+ %index.next = add i64 %index, 8
+ %offset.idx.1 = add i64 %tmp12, %index.next
+ %tmp14 = getelementptr inbounds i32, i32* %tmp, i64 %offset.idx.1
+ %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+ %wide.load.1 = load <4 x i32>, <4 x i32>* %tmp15, align 4
+ %tmp16 = getelementptr inbounds i32, i32* %tmp1, i64 %offset.idx.1
+ %tmp17 = bitcast i32* %tmp16 to <4 x i32>*
+ store <4 x i32> %wide.load.1, <4 x i32>* %tmp17, align 4
+ %index.next.3 = add i64 %index, 32
+ br i1 undef, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body, %vector.body.preheader.split
+ br i1 undef, label %for.inc14, label %for.body6
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %k.127 = phi i32 [ %k.0, %for.body.preheader ], [ %add, %for.body ]
+ %add = add nsw i32 %k.127, 1
+ %tmp18 = load i32, i32* undef, align 4
+ store i32 %tmp18, i32* @b, align 4
+ br i1 undef, label %for.body, label %for.cond4.preheader
+
+for.body6: ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader
+ %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ]
+ %arrayidx8 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv32
+ %tmp19 = load i32, i32* %arrayidx8, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %tmp1, i64 %indvars.iv32
+ store i32 %tmp19, i32* %arrayidx10, align 4
+ %cmp5 = icmp slt i64 %indvars.iv32, undef
+ br i1 %cmp5, label %for.body6, label %for.inc14
+
+for.inc14: ; preds = %for.body6, %middle.block, %for.cond4.preheader
+ %inc15 = add nuw nsw i32 %c.0, 1
+ %indvar.next = add i32 %indvar, 1
+ br label %for.cond
+}
diff --git a/test/CodeGen/X86/i16lshr8pat.ll b/test/CodeGen/X86/i16lshr8pat.ll
new file mode 100644
index 000000000000..7f2da8e29538
--- /dev/null
+++ b/test/CodeGen/X86/i16lshr8pat.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=x86 -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; This test checks that the lshr in the %then1 block gets expanded using the
+; GR16_ABCD pattern rather than the GR32_ABCD pattern. Using the 16-bit
+; pattern avoids making the register liveness information look as if the
+; whole 32-bit register were a live value, and allows generally better
+; live-register analysis.
+; CHECK-LABEL: bb.1.then1:
+; CHECK-NOT: IMPLICIT_DEF
+; CHECK-NOT: INSERT_SUBREG
+; CHECK: sub_8bit_hi
+; CHECK-LABEL: bb.2.endif1:
+
+define i16 @foo4(i32 %prec, i8 *%dst, i16 *%src) {
+entry:
+ %cnd = icmp ne i32 %prec, 0
+ %t0 = load i16, i16 *%src, align 2
+ br i1 %cnd, label %then1, label %endif1
+
+then1:
+ %shr = lshr i16 %t0, 8
+ %conv = trunc i16 %shr to i8
+ store i8 %conv, i8 *%dst, align 1
+ br label %endif1
+
+endif1:
+ %t2 = phi i16 [0, %then1], [%t0, %entry]
+ ret i16 %t2
+}
diff --git a/test/CodeGen/X86/i386-setjmp-pic.ll b/test/CodeGen/X86/i386-setjmp-pic.ll
new file mode 100644
index 000000000000..43a8a0ec76cb
--- /dev/null
+++ b/test/CodeGen/X86/i386-setjmp-pic.ll
@@ -0,0 +1,23 @@
+; RUN: llc -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx"
+
+; Check that the register used as the base pointer for setjmp
+; is properly initialized.
+; The test used to fail with the machine verifier complaining
+; that the global base pointer was not initialized.
+; PR26742.
+;
+; CHECK: test:
+; CHECK: calll [[BP_SETUP_LABEL:L[$0-9a-zA-Z_-]+]]
+; CHECK: [[BP_SETUP_LABEL]]:
+; CHECK-NEXT: popl [[BP:%[a-z]+]]
+;
+; CHECK: leal [[BLOCK_ADDR:LBB[$0-9a-zA-Z_-]+]]-[[BP_SETUP_LABEL]]([[BP]]),
+define i32 @test(i8* %tmp) {
+entry:
+ %tmp9 = call i32 @llvm.eh.sjlj.setjmp(i8* %tmp)
+ ret i32 %tmp9
+}
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*)
diff --git a/test/CodeGen/X86/i386-shrink-wrapping.ll b/test/CodeGen/X86/i386-shrink-wrapping.ll
index 748c397143c5..2c3e384b70a6 100644
--- a/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -1,7 +1,7 @@
-; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
-; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: llc %s -o - -enable-shrink-wrap=true -no-x86-call-frame-opt | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false -no-x86-call-frame-opt | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i386-apple-macosx"
+target triple = "i386-apple-macosx10.5"
@a = common global i32 0, align 4
@d = internal unnamed_addr global i1 false
@@ -64,7 +64,7 @@ target triple = "i386-apple-macosx"
; CHECK-NEXT: cmovnel {{%[a-z]+}}, [[CONV]]
;
; Skip all the crust of vaarg lowering.
-; CHECK: calll L_varfunc$stub
+; CHECK: calll _varfunc
; Set the return value to 0.
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: addl $20, %esp
diff --git a/test/CodeGen/X86/i386-tlscall-fastregalloc.ll b/test/CodeGen/X86/i386-tlscall-fastregalloc.ll
index 775c0c1b3784..86f6f5872d0f 100644
--- a/test/CodeGen/X86/i386-tlscall-fastregalloc.ll
+++ b/test/CodeGen/X86/i386-tlscall-fastregalloc.ll
@@ -10,15 +10,20 @@ target triple = "i386-apple-macosx10.10"
; PR26485.
;
; CHECK-LABEL: f:
+; Get c.
+; C is spilled because of the scheduling of the instructions,
+; but a smarter regalloc wouldn't have spilled it.
+; CHECK: movl L_c{{[^,]*}}, [[C_ADDR:%[a-z]+]]
+; CHECK-NEXT: movl [[C_ADDR]], [[C_SPILLED:[0-8]+\(%esp\)]]
; Get p.
-; CHECK: movl _p@{{[0-9a-zA-Z]+}}, [[P_ADDR:%[a-z]+]]
+; CHECK-NEXT: movl _p@{{[0-9a-zA-Z]+}}, [[P_ADDR:%[a-z]+]]
; CHECK-NEXT: calll *([[P_ADDR]])
; At this point eax contains the address of p.
; Load c address.
; Make sure we do not clobber eax.
-; CHECK-NEXT: movl L_c{{[^,]*}}, [[C_ADDR:%e[b-z]x+]]
+; CHECK-NEXT: movl [[C_SPILLED]], [[C_ADDR_RELOADED:%e[b-z]x+]]
; Store c address into p.
-; CHECK-NEXT: movl [[C_ADDR]], (%eax)
+; CHECK-NEXT: movl [[C_ADDR_RELOADED]], (%eax)
define void @f() #0 {
entry:
store i8* @c, i8** @p, align 4
diff --git a/test/CodeGen/X86/i686-win-shrink-wrapping.ll b/test/CodeGen/X86/i686-win-shrink-wrapping.ll
new file mode 100644
index 000000000000..1a2cb8476623
--- /dev/null
+++ b/test/CodeGen/X86/i686-win-shrink-wrapping.ll
@@ -0,0 +1,44 @@
+; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc18.0.0"
+
+%struct.S = type { i32 }
+
+; Check that we do not use a basic block that has EFLAGS as live-in
+; if we need to realign the stack.
+; PR27531.
+; CHECK-LABEL: stackRealignment:
+; Prologue code.
+; CHECK: pushl
+; Make sure we actually perform some stack realignment.
+; CHECK: andl ${{[-0-9]+}}, %esp
+; This is the end of the entry block.
+; The prologue should have happened before that point because past
+; this point, EFLAGS is live.
+; CHECK: jg
+define x86_thiscallcc void @stackRealignment(%struct.S* %this) {
+entry:
+ %data = alloca [1 x i32], align 4
+ %d = alloca double, align 8
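+  ; With the stack aligned to only 4 bytes on this target (S32 in the
+  ; datalayout), the 8-byte-aligned alloca of %d is what forces the
+  ; realignment checked above.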
+ %tmp = bitcast [1 x i32]* %data to i8*
+ %arrayinit.begin = getelementptr inbounds [1 x i32], [1 x i32]* %data, i32 0, i32 0
+ %x_ = getelementptr inbounds %struct.S, %struct.S* %this, i32 0, i32 0
+ %tmp1 = load i32, i32* %x_, align 4
+ %cmp = icmp sgt i32 %tmp1, 32
+ %cond = select i1 %cmp, i32 42, i32 128
+ store i32 %cond, i32* %arrayinit.begin, align 4
+ %cmp3 = icmp slt i32 %tmp1, 32
+ br i1 %cmp3, label %cleanup, label %if.end
+
+if.end: ; preds = %entry
+ %tmp2 = bitcast double* %d to i8*
+ call x86_thiscallcc void @bar(%struct.S* nonnull %this, i32* %arrayinit.begin, double* nonnull %d)
+ br label %cleanup
+
+cleanup: ; preds = %if.end, %entry
+ ret void
+}
+
+; Function Attrs: optsize
+declare x86_thiscallcc void @bar(%struct.S*, i32*, double*)
diff --git a/test/CodeGen/X86/ifunc-asm.ll b/test/CodeGen/X86/ifunc-asm.ll
new file mode 100644
index 000000000000..b65ba86a4f1a
--- /dev/null
+++ b/test/CodeGen/X86/ifunc-asm.ll
@@ -0,0 +1,15 @@
+; RUN: llvm-as < %s -o - | llc -filetype=asm | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal i64 @foo_ifunc() {
+entry:
+ ret i64 0
+}
+; CHECK: .type foo_ifunc,@function
+; CHECK-NEXT: foo_ifunc:
+
+@foo = ifunc i32 (i32), i64 ()* @foo_ifunc
+; CHECK: .type foo,@function
+; CHECK-NEXT: .type foo,@gnu_indirect_function
+; CHECK-NEXT: foo = foo_ifunc
diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll
index 8b905f5d23b6..9a8a3a4369d3 100644
--- a/test/CodeGen/X86/implicit-null-check.ll
+++ b/test/CodeGen/X86/implicit-null-check.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-implicit-null-checks \
; RUN: | llvm-mc -triple x86_64-apple-macosx -filetype=obj -o - \
@@ -12,10 +12,10 @@
define i32 @imp_null_check_load(i32* %x) {
; CHECK-LABEL: _imp_null_check_load:
-; CHECK: Ltmp1:
+; CHECK: [[BB0_imp_null_check_load:L[^:]+]]:
; CHECK: movl (%rdi), %eax
; CHECK: retq
-; CHECK: Ltmp0:
+; CHECK: [[BB1_imp_null_check_load:LBB0_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
@@ -33,10 +33,10 @@ define i32 @imp_null_check_load(i32* %x) {
define i32 @imp_null_check_gep_load(i32* %x) {
; CHECK-LABEL: _imp_null_check_gep_load:
-; CHECK: Ltmp3:
+; CHECK: [[BB0_imp_null_check_gep_load:L[^:]+]]:
; CHECK: movl 128(%rdi), %eax
; CHECK: retq
-; CHECK: Ltmp2:
+; CHECK: [[BB1_imp_null_check_gep_load:LBB1_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
@@ -55,11 +55,11 @@ define i32 @imp_null_check_gep_load(i32* %x) {
define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; CHECK-LABEL: _imp_null_check_add_result:
-; CHECK: Ltmp5:
+; CHECK: [[BB0_imp_null_check_add_result:L[^:]+]]:
; CHECK: addl (%rdi), %esi
; CHECK: movl %esi, %eax
; CHECK: retq
-; CHECK: Ltmp4:
+; CHECK: [[BB1_imp_null_check_add_result:LBB2_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
@@ -78,12 +78,12 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z) {
; CHECK-LABEL: _imp_null_check_hoist_over_unrelated_load:
-; CHECK: Ltmp7:
+; CHECK: [[BB0_imp_null_check_hoist_over_unrelated_load:L[^:]+]]:
; CHECK: movl (%rdi), %eax
; CHECK: movl (%rsi), %ecx
; CHECK: movl %ecx, (%rdx)
; CHECK: retq
-; CHECK: Ltmp6:
+; CHECK: [[BB1_imp_null_check_hoist_over_unrelated_load:LBB3_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
@@ -103,12 +103,12 @@ define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z)
define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; CHECK-LABEL: _imp_null_check_via_mem_comparision
-; CHECK: Ltmp9:
+; CHECK: [[BB0_imp_null_check_via_mem_comparision:L[^:]+]]:
; CHECK: cmpl %esi, 4(%rdi)
; CHECK: jge LBB4_2
; CHECK: movl $100, %eax
; CHECK: retq
-; CHECK: Ltmp8:
+; CHECK: [[BB1_imp_null_check_via_mem_comparision:LBB4_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
; CHECK: LBB4_2:
@@ -158,9 +158,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp5-_imp_null_check_add_result
+; CHECK-NEXT: .long [[BB0_imp_null_check_add_result]]-_imp_null_check_add_result
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp4-_imp_null_check_add_result
+; CHECK-NEXT: .long [[BB1_imp_null_check_add_result]]-_imp_null_check_add_result
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_gep_load
@@ -171,9 +171,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp3-_imp_null_check_gep_load
+; CHECK-NEXT: .long [[BB0_imp_null_check_gep_load]]-_imp_null_check_gep_load
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp2-_imp_null_check_gep_load
+; CHECK-NEXT: .long [[BB1_imp_null_check_gep_load]]-_imp_null_check_gep_load
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_hoist_over_unrelated_load
@@ -184,9 +184,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp7-_imp_null_check_hoist_over_unrelated_load
+; CHECK-NEXT: .long [[BB0_imp_null_check_hoist_over_unrelated_load]]-_imp_null_check_hoist_over_unrelated_load
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp6-_imp_null_check_hoist_over_unrelated_load
+; CHECK-NEXT: .long [[BB1_imp_null_check_hoist_over_unrelated_load]]-_imp_null_check_hoist_over_unrelated_load
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_load
@@ -197,9 +197,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp1-_imp_null_check_load
+; CHECK-NEXT: .long [[BB0_imp_null_check_load]]-_imp_null_check_load
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp0-_imp_null_check_load
+; CHECK-NEXT: .long [[BB1_imp_null_check_load]]-_imp_null_check_load
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_via_mem_comparision
@@ -210,9 +210,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp9-_imp_null_check_via_mem_comparision
+; CHECK-NEXT: .long [[BB0_imp_null_check_via_mem_comparision]]-_imp_null_check_via_mem_comparision
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp8-_imp_null_check_via_mem_comparision
+; CHECK-NEXT: .long [[BB1_imp_null_check_via_mem_comparision]]-_imp_null_check_via_mem_comparision
; OBJDUMP: FaultMap table:
; OBJDUMP-NEXT: Version: 0x1
diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir
new file mode 100644
index 000000000000..9e83964247e7
--- /dev/null
+++ b/test/CodeGen/X86/implicit-null-checks.mir
@@ -0,0 +1,266 @@
+# RUN: llc -run-pass implicit-null-checks -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s
+
+--- |
+ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-apple-macosx"
+
+ ;; Positive test
+ define i32 @imp_null_check_with_bitwise_op_0(i32* %x, i32 %val) {
+ entry:
+ br i1 undef, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ br i1 undef, label %ret_100, label %ret_200
+
+ ret_100:
+ ret i32 100
+
+ ret_200:
+ ret i32 200
+ }
+
+ ;; Negative test. The regalloc is such that we cannot hoist the
+ ;; instruction materializing 2200000 into %eax
+ define i32 @imp_null_check_with_bitwise_op_1(i32* %x, i32 %val, i32* %ptr) {
+ entry:
+ br i1 undef, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 undef
+
+ not_null:
+ br i1 undef, label %ret_100, label %ret_200
+
+ ret_100:
+ ret i32 100
+
+ ret_200:
+ ret i32 200
+ }
+
+ ;; Negative test: IR is identical to
+ ;; @imp_null_check_with_bitwise_op_0 but MIR differs.
+ define i32 @imp_null_check_with_bitwise_op_2(i32* %x, i32 %val) {
+ entry:
+ br i1 undef, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ br i1 undef, label %ret_100, label %ret_200
+
+ ret_100:
+ ret i32 100
+
+ ret_200:
+ ret i32 200
+ }
+
+ ;; Negative test: IR is identical to
+ ;; @imp_null_check_with_bitwise_op_0 but MIR differs.
+ define i32 @imp_null_check_with_bitwise_op_3(i32* %x, i32 %val) {
+ entry:
+ br i1 undef, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ br i1 undef, label %ret_100, label %ret_200
+
+ ret_100:
+ ret i32 100
+
+ ret_200:
+ ret i32 200
+ }
+
+ !0 = !{}
+...
+---
+name: imp_null_check_with_bitwise_op_0
+# CHECK-LABEL: name: imp_null_check_with_bitwise_op_0
+alignment: 4
+allVRegsAllocated: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%esi' }
+# CHECK: bb.0.entry:
+# CHECK: %eax = MOV32ri 2200000
+# CHECK-NEXT: %eax = FAULTING_LOAD_OP %bb.3.is_null, {{[0-9]+}}, killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: JMP_1 %bb.1.not_null
+
+body: |
+ bb.0.entry:
+ successors: %bb.3.is_null, %bb.1.not_null
+ liveins: %esi, %rdi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.3.is_null, implicit %eflags
+
+ bb.1.not_null:
+ successors: %bb.4.ret_100, %bb.2.ret_200
+ liveins: %esi, %rdi
+
+ %eax = MOV32ri 2200000
+ %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ CMP32rr killed %eax, killed %esi, implicit-def %eflags
+ JE_1 %bb.4.ret_100, implicit %eflags
+
+ bb.2.ret_200:
+ %eax = MOV32ri 200
+ RET 0, %eax
+
+ bb.3.is_null:
+ %eax = MOV32ri 42
+ RET 0, %eax
+
+ bb.4.ret_100:
+ %eax = MOV32ri 100
+ RET 0, %eax
+
+...
+---
+name: imp_null_check_with_bitwise_op_1
+alignment: 4
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%esi' }
+ - { reg: '%rdx' }
+# CHECK: bb.0.entry:
+# CHECK: %eax = MOV32rm killed %rdx, 1, _, 0, _ :: (volatile load 4 from %ir.ptr)
+# CHECK-NEXT: TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+
+body: |
+ bb.0.entry:
+ successors: %bb.3.is_null, %bb.1.not_null
+ liveins: %esi, %rdi, %rdx
+
+ %eax = MOV32rm killed %rdx, 1, _, 0, _ :: (volatile load 4 from %ir.ptr)
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.3.is_null, implicit %eflags
+
+ bb.1.not_null:
+ successors: %bb.4.ret_100, %bb.2.ret_200
+ liveins: %esi, %rdi
+
+ %eax = MOV32ri 2200000
+ %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ CMP32rr killed %eax, killed %esi, implicit-def %eflags
+ JE_1 %bb.4.ret_100, implicit %eflags
+
+ bb.2.ret_200:
+ successors: %bb.3.is_null
+
+ %eax = MOV32ri 200
+
+ bb.3.is_null:
+ liveins: %eax, %ah, %al, %ax, %bh, %bl, %bp, %bpl, %bx, %eax, %ebp, %ebx, %rax, %rbp, %rbx, %r12, %r13, %r14, %r15, %r12b, %r13b, %r14b, %r15b, %r12d, %r13d, %r14d, %r15d, %r12w, %r13w, %r14w, %r15w
+
+ RET 0, %eax
+
+ bb.4.ret_100:
+ %eax = MOV32ri 100
+ RET 0, %eax
+
+...
+---
+name: imp_null_check_with_bitwise_op_2
+# CHECK-LABEL: name: imp_null_check_with_bitwise_op_2
+alignment: 4
+allVRegsAllocated: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%esi' }
+# CHECK: bb.0.entry:
+# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+
+body: |
+ bb.0.entry:
+ successors: %bb.3.is_null, %bb.1.not_null
+ liveins: %esi, %rdi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.3.is_null, implicit %eflags
+
+ bb.1.not_null:
+ successors: %bb.4.ret_100, %bb.2.ret_200
+ liveins: %esi, %rdi
+
+ %eax = MOV32ri 2200000
+ %eax = ADD32ri killed %eax, 100, implicit-def dead %eflags
+ %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ CMP32rr killed %eax, killed %esi, implicit-def %eflags
+ JE_1 %bb.4.ret_100, implicit %eflags
+
+ bb.2.ret_200:
+ %eax = MOV32ri 200
+ RET 0, %eax
+
+ bb.3.is_null:
+ %eax = MOV32ri 42
+ RET 0, %eax
+
+ bb.4.ret_100:
+ %eax = MOV32ri 100
+ RET 0, %eax
+
+...
+---
+name: imp_null_check_with_bitwise_op_3
+# CHECK-LABEL: name: imp_null_check_with_bitwise_op_3
+alignment: 4
+allVRegsAllocated: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%rsi' }
+# CHECK: bb.0.entry:
+# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+
+body: |
+ bb.0.entry:
+ successors: %bb.3.is_null, %bb.1.not_null
+ liveins: %rsi, %rdi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.3.is_null, implicit %eflags
+
+ bb.1.not_null:
+ successors: %bb.4.ret_100, %bb.2.ret_200
+ liveins: %rsi, %rdi
+
+ %rdi = MOV64ri 5000
+ %rdi = AND64rm killed %rdi, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ CMP64rr killed %rdi, killed %rsi, implicit-def %eflags
+ JE_1 %bb.4.ret_100, implicit %eflags
+
+ bb.2.ret_200:
+ %eax = MOV32ri 200
+ RET 0, %eax
+
+ bb.3.is_null:
+ %eax = MOV32ri 42
+ RET 0, %eax
+
+ bb.4.ret_100:
+ %eax = MOV32ri 100
+ RET 0, %eax
+
+...
diff --git a/test/CodeGen/X86/inalloca-ctor.ll b/test/CodeGen/X86/inalloca-ctor.ll
index eba4e72f9330..f13d537d90b8 100644
--- a/test/CodeGen/X86/inalloca-ctor.ll
+++ b/test/CodeGen/X86/inalloca-ctor.ll
@@ -12,8 +12,8 @@ define void @g() {
entry:
%args = alloca inalloca %frame
%c = getelementptr %frame, %frame* %args, i32 0, i32 2
-; CHECK: movl $20, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: subl $16, %esp
; CHECK: movl %esp,
call void @Foo_ctor(%Foo* %c)
; CHECK: leal 12(%{{.*}}),
diff --git a/test/CodeGen/X86/inalloca-invoke.ll b/test/CodeGen/X86/inalloca-invoke.ll
index 9a184e563b19..d90e5012ba45 100644
--- a/test/CodeGen/X86/inalloca-invoke.ll
+++ b/test/CodeGen/X86/inalloca-invoke.ll
@@ -21,7 +21,8 @@ blah:
%beg = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 0
%end = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 1
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: subl $20, %esp
; CHECK: movl %esp, %[[beg:[^ ]*]]
; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll
index 4f7e4092a99c..69d94d8bfa74 100644
--- a/test/CodeGen/X86/inalloca-stdcall.ll
+++ b/test/CodeGen/X86/inalloca-stdcall.ll
@@ -8,8 +8,8 @@ declare x86_stdcallcc void @i(i32 %a)
define void @g() {
; CHECK-LABEL: _g:
%b = alloca inalloca %Foo
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1
diff --git a/test/CodeGen/X86/inalloca.ll b/test/CodeGen/X86/inalloca.ll
index e523c945a69f..134de2f58dda 100644
--- a/test/CodeGen/X86/inalloca.ll
+++ b/test/CodeGen/X86/inalloca.ll
@@ -8,8 +8,8 @@ define void @a() {
; CHECK-LABEL: _a:
entry:
%b = alloca inalloca %Foo
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1
@@ -28,8 +28,8 @@ define void @b() {
; CHECK-LABEL: _b:
entry:
%b = alloca inalloca %Foo
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1
@@ -49,8 +49,8 @@ define void @c() {
; CHECK-LABEL: _c:
entry:
%b = alloca inalloca %Foo
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1
diff --git a/test/CodeGen/X86/indirect-hidden.ll b/test/CodeGen/X86/indirect-hidden.ll
index 9e1b7d373554..5f3885d00e5f 100644
--- a/test/CodeGen/X86/indirect-hidden.ll
+++ b/test/CodeGen/X86/indirect-hidden.ll
@@ -35,9 +35,9 @@ declare i32 @__gxx_personality_v0(...)
; CHECK: .section __IMPORT,__pointers,non_lazy_symbol_pointers
; CHECK-NOT: __DATA,__data
-; CHECK: .indirect_symbol _normal_typeid
+; CHECK: .indirect_symbol _hidden_typeid
; CHECK-NEXT: .long 0
; CHECK-NOT: __DATA,__data
-; CHECK: .indirect_symbol _hidden_typeid
+; CHECK: .indirect_symbol _normal_typeid
; CHECK-NEXT: .long 0
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
index 4e582de22a1f..65c1c0957adf 100644
--- a/test/CodeGen/X86/insertelement-zero.ll
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -10,37 +10,72 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
-; SSE-LABEL: insert_v2f64_z1:
-; SSE: # BB#0:
-; SSE-NEXT: xorpd %xmm1, %xmm1
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_v2f64_z1:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v2f64_z1:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorpd %xmm1, %xmm1
+; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v2f64_z1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorpd %xmm1, %xmm1
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v2f64_z1:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorpd %xmm1, %xmm1
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v2f64_z1:
; AVX: # BB#0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = insertelement <2 x double> %a, double 0.0, i32 0
ret <2 x double> %1
}
define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
-; SSE-LABEL: insert_v4f64_0zz3:
-; SSE: # BB#0:
-; SSE-NEXT: xorpd %xmm2, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_v4f64_0zz3:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4f64_0zz3:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorpd %xmm2, %xmm2
+; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4f64_0zz3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorpd %xmm2, %xmm2
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4f64_0zz3:
+; SSE41: # BB#0:
+; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE41-NEXT: xorpd %xmm2, %xmm2
+; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4f64_0zz3:
; AVX: # BB#0:
-; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX-NEXT: retq
%1 = insertelement <4 x double> %a, double 0.0, i32 1
%2 = insertelement <4 x double> %1, double 0.0, i32 2
@@ -68,15 +103,21 @@ define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
;
; SSE41-LABEL: insert_v2i64_z1:
; SSE41: # BB#0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrq $0, %rax, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_v2i64_z1:
-; AVX: # BB#0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_v2i64_z1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v2i64_z1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
%1 = insertelement <2 x i64> %a, i64 0, i32 0
ret <2 x i64> %1
}
@@ -102,24 +143,20 @@ define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
;
; SSE41-LABEL: insert_v4i64_01z3:
; SSE41: # BB#0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrq $0, %rax, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v4i64_01z3:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v4i64_01z3:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
%1 = insertelement <4 x i64> %a, i64 0, i32 2
ret <4 x i64> %1
@@ -150,13 +187,13 @@ define <4 x float> @insert_v4f32_01z3(<4 x float> %a) {
; SSE41-LABEL: insert_v4f32_01z3:
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4f32_01z3:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
%1 = insertelement <4 x float> %a, float 0.0, i32 2
ret <4 x float> %1
@@ -191,16 +228,13 @@ define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v8f32_z12345z7:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
; AVX-NEXT: retq
%1 = insertelement <8 x float> %a, float 0.0, i32 0
%2 = insertelement <8 x float> %1, float 0.0, i32 6
@@ -234,15 +268,21 @@ define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) {
;
; SSE41-LABEL: insert_v4i32_01z3:
; SSE41: # BB#0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_v4i32_01z3:
-; AVX: # BB#0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_v4i32_01z3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v4i32_01z3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT: retq
%1 = insertelement <4 x i32> %a, i32 0, i32 2
ret <4 x i32> %1
}
@@ -280,29 +320,21 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
;
; SSE41-LABEL: insert_v8i32_z12345z7:
; SSE41: # BB#0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrd $0, %eax, %xmm0
-; SSE41-NEXT: pinsrd $2, %eax, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v8i32_z12345z7:
; AVX1: # BB#0:
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v8i32_z12345z7:
; AVX2: # BB#0:
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
; AVX2-NEXT: retq
%1 = insertelement <8 x i32> %a, i32 0, i32 0
%2 = insertelement <8 x i32> %1, i32 0, i32 6
@@ -310,18 +342,37 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
}
define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
-; SSE-LABEL: insert_v8i16_z12345z7:
-; SSE: # BB#0:
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: pinsrw $0, %eax, %xmm0
-; SSE-NEXT: pinsrw $6, %eax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_v8i16_z12345z7:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: pinsrw $0, %eax, %xmm0
+; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v8i16_z12345z7:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v8i16_z12345z7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v8i16_z12345z7:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v8i16_z12345z7:
; AVX: # BB#0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> %a, i16 0, i32 0
%2 = insertelement <8 x i16> %1, i16 0, i32 6
@@ -329,35 +380,58 @@ define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
}
define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
-; SSE-LABEL: insert_v16i16_z12345z789ABZDEz:
-; SSE: # BB#0:
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: pinsrw $0, %eax, %xmm0
-; SSE-NEXT: pinsrw $6, %eax, %xmm0
-; SSE-NEXT: pinsrw $7, %eax, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: pinsrw $0, %eax, %xmm0
+; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: pinsrw $7, %eax, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSE3-NEXT: pinsrw $7, %eax, %xmm1
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $7, %eax, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5],xmm2[6],xmm0[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
+; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v16i16_z12345z789ABZDEz:
; AVX1: # BB#0:
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v16i16_z12345z789ABZDEz:
; AVX2: # BB#0:
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%1 = insertelement <16 x i16> %a, i16 0, i32 0
diff --git a/test/CodeGen/X86/insertps-combine.ll b/test/CodeGen/X86/insertps-combine.ll
index f2596b6347b9..b21fdec624bc 100644
--- a/test/CodeGen/X86/insertps-combine.ll
+++ b/test/CodeGen/X86/insertps-combine.ll
@@ -6,16 +6,12 @@
define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuffle_v4f32_0z27:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z27:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -28,16 +24,12 @@ define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0zz4:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz4:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %xyzw, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -50,16 +42,12 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0z24:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %xyzw, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -72,17 +60,12 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
define <4 x float> @shuffle_v4f32_0zz0(float %a) {
; SSE-LABEL: shuffle_v4f32_0zz0:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz0:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; AVX-NEXT: retq
%vecinit = insertelement <4 x float> undef, float %a, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
@@ -110,6 +93,132 @@ define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
ret <4 x float> %vecinit4
}
+define <4 x float> @insertps_undef_input0(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: insertps_undef_input0:
+; SSE: # BB#0:
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_undef_input0:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
+; AVX-NEXT: retq
+ %res0 = fadd <4 x float> %a0, <float 1.0, float 1.0, float 1.0, float 1.0>
+ %res1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %res0, <4 x float> %a1, i8 21)
+ %res2 = shufflevector <4 x float> %res1, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x float> %res2
+}
+
+define <4 x float> @insertps_undef_input1(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: insertps_undef_input1:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_undef_input1:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT: retq
+ %res0 = fadd <4 x float> %a1, <float 1.0, float 1.0, float 1.0, float 1.0>
+ %res1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %res0, i8 21)
+ %res2 = shufflevector <4 x float> %res1, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x float> %res2
+}
+
+define <4 x float> @insertps_zero_from_v2f64(<4 x float> %a0, <2 x double>* %a1) nounwind {
+; SSE-LABEL: insertps_zero_from_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: addpd {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movapd %xmm1, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_zero_from_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovapd %xmm1, (%rdi)
+; AVX-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %a1
+ %2 = bitcast <2 x double> <double 1.0, double 2.0> to <4 x float>
+ %3 = fadd <2 x double> %1, <double 1.0, double 2.0>
+ %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 6, i32 2, i32 2, i32 3>
+ store <2 x double> %3, <2 x double> *%a1
+ ret <4 x float> %4
+}
+
+define <4 x float> @insertps_zero_from_v2i64(<4 x float> %a0, <2 x i64>* %a1) nounwind {
+; SSE-LABEL: insertps_zero_from_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movdqa %xmm1, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_zero_from_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %a1
+ %2 = bitcast <2 x i64> <i64 1, i64 -2> to <4 x float>
+ %3 = add <2 x i64> %1, <i64 1, i64 -2>
+ %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 5, i32 2, i32 2, i32 3>
+ store <2 x i64> %3, <2 x i64> *%a1
+ ret <4 x float> %4
+}
+
+define <4 x float> @insertps_zero_from_v8i16(<4 x float> %a0, <8 x i16>* %a1) nounwind {
+; SSE-LABEL: insertps_zero_from_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movdqa %xmm1, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_zero_from_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a1
+ %2 = bitcast <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3> to <4 x float>
+ %3 = add <8 x i16> %1, <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3>
+ %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 4, i32 2, i32 2, i32 3>
+ store <8 x i16> %3, <8 x i16> *%a1
+ ret <4 x float> %4
+}
+
+define <4 x float> @consecutive_load_insertps_04zz(float* %p) {
+; SSE-LABEL: consecutive_load_insertps_04zz:
+; SSE: # BB#0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: consecutive_load_insertps_04zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+ %p0 = getelementptr inbounds float, float* %p, i64 1
+ %p1 = getelementptr inbounds float, float* %p, i64 2
+ %s0 = load float, float* %p0
+ %s1 = load float, float* %p1
+ %v0 = insertelement <4 x float> undef, float %s0, i32 0
+ %v1 = insertelement <4 x float> undef, float %s1, i32 0
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v0, <4 x float> %v1, i8 28)
+ ret <4 x float> %res
+}
+
define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: extract_zero_insertps_z0z7:
; SSE: # BB#0:
diff --git a/test/CodeGen/X86/interval-update-remat.ll b/test/CodeGen/X86/interval-update-remat.ll
new file mode 100644
index 000000000000..4e80e34c9479
--- /dev/null
+++ b/test/CodeGen/X86/interval-update-remat.ll
@@ -0,0 +1,161 @@
+; RUN: llc -verify-regalloc -verify-machineinstrs < %s
+; PR27275: When enabling remat for a vreg defined by PHIs, make sure the
+; update of the live range removes the dead PHI. Otherwise, we may end up
+; with PHIs with incorrect operands, which will trigger assertions or
+; verifier failures in later passes.
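+; There are no CHECK lines; the test only needs to compile cleanly under
+; -verify-regalloc and -verify-machineinstrs.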
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+@b = external global i64, align 8
+@d = external global i32, align 4
+@e = external global i64, align 8
+@h = external global i16, align 2
+@a = external global i8, align 1
+@g = external global i64, align 8
+@j = external global i32, align 4
+@f = external global i16, align 2
+@.str = external unnamed_addr constant [12 x i8], align 1
+
+define void @fn1() {
+entry:
+ %tmp = load i64, i64* @b, align 8
+ %or = or i64 0, 3299921317
+ %and = and i64 %or, %tmp
+ %tmp1 = load i32, i32* @d, align 4
+ br i1 undef, label %lor.rhs, label %lor.end
+
+lor.rhs: ; preds = %entry
+ %tobool3 = icmp ne i8 undef, 0
+ br label %lor.end
+
+lor.end: ; preds = %lor.rhs, %entry
+ %lor.ext = zext i1 undef to i32
+ %tmp2 = load i64, i64* @e, align 8
+ br i1 undef, label %lor.rhs5, label %lor.end7
+
+lor.rhs5: ; preds = %lor.end
+ br label %lor.end7
+
+lor.end7: ; preds = %lor.rhs5, %lor.end
+ %tmp3 = phi i1 [ true, %lor.end ], [ false, %lor.rhs5 ]
+ %neg13 = xor i64 %tmp, -1
+ %conv25 = zext i1 %tmp3 to i32
+ %tobool46 = icmp eq i64 %tmp, 0
+ %.pre = load i16, i16* @h, align 2
+ %tobool10 = icmp eq i16 %.pre, 0
+ %neg.us = xor i32 %tmp1, -1
+ %conv12.us = sext i32 %neg.us to i64
+ %tobool23.us = icmp eq i64 %tmp2, %and
+ %conv39.us = sext i32 %tmp1 to i64
+ br label %LABEL_mSmSDb
+
+LABEL_mSmSDb.loopexit: ; preds = %lor.end32.us
+ %conv42.us.lcssa = phi i32 [ %conv42.us, %lor.end32.us ]
+ store i64 undef, i64* @g, align 8
+ br label %LABEL_mSmSDb
+
+LABEL_mSmSDb: ; preds = %LABEL_mSmSDb.loopexit, %lor.end7
+ %tmp4 = phi i32 [ undef, %lor.end7 ], [ %conv42.us.lcssa, %LABEL_mSmSDb.loopexit ]
+ %tmp5 = phi i64 [ %tmp, %lor.end7 ], [ 0, %LABEL_mSmSDb.loopexit ]
+ br i1 %tobool10, label %LABEL_BRBRN.preheader, label %if.then
+
+if.then: ; preds = %LABEL_mSmSDb
+ store i8 undef, i8* @a, align 1
+ br label %LABEL_BRBRN.preheader
+
+LABEL_BRBRN.preheader: ; preds = %if.then, %LABEL_mSmSDb
+ %.pre63 = load i64, i64* @g, align 8
+ br i1 %tobool46, label %LABEL_BRBRN.us, label %LABEL_BRBRN.outer
+
+LABEL_BRBRN.outer: ; preds = %if.then47, %LABEL_BRBRN.preheader
+ %.ph = phi i32 [ 0, %if.then47 ], [ %tmp4, %LABEL_BRBRN.preheader ]
+ %.ph64 = phi i32 [ %conv50, %if.then47 ], [ %tmp1, %LABEL_BRBRN.preheader ]
+ %.ph65 = phi i64 [ %tmp16, %if.then47 ], [ %.pre63, %LABEL_BRBRN.preheader ]
+ %.ph66 = phi i64 [ 0, %if.then47 ], [ %tmp2, %LABEL_BRBRN.preheader ]
+ %.ph67 = phi i64 [ %.pre56.pre, %if.then47 ], [ %tmp5, %LABEL_BRBRN.preheader ]
+ %neg = xor i32 %.ph64, -1
+ %conv12 = sext i32 %neg to i64
+ %tobool23 = icmp eq i64 %.ph66, %and
+ %tmp6 = load i32, i32* @j, align 4
+ %shr = lshr i32 %conv25, %tmp6
+ %conv39 = sext i32 %.ph64 to i64
+ br label %LABEL_BRBRN
+
+LABEL_BRBRN.us: ; preds = %lor.end32.us, %LABEL_BRBRN.preheader
+ %tmp7 = phi i32 [ %conv42.us, %lor.end32.us ], [ %tmp4, %LABEL_BRBRN.preheader ]
+ %tmp8 = phi i64 [ undef, %lor.end32.us ], [ %.pre63, %LABEL_BRBRN.preheader ]
+ %tmp9 = phi i64 [ %tmp10, %lor.end32.us ], [ %tmp5, %LABEL_BRBRN.preheader ]
+ %mul.us = mul i64 %tmp8, %neg13
+ %mul14.us = mul i64 %mul.us, %conv12.us
+ %cmp.us = icmp sgt i64 %tmp2, %mul14.us
+ %conv16.us = zext i1 %cmp.us to i64
+ %xor.us = xor i64 %conv16.us, %tmp9
+ %rem18.us = urem i32 %lor.ext, %tmp7
+ %conv19.us = zext i32 %rem18.us to i64
+ br i1 %tobool23.us, label %lor.rhs24.us, label %lor.end32.us
+
+lor.rhs24.us: ; preds = %LABEL_BRBRN.us
+ br label %lor.end32.us
+
+lor.end32.us: ; preds = %lor.rhs24.us, %LABEL_BRBRN.us
+ %tmp10 = phi i64 [ -2, %LABEL_BRBRN.us ], [ -1, %lor.rhs24.us ]
+ %xor.us.not = xor i64 %xor.us, -1
+ %neg36.us = and i64 %conv19.us, %xor.us.not
+ %conv37.us = zext i32 %tmp7 to i64
+ %sub38.us = sub nsw i64 %neg36.us, %conv37.us
+ %mul40.us = mul nsw i64 %sub38.us, %conv39.us
+ %neg41.us = xor i64 %mul40.us, 4294967295
+ %conv42.us = trunc i64 %neg41.us to i32
+ %tobool43.us = icmp eq i8 undef, 0
+ br i1 %tobool43.us, label %LABEL_mSmSDb.loopexit, label %LABEL_BRBRN.us
+
+LABEL_BRBRN: ; preds = %lor.end32, %LABEL_BRBRN.outer
+ %tmp11 = phi i32 [ %conv42, %lor.end32 ], [ %.ph, %LABEL_BRBRN.outer ]
+ %tmp12 = phi i64 [ %neg21, %lor.end32 ], [ %.ph65, %LABEL_BRBRN.outer ]
+ %tmp13 = phi i64 [ %conv35, %lor.end32 ], [ %.ph67, %LABEL_BRBRN.outer ]
+ %mul = mul i64 %tmp12, %neg13
+ %mul14 = mul i64 %mul, %conv12
+ %cmp = icmp sgt i64 %.ph66, %mul14
+ %conv16 = zext i1 %cmp to i64
+ %xor = xor i64 %conv16, %tmp13
+ %rem18 = urem i32 %lor.ext, %tmp11
+ %conv19 = zext i32 %rem18 to i64
+ %neg21 = or i64 %xor, undef
+ br i1 %tobool23, label %lor.rhs24, label %lor.end32
+
+lor.rhs24: ; preds = %LABEL_BRBRN
+ %tmp14 = load volatile i16, i16* @f, align 2
+ %conv26 = sext i16 %tmp14 to i32
+ %and27 = and i32 %conv26, %shr
+ %conv28 = sext i32 %and27 to i64
+ %mul29 = mul nsw i64 %conv28, %tmp
+ %and30 = and i64 %mul29, %tmp13
+ %tobool31 = icmp ne i64 %and30, 0
+ br label %lor.end32
+
+lor.end32: ; preds = %lor.rhs24, %LABEL_BRBRN
+ %tmp15 = phi i1 [ true, %LABEL_BRBRN ], [ %tobool31, %lor.rhs24 ]
+ %lor.ext33 = zext i1 %tmp15 to i32
+ %neg34 = xor i32 %lor.ext33, -1
+ %conv35 = sext i32 %neg34 to i64
+ %xor.not = xor i64 %xor, -1
+ %neg36 = and i64 %conv19, %xor.not
+ %conv37 = zext i32 %tmp11 to i64
+ %sub38 = sub nsw i64 %neg36, %conv37
+ %mul40 = mul nsw i64 %sub38, %conv39
+ %neg41 = xor i64 %mul40, 4294967295
+ %conv42 = trunc i64 %neg41 to i32
+ %tobool43 = icmp eq i8 undef, 0
+ br i1 %tobool43, label %if.then47, label %LABEL_BRBRN
+
+if.then47: ; preds = %lor.end32
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i64 %conv39)
+ %tmp16 = load i64, i64* @g, align 8
+ %neg49 = xor i64 %tmp16, 4294967295
+ %conv50 = trunc i64 %neg49 to i32
+ %.pre56.pre = load i64, i64* @b, align 8
+ br label %LABEL_BRBRN.outer
+}
+
+declare void @printf(i8* nocapture readonly, ...)
diff --git a/test/CodeGen/X86/ipra-inline-asm.ll b/test/CodeGen/X86/ipra-inline-asm.ll
new file mode 100644
index 000000000000..e70b149e19e1
--- /dev/null
+++ b/test/CodeGen/X86/ipra-inline-asm.ll
@@ -0,0 +1,20 @@
+; RUN: llc -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+; Verify that bar does not clobber anything
+; CHECK-NOT: bar Clobbered Registers:{{.+}}
+; CHECK: bar Clobbered Registers:
+define void @bar() #0 {
+ ret void
+}
+
+; Verify that inline assembly is handled correctly by listing the clobbered registers.
+; CHECK: foo Clobbered Registers: AH AL AX CH CL CX DI DIL EAX ECX EDI RAX RCX RDI
+define void @foo() #0 {
+ call void asm sideeffect "", "~{eax},~{ecx},~{edi}"() #0
+ ret void
+}
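+
+; Note that the clobber list includes register aliases: ~{eax} shows up as
+; AH AL AX EAX RAX, ~{ecx} as CH CL CX ECX RCX, and ~{edi} as DI DIL EDI RDI.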
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/ipra-local-linkage.ll b/test/CodeGen/X86/ipra-local-linkage.ll
new file mode 100644
index 000000000000..a394ed3e3858
--- /dev/null
+++ b/test/CodeGen/X86/ipra-local-linkage.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s | FileCheck %s -check-prefix=NOIPRA
+; RUN: llc -enable-ipra < %s | FileCheck %s
+
+target triple = "x86_64--"
+
+define internal void @foo() norecurse {
+; When IPRA is not enabled, R15 will be saved by foo as it is a callee-saved register.
+; NOIPRA-LABEL: foo:
+; NOIPRA: pushq %r15
+; When IPRA is enabled, no register should be saved: foo() is a local
+; function, so we can optimize it to save nothing.
+; CHECK-LABEL: foo:
+; CHECK-NOT: pushq %r15
+ call void asm sideeffect "movl %r14d, %r15d", "~{r15}"()
+ ret void
+}
+
+define void @bar(i32 %X) {
+ call void asm sideeffect "movl %r12d, $0", "{r15}~{r12}"(i32 %X)
+  ; As R15 is clobbered by foo() when IPRA is enabled, the value in R15 must be
+  ; saved and reloaded around the call if the register holding the original
+  ; value is also clobbered; here the original value is loaded back into R15D
+  ; after the call to foo (see the rough sketch after this function).
+ call void @foo()
+ ; CHECK-LABEL: bar:
+ ; CHECK: callq foo
+ ; CHECK-NEXT: movl %eax, %r15d
+ call void asm sideeffect "movl $0, %r12d", "{r15}~{r12}"(i32 %X)
+ ret void
+}
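+
+; A rough C-level sketch of the scenario in @bar above (hypothetical source,
+; for illustration only):
+;   void bar(int X) {
+;     /* first asm: X is passed in r15, r12 is clobbered          */
+;     foo();   /* with IPRA, foo() is known to clobber r15        */
+;     /* so X must be reloaded into r15 before the second asm     */
+;   }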
diff --git a/test/CodeGen/X86/ipra-reg-usage.ll b/test/CodeGen/X86/ipra-reg-usage.ll
new file mode 100644
index 000000000000..ca97472bb820
--- /dev/null
+++ b/test/CodeGen/X86/ipra-reg-usage.ll
@@ -0,0 +1,12 @@
+; RUN: llc -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
+
+target triple = "x86_64-unknown-unknown"
+declare void @bar1()
+define preserve_allcc void @foo()#0 {
+; CHECK: foo Clobbered Registers: CS DS EFLAGS EIP EIZ ES FPSW FS GS IP RIP RIZ SS BND0 BND1 BND2 BND3 CR0 CR1 CR2 CR3 CR4 CR5 CR6 CR7 CR8 CR9 CR10 CR11 CR12 CR13 CR14 CR15 DR0 DR1 DR2 DR3 DR4 DR5 DR6 DR7 DR8 DR9 DR10 DR11 DR12 DR13 DR14 DR15 FP0 FP1 FP2 FP3 FP4 FP5 FP6 FP7 K0 K1 K2 K3 K4 K5 K6 K7 MM0 MM1 MM2 MM3 MM4 MM5 MM6 MM7 R11 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7 XMM16 XMM17 XMM18 XMM19 XMM20 XMM21 XMM22 XMM23 XMM24 XMM25 XMM26 XMM27 XMM28 XMM29 XMM30 XMM31 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15 YMM16 YMM17 YMM18 YMM19 YMM20 YMM21 YMM22 YMM23 YMM24 YMM25 YMM26 YMM27 YMM28 YMM29 YMM30 YMM31 ZMM0 ZMM1 ZMM2 ZMM3 ZMM4 ZMM5 ZMM6 ZMM7 ZMM8 ZMM9 ZMM10 ZMM11 ZMM12 ZMM13 ZMM14 ZMM15 ZMM16 ZMM17 ZMM18 ZMM19 ZMM20 ZMM21 ZMM22 ZMM23 ZMM24 ZMM25 ZMM26 ZMM27 ZMM28 ZMM29 ZMM30 ZMM31 R11B R11D R11W
+ call void @bar1()
+ call void @bar2()
+ ret void
+}
+declare void @bar2()
+attributes #0 = {nounwind}
diff --git a/test/CodeGen/X86/ipra-transform.ll b/test/CodeGen/X86/ipra-transform.ll
new file mode 100644
index 000000000000..362af8812346
--- /dev/null
+++ b/test/CodeGen/X86/ipra-transform.ll
@@ -0,0 +1,32 @@
+
+; RUN: llc < %s | FileCheck %s -check-prefix=NOIPRA
+; RUN: llc -enable-ipra < %s | FileCheck %s
+
+
+target triple = "x86_64-unknown-unknown"
+define void @bar1() {
+ ret void
+}
+define preserve_allcc void @foo()#0 {
+; Due to preserve_allcc, foo() will save some registers at its start;
+; the NOIPRA prefix verifies that.
+; NOIPRA-LABEL: foo:
+; NOIPRA: pushq %r10
+; NOIPRA-NEXT: pushq %r9
+; NOIPRA-NEXT: pushq %r8
+; NOIPRA: callq bar1
+; When IPRA is enabled, the registers above will not be saved; that is
+; verified by the CHECK prefix.
+; CHECK: foo:
+; CHECK-NOT: pushq %r10
+; CHECK-NOT: pushq %r9
+; CHECK-NOT: pushq %r8
+; CHECK: callq bar1
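+; Without IPRA, foo must save these registers because preserve_allcc promises
+; to preserve them and the plain calls below would otherwise be assumed to
+; clobber them; with IPRA, bar1 and bar2 are known to clobber nothing, so the
+; saves can be dropped.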
+ call void @bar1()
+ call void @bar2()
+ ret void
+}
+define void @bar2() {
+ ret void
+}
+attributes #0 = {nounwind}
diff --git a/test/CodeGen/X86/lakemont.ll b/test/CodeGen/X86/lakemont.ll
new file mode 100644
index 000000000000..ddd24525f27a
--- /dev/null
+++ b/test/CodeGen/X86/lakemont.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86 -mcpu=lakemont | FileCheck %s
+
+; Make sure -mcpu=lakemont implies soft floats.
+define float @test(float %a, float %b) nounwind readnone {
+; CHECK-LABEL: test:
+; CHECK: __addsf3
+ %add = fadd float %a, %b
+ ret float %add
+}
diff --git a/test/CodeGen/X86/lea-opt-memop-check-1.ll b/test/CodeGen/X86/lea-opt-memop-check-1.ll
new file mode 100644
index 000000000000..08e510772a88
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=x86 -mtriple=i686-pc-win32 | FileCheck %s
+
+; PR26575
+; Assertion `(Disp->isImm() || Disp->isGlobal()) && (Other.Disp->isImm() || Other.Disp->isGlobal()) && "Address displacement operand is always an immediate or a global"' failed.
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) argmemonly nounwind
+declare <2 x i64> @_mm_xor_si128(<2 x i64>, <2 x i64>) optsize
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+declare <4 x float> @_mm_castsi128_ps(<2 x i64>) optsize
+
+; Check that the LEA optimization pass works with CPI address displacements.
+define void @test1(i8* nocapture readonly %src, i32 %len) #0 {
+ %parts = alloca [4 x i32], align 4
+ %part0 = bitcast [4 x i32]* %parts to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %part0, i8* %src, i32 %len, i32 1, i1 false)
+ %call0 = tail call <2 x i64> @_mm_xor_si128(<2 x i64> undef, <2 x i64> <i64 -9187201950435737472, i64 -9187201950435737472>)
+ %tmp0 = tail call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> undef, <2 x i64> <i64 7631803798, i64 5708721108>, i8 16)
+ %call1 = tail call <4 x float> @_mm_castsi128_ps(<2 x i64> %tmp0)
+ ret void
+; CHECK-LABEL: test1:
+; CHECK: leal{{.*}}
+; CHECK: calll _memcpy
+; CHECK: movaps __xmm@{{[0-9a-f]+}}, %xmm1
+; CHECK: calll __mm_xor_si128
+; CHECK: pclmulqdq $16, __xmm@{{[0-9a-f]+}}, %xmm0
+; CHECK: jmp __mm_castsi128_ps
+}
+
+declare i32 @GetLastError(...)
+declare void @IsolationAwareDeactivateActCtx(i32, i32)
+declare i8* @llvm.localaddress()
+declare void @llvm.localescape(...)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+
+@IsolationAwarePrivateT_SqbjaYRiRY = common global i32 0, align 4
+
+; Check that the MCSymbol objects are created to be used in "\01?fin$0@0@test2@@".
+define void @test2() #0 {
+entry:
+ %fActivateActCtxSuccess = alloca i32, align 4
+ %proc = alloca i32, align 4
+ %ulpCookie = alloca i32, align 4
+ call void (...) @llvm.localescape(i32* nonnull %fActivateActCtxSuccess, i32* nonnull %proc, i32* nonnull %ulpCookie)
+ %tmp0 = tail call i8* @llvm.localaddress()
+ call fastcc void @"\01?fin$0@0@test2@@"(i8* %tmp0)
+ ret void
+; CHECK-LABEL: test2:
+; CHECK: Ltest2$frame_escape_0 = 8
+; CHECK: Ltest2$frame_escape_1 = 4
+; CHECK: Ltest2$frame_escape_2 = 0
+; CHECK: calll "?fin$0@0@test2@@"
+}
+
+; Check that the LEA optimization pass works with MCSymbol address displacements.
+define internal fastcc void @"\01?fin$0@0@test2@@"(i8* readonly %frame_pointer) unnamed_addr noinline nounwind optsize {
+entry:
+ %tmp0 = tail call i8* @llvm.localrecover(i8* bitcast (void ()* @test2 to i8*), i8* %frame_pointer, i32 1)
+ %proc = bitcast i8* %tmp0 to i32*
+ %tmp1 = tail call i8* @llvm.localrecover(i8* bitcast (void ()* @test2 to i8*), i8* %frame_pointer, i32 2)
+ %ulpCookie = bitcast i8* %tmp1 to i32*
+ %tmp2 = load i32, i32* @IsolationAwarePrivateT_SqbjaYRiRY, align 4
+ %tobool = icmp eq i32 %tmp2, 0
+ br i1 %tobool, label %if.end, label %land.lhs.true
+
+land.lhs.true:
+ %tmp3 = tail call i8* @llvm.localrecover(i8* bitcast (void ()* @test2 to i8*), i8* %frame_pointer, i32 0)
+ %fActivateActCtxSuccess = bitcast i8* %tmp3 to i32*
+ %tmp4 = load i32, i32* %fActivateActCtxSuccess, align 4
+ %tobool1 = icmp eq i32 %tmp4, 0
+ br i1 %tobool1, label %if.end, label %if.then
+
+if.then:
+ %tmp5 = load i32, i32* %proc, align 4
+ %tobool2 = icmp eq i32 %tmp5, 0
+ br i1 %tobool2, label %cond.end, label %cond.true
+
+cond.true:
+ %call = tail call i32 bitcast (i32 (...)* @GetLastError to i32 ()*)()
+ br label %cond.end
+
+cond.end:
+ %tmp6 = load i32, i32* %ulpCookie, align 4
+ tail call void @IsolationAwareDeactivateActCtx(i32 0, i32 %tmp6)
+ br label %if.end
+
+if.end:
+ ret void
+; CHECK-LABEL: "?fin$0@0@test2@@":
+; CHECK: cmpl $0, Ltest2$frame_escape_0([[REG1:%[a-z]+]])
+; CHECK: leal Ltest2$frame_escape_1([[REG1]]), [[REG2:%[a-z]+]]
+; CHECK: leal Ltest2$frame_escape_2([[REG1]]), [[REG3:%[a-z]+]]
+; CHECK: cmpl $0, ([[REG2]])
+; CHECK: pushl ([[REG3]])
+}
+
+attributes #0 = { nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/lea-opt-memop-check-2.ll b/test/CodeGen/X86/lea-opt-memop-check-2.ll
new file mode 100644
index 000000000000..f3fc95f8be3c
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-memop-check-2.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck %s
+
+; PR27502
+; UNREACHABLE: "Invalid address displacement operand"
+
+@buf = internal global [5 x i8*] zeroinitializer
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*) nounwind
+
+define i32 @test() nounwind optsize {
+ %r = tail call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([5 x i8*]* @buf to i8*))
+ ret i32 %r
+; CHECK-LABEL: test:
+; CHECK: leaq .LBB0_3(%rip), %r[[REG:[a-z]+]]
+; CHECK: movq %r[[REG]], buf+8(%rip)
+; CHECK: #EH_SjLj_Setup .LBB0_3
+; CHECK: xorl %e[[REG]], %e[[REG]]
+; CHECK: jmp .LBB0_2
+; CHECK-LABEL: .LBB0_3: # Block address taken
+; CHECK-LABEL: .LBB0_2:
+}
diff --git a/test/CodeGen/X86/lea-opt.ll b/test/CodeGen/X86/lea-opt.ll
index 8096bfabd6cf..9e0e34b1e09a 100644
--- a/test/CodeGen/X86/lea-opt.ll
+++ b/test/CodeGen/X86/lea-opt.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -enable-x86-lea-opt | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK -check-prefix=ENABLED
+; RUN: llc --disable-x86-lea-opt < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK -check-prefix=DISABLED
%struct.anon1 = type { i32, i32, i32 }
%struct.anon2 = type { i32, [32 x i32], i32 }
@@ -34,16 +35,18 @@ sw.bb.2: ; preds = %entry
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
ret void
; CHECK-LABEL: test1:
-; CHECK: leaq (%rdi,%rdi,2), [[REG1:%[a-z]+]]
-; CHECK: movl arr1(,[[REG1]],4), {{.*}}
-; CHECK: leaq arr1+4(,[[REG1]],4), [[REG2:%[a-z]+]]
-; CHECK: subl arr1+4(,[[REG1]],4), {{.*}}
-; CHECK: leaq arr1+8(,[[REG1]],4), [[REG3:%[a-z]+]]
-; CHECK: addl arr1+8(,[[REG1]],4), {{.*}}
+; CHECK: shlq $2, [[REG1:%[a-z]+]]
+; CHECK: movl arr1([[REG1]],[[REG1]],2), {{.*}}
+; CHECK: leaq arr1+4([[REG1]],[[REG1]],2), [[REG2:%[a-z]+]]
+; CHECK: subl arr1+4([[REG1]],[[REG1]],2), {{.*}}
+; DISABLED: leaq arr1+8([[REG1]],[[REG1]],2), [[REG3:%[a-z]+]]
+; CHECK: addl arr1+8([[REG1]],[[REG1]],2), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
}
define void @test2(i64 %x) nounwind optsize {
@@ -74,16 +77,21 @@ sw.bb.2: ; preds = %entry
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
ret void
; CHECK-LABEL: test2:
-; CHECK: leaq (%rdi,%rdi,2), [[REG1:%[a-z]+]]
-; CHECK: leaq arr1+4(,[[REG1]],4), [[REG2:%[a-z]+]]
-; CHECK: movl -4([[REG2]]), {{.*}}
-; CHECK: subl ([[REG2]]), {{.*}}
-; CHECK: leaq arr1+8(,[[REG1]],4), [[REG3:%[a-z]+]]
-; CHECK: addl ([[REG3]]), {{.*}}
+; CHECK: shlq $2, [[REG1:%[a-z]+]]
+; DISABLED: movl arr1([[REG1]],[[REG1]],2), {{.*}}
+; CHECK: leaq arr1+4([[REG1]],[[REG1]],2), [[REG2:%[a-z]+]]
+; ENABLED: movl -4([[REG2]]), {{.*}}
+; ENABLED: subl ([[REG2]]), {{.*}}
+; ENABLED: addl 4([[REG2]]), {{.*}}
+; DISABLED: subl arr1+4([[REG1]],[[REG1]],2), {{.*}}
+; DISABLED: leaq arr1+8([[REG1]],[[REG1]],2), [[REG3:%[a-z]+]]
+; DISABLED: addl arr1+8([[REG1]],[[REG1]],2), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
}
; Check that LEA optimization pass takes into account a resultant address
@@ -109,7 +117,9 @@ sw.bb.1: ; preds = %entry
sw.bb.2: ; preds = %entry
store i32 333, i32* %a, align 4
- store i32 444, i32* %b, align 4
+  ; Make sure REG3's defining LEA won't be removed as redundant.
+ %cvt = ptrtoint i32* %b to i32
+ store i32 %cvt, i32* %b, align 4
br label %sw.epilog
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
@@ -122,12 +132,14 @@ sw.epilog: ; preds = %sw.bb.2, %sw.bb.1,
; REG3's definition is closer to movl than REG2's, but the pass still chooses
; REG2 because it provides the resultant address displacement fitting 1 byte.
-; CHECK: movl ([[REG2]]), {{.*}}
-; CHECK: addl ([[REG3]]), {{.*}}
+; ENABLED: movl ([[REG2]]), {{.*}}
+; ENABLED: addl ([[REG3]]), {{.*}}
+; DISABLED: movl arr2+132([[REG1]]), {{.*}}
+; DISABLED: addl arr2([[REG1]]), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; CHECK: movl {{.*}}, ([[REG3]])
}
define void @test4(i64 %x) nounwind minsize {
@@ -158,12 +170,19 @@ sw.bb.2: ; preds = %entry
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
ret void
; CHECK-LABEL: test4:
-; CHECK: leaq arr1+4({{.*}}), [[REG2:%[a-z]+]]
-; CHECK: movl -4([[REG2]]), {{.*}}
-; CHECK: subl ([[REG2]]), {{.*}}
-; CHECK: addl 4([[REG2]]), {{.*}}
+; CHECK: imulq {{.*}}, [[REG1:%[a-z]+]]
+; DISABLED: movl arr1([[REG1]]), {{.*}}
+; CHECK: leaq arr1+4([[REG1]]), [[REG2:%[a-z]+]]
+; ENABLED: movl -4([[REG2]]), {{.*}}
+; ENABLED: subl ([[REG2]]), {{.*}}
+; ENABLED: addl 4([[REG2]]), {{.*}}
+; DISABLED: subl arr1+4([[REG1]]), {{.*}}
+; DISABLED: leaq arr1+8([[REG1]]), [[REG3:%[a-z]+]]
+; DISABLED: addl arr1+8([[REG1]]), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, 4([[REG2]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, 4([[REG2]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
}
diff --git a/test/CodeGen/X86/libcall-sret.ll b/test/CodeGen/X86/libcall-sret.ll
index 67b99ac239cd..4ef0a78ad798 100644
--- a/test/CodeGen/X86/libcall-sret.ll
+++ b/test/CodeGen/X86/libcall-sret.ll
@@ -10,14 +10,25 @@ define void @test_sret_libcall(i128 %l, i128 %r) {
; CHECK-LABEL: test_sret_libcall:
 ; Stack for call: 4(sret ptr), 16(i128 %l), 16(i128 %r). So next logical
- ; (aligned) place for the actual sret data is %esp + 40.
-; CHECK: leal 40(%esp), [[SRET_ADDR:%[a-z]+]]
-; CHECK: movl [[SRET_ADDR]], (%esp)
+ ; (aligned) place for the actual sret data is %esp + 20.
+; CHECK: leal 20(%esp), [[SRET_ADDR:%[a-z]+]]
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl [[SRET_ADDR]]
+
; CHECK: calll __multi3
-; CHECK-DAG: movl 40(%esp), [[RES0:%[a-z]+]]
-; CHECK-DAG: movl 44(%esp), [[RES1:%[a-z]+]]
-; CHECK-DAG: movl 48(%esp), [[RES2:%[a-z]+]]
-; CHECK-DAG: movl 52(%esp), [[RES3:%[a-z]+]]
+
+; CHECK: addl $44, %esp
+; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
+; CHECK-DAG: movl 12(%esp), [[RES1:%[a-z]+]]
+; CHECK-DAG: movl 16(%esp), [[RES2:%[a-z]+]]
+; CHECK-DAG: movl 20(%esp), [[RES3:%[a-z]+]]
; CHECK-DAG: movl [[RES0]], var
; CHECK-DAG: movl [[RES1]], var+4
; CHECK-DAG: movl [[RES2]], var+8
diff --git a/test/CodeGen/X86/licm-dominance.ll b/test/CodeGen/X86/licm-dominance.ll
index 7e3c6fdf9514..f6f563c9bcb6 100644
--- a/test/CodeGen/X86/licm-dominance.ll
+++ b/test/CodeGen/X86/licm-dominance.ll
@@ -1,36 +1,55 @@
; RUN: llc -asm-verbose=true < %s | FileCheck %s
; MachineLICM should check dominance before hoisting instructions.
+; Only the load of a0 is guaranteed to execute, so only it can be hoisted.
+; CHECK: movb (%rdi), [[a0reg:%[a-z0-9]+]]
+; CHECK: ## %for.body.i
+; CHECK: testb [[a0reg]], [[a0reg]]
; CHECK: ## in Loop:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
+; CHECK: cmpb $1, ({{%[a-z0-9]+}})
+; CHECK: cmpb $2, ({{%[a-z0-9]+}})
+; CHECK: cmpb $3, ({{%[a-z0-9]+}})
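+; In the loop below, the block that loads a0 (%for.body.i) dominates the latch
+; %for.inc.i, while the blocks loading a1, a2, and a3 are reached only along
+; conditional paths, so MachineLICM may hoist only the load of a0.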
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-macosx10.7.2"
-define void @CMSColorWorldCreateParametricData() nounwind uwtable optsize ssp {
+define void @CMSColorWorldCreateParametricData(
+ i8* dereferenceable(1) %a0,
+ i8* dereferenceable(1) %a1,
+ i8* dereferenceable(1) %a2,
+ i8* dereferenceable(1) %a3,
+ i64 %count) nounwind uwtable optsize ssp readonly {
entry:
br label %for.body.i
-for.body.i:
- br i1 undef, label %for.inc.i, label %if.then26.i
-
-if.then26.i:
- br i1 undef, label %if.else.i.i, label %lor.lhs.false.i.i
-
-if.else.i.i:
- br i1 undef, label %lor.lhs.false.i.i, label %if.then116.i.i
-
-lor.lhs.false.i.i:
- br i1 undef, label %for.inc.i, label %if.then116.i.i
-
-if.then116.i.i:
- unreachable
-
-for.inc.i:
- %cmp17.i = icmp ult i64 undef, undef
+for.body.i:
+ %i = phi i64 [0, %entry], [%i.inc, %for.inc.i]
+ %0 = load i8, i8* %a0, !invariant.load !0
+ %cond0 = icmp eq i8 %0, 0
+ br i1 %cond0, label %for.inc.i, label %if.then26.i
+
+if.then26.i:
+ %1 = load i8, i8* %a1, !invariant.load !0
+ %cond1 = icmp eq i8 %1, 1
+ br i1 %cond1, label %if.else.i.i, label %lor.lhs.false.i.i
+
+if.else.i.i:
+ %2 = load i8, i8* %a2, !invariant.load !0
+ %cond2 = icmp eq i8 %2, 2
+ br i1 %cond2, label %lor.lhs.false.i.i, label %for.inc.i
+
+lor.lhs.false.i.i:
+ %3 = load i8, i8* %a3, !invariant.load !0
+ %cond3 = icmp eq i8 %3, 3
+ br i1 %cond3, label %for.inc.i, label %if.end28.i
+
+for.inc.i:
+ %i.inc = add nsw i64 %i, 1
+ %cmp17.i = icmp ult i64 %i.inc, %count
br i1 %cmp17.i, label %for.body.i, label %if.end28.i
-if.end28.i:
+if.end28.i:
ret void
}
+
+!0 = !{}
diff --git a/test/CodeGen/X86/licm-symbol.ll b/test/CodeGen/X86/licm-symbol.ll
index 0f115ddbb6c2..050289e27c90 100644
--- a/test/CodeGen/X86/licm-symbol.ll
+++ b/test/CodeGen/X86/licm-symbol.ll
@@ -6,7 +6,7 @@
; CHECK: pushl
; CHECK: movl $176, %esi
; CHECK: addl L___sF$non_lazy_ptr, %esi
-; CHECK: .align 4, 0x90
+; CHECK: .p2align 4, 0x90
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/X86/loc-remat.ll b/test/CodeGen/X86/loc-remat.ll
new file mode 100644
index 000000000000..d91ba4b99267
--- /dev/null
+++ b/test/CodeGen/X86/loc-remat.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = common global i32 0, align 4
+
+define i32 @main() !dbg !4 {
+entry:
+ %0 = load volatile i32, i32* @x, align 4, !dbg !9, !tbaa !10
+ %add = add nsw i32 %0, 24, !dbg !9
+ store volatile i32 %add, i32* @x, align 4, !dbg !9, !tbaa !10
+ %1 = load volatile i32, i32* @x, align 4, !dbg !14, !tbaa !10
+ %add1 = add nsw i32 %1, 2, !dbg !14
+ store volatile i32 %add1, i32* @x, align 4, !dbg !14, !tbaa !10
+ %2 = load volatile i32, i32* @x, align 4, !dbg !15, !tbaa !10
+ %add2 = add nsw i32 %2, 3, !dbg !15
+ store volatile i32 %add2, i32* @x, align 4, !dbg !15, !tbaa !10
+ %3 = load volatile i32, i32* @x, align 4, !dbg !16, !tbaa !10
+ %add3 = add nsw i32 %3, 4, !dbg !16
+ store volatile i32 %add3, i32* @x, align 4, !dbg !16, !tbaa !10
+ tail call void @exit(i32 24), !dbg !17
+ unreachable, !dbg !17
+}
+
+; CHECK-LABEL: main:
+; CHECK: .loc 1 3
+; CHECK: .loc 1 4
+; CHECK: .loc 1 5
+; CHECK: .loc 1 6
+; CHECK: .loc 1 7
+; CHECK: .loc 1 8
+; CHECK-NEXT: movl $24, %edi
+; CHECK-NEXT: callq exit
+
+declare void @exit(i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 259383) (llvm/trunk 259385)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "t.c", directory: "/home/majnemer/llvm/src")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, variables: !2)
+!5 = !DISubroutineType(types: !2)
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !DILocation(line: 4, column: 5, scope: !4)
+!10 = !{!11, !11, i64 0}
+!11 = !{!"int", !12, i64 0}
+!12 = !{!"omnipotent char", !13, i64 0}
+!13 = !{!"Simple C/C++ TBAA"}
+!14 = !DILocation(line: 5, column: 5, scope: !4)
+!15 = !DILocation(line: 6, column: 5, scope: !4)
+!16 = !DILocation(line: 7, column: 5, scope: !4)
+!17 = !DILocation(line: 8, column: 3, scope: !4)
diff --git a/test/CodeGen/X86/local_stack_symbol_ordering.ll b/test/CodeGen/X86/local_stack_symbol_ordering.ll
new file mode 100644
index 000000000000..998c14565ce1
--- /dev/null
+++ b/test/CodeGen/X86/local_stack_symbol_ordering.ll
@@ -0,0 +1,184 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X32
+
+; CHECK-LABEL: foo
+
+; Check the functionality of the local stack symbol table ordering
+; heuristics.
+; The test has a bunch of locals of various sizes that are referenced
+; different numbers of times.
+;
+; a : 120B, 9 uses, density = 0.075
+; aa : 4000B, 1 use, density = 0.00025
+; b : 4B, 1 use, density = 0.25
+; cc : 4000B, 2 uses, density = 0.0005
+; d : 4B, 2 uses, density = 0.5
+; e : 4B, 3 uses, density = 0.75
+; f : 4B, 4 uses, density = 1
+;
+; Given the size, number of uses, and calculated density (uses / size), we
+; expect f to be allocated closest to the stack pointer,
+; followed by e, d, b, then a (we check just a few of them).
+; We use GNU inline asm between calls to prevent registerization of addresses
+; so that we get exact counts.
+;
+; The test is taken from something like this:
+; void foo()
+; {
+; int f; // 4 uses. 4 / 4 = 1
+; int a[30]; // 9 uses. 9 / 120 = 0.075
+; int aa[1000]; // 1 use. 1 / 4000 = 0.00025
+; int e; // 3 uses. 3 / 4 = 0.75
+; int cc[1000]; // 2 uses. 2 / 4000 = 0.0005
+; int b; // 1 use. 1 / 4 = 0.25
+; int d; // 2 uses. 2 / 4 = 0.5
+; int aaa[1000]; // 2 uses. 2 / 4000 = 0.0005
+;
+;
+; check_a(&a);
+; bar1(&aaa);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; check_f(&f);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; bar3(&aa, &aaa, &cc);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar2(&a,&cc);
+; check_b(&b);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar2(&a, &f);
+; check_e(&e);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar2(&e, &f);
+; check_d(&d);
+; bar1(&a);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar3(&d, &e, &f);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; }
+;
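+; Expected placement, assuming the heuristic orders symbols by density
+; (uses / size) with the densest closest to the stack pointer:
+;   f : 4 / 4   = 1.0    -> 0(%rsp)
+;   e : 3 / 4   = 0.75   -> 4(%rsp)
+;   d : 2 / 4   = 0.5    -> 8(%rsp)
+;   b : 1 / 4   = 0.25   -> 12(%rsp)
+;   a : 9 / 120 = 0.075  -> 16(%rsp)
+; These are the offsets the X64 checks below look for.
+;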
+; X64: leaq 16(%rsp), %rdi
+; X64: callq check_a
+; X64: callq bar1
+; X64: callq bar1
+; X64: leaq (%rsp), %rdi
+; X64: callq check_f
+; X64: callq bar1
+; X64: callq bar3
+; X64: callq bar2
+; X64: leaq 12(%rsp), %rdi
+; X64: callq check_b
+; X64: callq bar1
+; X64: callq bar2
+; X64: leaq 4(%rsp), %rdi
+; X64: callq check_e
+; X64: callq bar1
+; X64: callq bar2
+; X64: leaq 8(%rsp), %rdi
+; X64: callq check_d
+
+; X32: leal 32(%esp)
+; X32: calll check_a
+; X32: calll bar1
+; X32: calll bar1
+; X32: leal 16(%esp)
+; X32: calll check_f
+; X32: calll bar1
+; X32: calll bar3
+; X32: calll bar2
+; X32: leal 28(%esp)
+; X32: calll check_b
+; X32: calll bar1
+; X32: calll bar2
+; X32: leal 20(%esp)
+; X32: calll check_e
+; X32: calll bar1
+; X32: calll bar2
+; X32: leal 24(%esp)
+; X32: calll check_d
+
+
+define void @foo() nounwind uwtable {
+entry:
+ %f = alloca i32, align 4
+ %a = alloca [30 x i32], align 16
+ %aa = alloca [1000 x i32], align 16
+ %e = alloca i32, align 4
+ %cc = alloca [1000 x i32], align 16
+ %b = alloca i32, align 4
+ %d = alloca i32, align 4
+ %aaa = alloca [1000 x i32], align 16
+ %0 = bitcast i32* %f to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #1
+ %1 = bitcast [30 x i32]* %a to i8*
+ call void @llvm.lifetime.start(i64 120, i8* %1) #1
+ %2 = bitcast [1000 x i32]* %aa to i8*
+ call void @llvm.lifetime.start(i64 4000, i8* %2) #1
+ %3 = bitcast i32* %e to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %3) #1
+ %4 = bitcast [1000 x i32]* %cc to i8*
+ call void @llvm.lifetime.start(i64 4000, i8* %4) #1
+ %5 = bitcast i32* %b to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %5) #1
+ %6 = bitcast i32* %d to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %6) #1
+ %7 = bitcast [1000 x i32]* %aaa to i8*
+ call void @llvm.lifetime.start(i64 4000, i8* %7) #1
+ %call = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @check_a to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ %call1 = call i32 ([1000 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([1000 x i32]*, ...)*)([1000 x i32]* %aaa)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call2 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ %call3 = call i32 (i32*, ...) bitcast (i32 (...)* @check_f to i32 (i32*, ...)*)(i32* %f)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call4 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ %call5 = call i32 ([1000 x i32]*, [1000 x i32]*, [1000 x i32]*, ...) bitcast (i32 (...)* @bar3 to i32 ([1000 x i32]*, [1000 x i32]*, [1000 x i32]*, ...)*)([1000 x i32]* %aa, [1000 x i32]* %aaa, [1000 x i32]* %cc)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call6 = call i32 ([30 x i32]*, [1000 x i32]*, ...) bitcast (i32 (...)* @bar2 to i32 ([30 x i32]*, [1000 x i32]*, ...)*)([30 x i32]* %a, [1000 x i32]* %cc)
+ %call7 = call i32 (i32*, ...) bitcast (i32 (...)* @check_b to i32 (i32*, ...)*)(i32* %b)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call8 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call9 = call i32 ([30 x i32]*, i32*, ...) bitcast (i32 (...)* @bar2 to i32 ([30 x i32]*, i32*, ...)*)([30 x i32]* %a, i32* %f)
+ %call10 = call i32 (i32*, ...) bitcast (i32 (...)* @check_e to i32 (i32*, ...)*)(i32* %e)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call11 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call12 = call i32 (i32*, i32*, ...) bitcast (i32 (...)* @bar2 to i32 (i32*, i32*, ...)*)(i32* %e, i32* %f)
+ %call13 = call i32 (i32*, ...) bitcast (i32 (...)* @check_d to i32 (i32*, ...)*)(i32* %d)
+ %call14 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call15 = call i32 (i32*, i32*, i32*, ...) bitcast (i32 (...)* @bar3 to i32 (i32*, i32*, i32*, ...)*)(i32* %d, i32* %e, i32* %f)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call16 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ call void @llvm.lifetime.end(i64 4000, i8* %7) #1
+ call void @llvm.lifetime.end(i64 4, i8* %6) #1
+ call void @llvm.lifetime.end(i64 4, i8* %5) #1
+ call void @llvm.lifetime.end(i64 4000, i8* %4) #1
+ call void @llvm.lifetime.end(i64 4, i8* %3) #1
+ call void @llvm.lifetime.end(i64 4000, i8* %2) #1
+ call void @llvm.lifetime.end(i64 120, i8* %1) #1
+ call void @llvm.lifetime.end(i64 4, i8* %0) #1
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare i32 @check_a(...) #2
+declare i32 @bar1(...) #2
+declare i32 @check_f(...) #2
+declare i32 @bar3(...) #2
+declare i32 @bar2(...) #2
+declare i32 @check_b(...) #2
+declare i32 @check_e(...) #2
+declare i32 @check_d(...) #2
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
diff --git a/test/CodeGen/X86/localescape.ll b/test/CodeGen/X86/localescape.ll
index 3cd174df0b71..10ab8dd9672f 100644
--- a/test/CodeGen/X86/localescape.ll
+++ b/test/CodeGen/X86/localescape.ll
@@ -39,21 +39,19 @@ define void @print_framealloc_from_fp(i8* %fp) {
; X86-LABEL: print_framealloc_from_fp:
; X86: pushl %esi
-; X86: subl $8, %esp
-; X86: movl 16(%esp), %esi
-; X86: movl Lalloc_func$frame_escape_0(%esi), %eax
-; X86: movl %eax, 4(%esp)
-; X86: movl $_str, (%esp)
+; X86: movl 8(%esp), %esi
+; X86: pushl Lalloc_func$frame_escape_0(%esi)
+; X86: pushl $_str
; X86: calll _printf
-; X86: movl Lalloc_func$frame_escape_1(%esi), %eax
-; X86: movl %eax, 4(%esp)
-; X86: movl $_str, (%esp)
+; X86: addl $8, %esp
+; X86: pushl Lalloc_func$frame_escape_1(%esi)
+; X86: pushl $_str
; X86: calll _printf
+; X86: addl $8, %esp
; X86: movl $42, Lalloc_func$frame_escape_1(%esi)
; X86: movl $4, %eax
-; X86: movl Lalloc_func$frame_escape_1(%esi,%eax), %eax
-; X86: movl %eax, 4(%esp)
-; X86: movl $_str, (%esp)
+; X86: pushl Lalloc_func$frame_escape_1(%esi,%eax)
+; X86: pushl $_str
; X86: calll _printf
; X86: addl $8, %esp
; X86: popl %esi
@@ -132,12 +130,12 @@ define void @alloc_func_no_frameaddr() {
; X64: retq
; X86-LABEL: alloc_func_no_frameaddr:
-; X86: subl $12, %esp
-; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 8
-; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 4
-; X86: movl $42, 8(%esp)
-; X86: movl $13, 4(%esp)
-; X86: movl $0, (%esp)
+; X86: subl $8, %esp
+; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 4
+; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 0
+; X86: movl $42, 4(%esp)
+; X86: movl $13, (%esp)
+; X86: pushl $0
; X86: calll _print_framealloc_from_fp
-; X86: addl $12, %esp
+; X86: addl $12, %esp
; X86: retl
diff --git a/test/CodeGen/X86/lock-inst-encoding.ll b/test/CodeGen/X86/lock-inst-encoding.ll
deleted file mode 100644
index 5ce771f14ab2..000000000000
--- a/test/CodeGen/X86/lock-inst-encoding.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: llc -O0 --show-mc-encoding < %s | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
-
-; CHECK-LABEL: f1:
-; CHECK: addq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x01,0x37]
-; CHECK: ret
-define void @f1(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw add i64* %a, i64 %b monotonic
- ret void
-}
-
-; CHECK-LABEL: f2:
-; CHECK: subq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x29,0x37]
-; CHECK: ret
-define void @f2(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw sub i64* %a, i64 %b monotonic
- ret void
-}
-
-; CHECK-LABEL: f3:
-; CHECK: andq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x21,0x37]
-; CHECK: ret
-define void @f3(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw and i64* %a, i64 %b monotonic
- ret void
-}
-
-; CHECK-LABEL: f4:
-; CHECK: orq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x09,0x37]
-; CHECK: ret
-define void @f4(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw or i64* %a, i64 %b monotonic
- ret void
-}
-
-; CHECK-LABEL: f5:
-; CHECK: xorq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x31,0x37]
-; CHECK: ret
-define void @f5(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw xor i64* %a, i64 %b monotonic
- ret void
-}
diff --git a/test/CodeGen/X86/loop-blocks.ll b/test/CodeGen/X86/loop-blocks.ll
index a81ceb902ab4..1a1d11e6cb31 100644
--- a/test/CodeGen/X86/loop-blocks.ll
+++ b/test/CodeGen/X86/loop-blocks.ll
@@ -200,6 +200,34 @@ block102:
br label %loop
}
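+; Under minsize the loop should be laid out with no alignment padding: the
+; entry jumps into the header (.LBB4_1) and the latch block (.LBB4_2) is
+; placed right before it so that it falls through, as checked below.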
+; CHECK-LABEL: check_minsize:
+; CHECK: jmp .LBB4_1
+; CHECK-NOT: align
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: callq loop_latch
+; CHECK-NEXT: .LBB4_1:
+; CHECK-NEXT: callq loop_header
+
+
+define void @check_minsize() minsize nounwind {
+entry:
+ br label %loop
+
+loop:
+ call void @loop_header()
+ %t0 = tail call i32 @get()
+ %t1 = icmp slt i32 %t0, 0
+ br i1 %t1, label %done, label %bb
+
+bb:
+ call void @loop_latch()
+ br label %loop
+
+done:
+ call void @exit()
+ ret void
+}
+
declare void @bar99() nounwind
declare void @bar100() nounwind
declare void @bar101() nounwind
diff --git a/test/CodeGen/X86/lsr-static-addr.ll b/test/CodeGen/X86/lsr-static-addr.ll
index 97451e5573fe..3980bee9a306 100644
--- a/test/CodeGen/X86/lsr-static-addr.ll
+++ b/test/CodeGen/X86/lsr-static-addr.ll
@@ -11,8 +11,8 @@
; CHECK-NEXT: incq %rax
-; ATOM: xorl %eax, %eax
; ATOM: movsd .LCPI0_0(%rip), %xmm0
+; ATOM: xorl %eax, %eax
; ATOM: align
; ATOM-NEXT: BB0_2:
; ATOM-NEXT: movsd A(,%rax,8)
diff --git a/test/CodeGen/X86/lzcnt-tzcnt.ll b/test/CodeGen/X86/lzcnt-tzcnt.ll
index aa9ae2b7b100..76e7429ab8da 100644
--- a/test/CodeGen/X86/lzcnt-tzcnt.ll
+++ b/test/CodeGen/X86/lzcnt-tzcnt.ll
@@ -72,39 +72,6 @@ define i64 @test6_ctlz(i64 %v) {
; CHECK-NEXT: ret
-define i16 @test7_ctlz(i16 %v) {
- %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
- %tobool = icmp eq i16 0, %v
- %cond = select i1 %tobool, i16 %cnt, i16 16
- ret i16 %cond
-}
-; CHECK-LABEL: test7_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test8_ctlz(i32 %v) {
- %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
- %tobool = icmp eq i32 0, %v
- %cond = select i1 %tobool, i32 %cnt, i32 32
- ret i32 %cond
-}
-; CHECK-LABEL: test8_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test9_ctlz(i64 %v) {
- %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
- %tobool = icmp eq i64 0, %v
- %cond = select i1 %tobool, i64 %cnt, i64 64
- ret i64 %cond
-}
-; CHECK-LABEL: test9_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test10_ctlz(i16* %ptr) {
%v = load i16, i16* %ptr
%cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
@@ -183,45 +150,6 @@ define i64 @test15_ctlz(i64* %ptr) {
; CHECK-NEXT: ret
-define i16 @test16_ctlz(i16* %ptr) {
- %v = load i16, i16* %ptr
- %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
- %tobool = icmp eq i16 0, %v
- %cond = select i1 %tobool, i16 %cnt, i16 16
- ret i16 %cond
-}
-; CHECK-LABEL: test16_ctlz
-; CHECK-NOT: movw
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test17_ctlz(i32* %ptr) {
- %v = load i32, i32* %ptr
- %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
- %tobool = icmp eq i32 0, %v
- %cond = select i1 %tobool, i32 %cnt, i32 32
- ret i32 %cond
-}
-; CHECK-LABEL: test17_ctlz
-; CHECK-NOT: movd
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test18_ctlz(i64* %ptr) {
- %v = load i64, i64* %ptr
- %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
- %tobool = icmp eq i64 0, %v
- %cond = select i1 %tobool, i64 %cnt, i64 64
- ret i64 %cond
-}
-; CHECK-LABEL: test18_ctlz
-; CHECK-NOT: movq
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test1_cttz(i16 %v) {
%cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
%tobool = icmp eq i16 %v, 0
@@ -288,39 +216,6 @@ define i64 @test6_cttz(i64 %v) {
; CHECK-NEXT: ret
-define i16 @test7_cttz(i16 %v) {
- %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
- %tobool = icmp eq i16 0, %v
- %cond = select i1 %tobool, i16 %cnt, i16 16
- ret i16 %cond
-}
-; CHECK-LABEL: test7_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test8_cttz(i32 %v) {
- %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
- %tobool = icmp eq i32 0, %v
- %cond = select i1 %tobool, i32 %cnt, i32 32
- ret i32 %cond
-}
-; CHECK-LABEL: test8_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test9_cttz(i64 %v) {
- %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
- %tobool = icmp eq i64 0, %v
- %cond = select i1 %tobool, i64 %cnt, i64 64
- ret i64 %cond
-}
-; CHECK-LABEL: test9_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test10_cttz(i16* %ptr) {
%v = load i16, i16* %ptr
%cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
@@ -399,77 +294,6 @@ define i64 @test15_cttz(i64* %ptr) {
; CHECK-NEXT: ret
-define i16 @test16_cttz(i16* %ptr) {
- %v = load i16, i16* %ptr
- %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
- %tobool = icmp eq i16 0, %v
- %cond = select i1 %tobool, i16 %cnt, i16 16
- ret i16 %cond
-}
-; CHECK-LABEL: test16_cttz
-; CHECK-NOT: movw
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test17_cttz(i32* %ptr) {
- %v = load i32, i32* %ptr
- %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
- %tobool = icmp eq i32 0, %v
- %cond = select i1 %tobool, i32 %cnt, i32 32
- ret i32 %cond
-}
-; CHECK-LABEL: test17_cttz
-; CHECK-NOT: movd
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test18_cttz(i64* %ptr) {
- %v = load i64, i64* %ptr
- %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
- %tobool = icmp eq i64 0, %v
- %cond = select i1 %tobool, i64 %cnt, i64 64
- ret i64 %cond
-}
-; CHECK-LABEL: test18_cttz
-; CHECK-NOT: movq
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-define i16 @test1b_ctlz(i16 %v) {
- %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
- %tobool = icmp ne i16 %v, 0
- %cond = select i1 %tobool, i16 16, i16 %cnt
- ret i16 %cond
-}
-; CHECK-LABEL: test1b_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test2b_ctlz(i32 %v) {
- %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
- %tobool = icmp ne i32 %v, 0
- %cond = select i1 %tobool, i32 32, i32 %cnt
- ret i32 %cond
-}
-; CHECK-LABEL: test2b_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test3b_ctlz(i64 %v) {
- %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
- %tobool = icmp ne i64 %v, 0
- %cond = select i1 %tobool, i64 64, i64 %cnt
- ret i64 %cond
-}
-; CHECK-LABEL: test3b_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test4b_ctlz(i16 %v) {
%cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
%tobool = icmp ne i16 %v, 0
@@ -503,39 +327,6 @@ define i64 @test6b_ctlz(i64 %v) {
; CHECK-NEXT: ret
-define i16 @test1b_cttz(i16 %v) {
- %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
- %tobool = icmp ne i16 %v, 0
- %cond = select i1 %tobool, i16 16, i16 %cnt
- ret i16 %cond
-}
-; CHECK-LABEL: test1b_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test2b_cttz(i32 %v) {
- %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
- %tobool = icmp ne i32 %v, 0
- %cond = select i1 %tobool, i32 32, i32 %cnt
- ret i32 %cond
-}
-; CHECK-LABEL: test2b_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test3b_cttz(i64 %v) {
- %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
- %tobool = icmp ne i64 %v, 0
- %cond = select i1 %tobool, i64 64, i64 %cnt
- ret i64 %cond
-}
-; CHECK-LABEL: test3b_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test4b_cttz(i16 %v) {
%cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
%tobool = icmp ne i16 %v, 0
diff --git a/test/CodeGen/X86/machine-combiner-int.ll b/test/CodeGen/X86/machine-combiner-int.ll
index 4a1ba1a980ae..df35abd9534d 100644
--- a/test/CodeGen/X86/machine-combiner-int.ll
+++ b/test/CodeGen/X86/machine-combiner-int.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -o /dev/null 2>&1 | FileCheck %s --check-prefix=DEAD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -o - | FileCheck %s --check-prefix=DEAD
; Verify that integer multiplies are reassociated. The first multiply in
; each test should be independent of the result of the preceding add (lea).
@@ -10,9 +10,12 @@
define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) {
; CHECK-LABEL: reassociate_muls_i16:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill
+; CHECK-NEXT: # kill
; CHECK-NEXT: leal (%rdi,%rsi), %eax
; CHECK-NEXT: imull %ecx, %edx
; CHECK-NEXT: imull %edx, %eax
+; CHECK-NEXT: # kill
; CHECK-NEXT: retq
%t0 = add i16 %x0, %x1
%t1 = mul i16 %x2, %t0
@@ -23,6 +26,8 @@ define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) {
define i32 @reassociate_muls_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-LABEL: reassociate_muls_i32:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill
+; CHECK-NEXT: # kill
; CHECK-NEXT: leal (%rdi,%rsi), %eax
; CHECK-NEXT: imull %ecx, %edx
; CHECK-NEXT: imull %edx, %eax
@@ -60,8 +65,8 @@ define i8 @reassociate_ands_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: andb %cl, %dl
; CHECK-NEXT: andb %dil, %dl
-; CHECK_NEXT: movb %dx, %ax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i8 %x0, %x1
%t1 = and i8 %x2, %t0
%t2 = and i8 %x3, %t1
@@ -76,8 +81,8 @@ define i32 @reassociate_ands_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: andl %ecx, %edx
; CHECK-NEXT: andl %edi, %edx
-; CHECK_NEXT: movl %edx, %eax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i32 %x0, %x1
%t1 = and i32 %x2, %t0
%t2 = and i32 %x3, %t1
@@ -91,7 +96,7 @@ define i64 @reassociate_ands_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-NEXT: andq %rcx, %rdx
; CHECK-NEXT: andq %rdi, %rdx
; CHECK-NEXT: movq %rdx, %rax
-; CHECK_NEXT: retq
+; CHECK-NEXT: retq
%t0 = sub i64 %x0, %x1
%t1 = and i64 %x2, %t0
%t2 = and i64 %x3, %t1
@@ -107,8 +112,8 @@ define i8 @reassociate_ors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: orb %cl, %dl
; CHECK-NEXT: orb %dil, %dl
-; CHECK_NEXT: movb %dx, %ax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i8 %x0, %x1
%t1 = or i8 %x2, %t0
%t2 = or i8 %x3, %t1
@@ -123,8 +128,8 @@ define i32 @reassociate_ors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: orl %edi, %edx
-; CHECK_NEXT: movl %edx, %eax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i32 %x0, %x1
%t1 = or i32 %x2, %t0
%t2 = or i32 %x3, %t1
@@ -138,7 +143,7 @@ define i64 @reassociate_ors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: orq %rdi, %rdx
; CHECK-NEXT: movq %rdx, %rax
-; CHECK_NEXT: retq
+; CHECK-NEXT: retq
%t0 = sub i64 %x0, %x1
%t1 = or i64 %x2, %t0
%t2 = or i64 %x3, %t1
@@ -154,8 +159,8 @@ define i8 @reassociate_xors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: xorb %cl, %dl
; CHECK-NEXT: xorb %dil, %dl
-; CHECK_NEXT: movb %dx, %ax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i8 %x0, %x1
%t1 = xor i8 %x2, %t0
%t2 = xor i8 %x3, %t1
@@ -170,8 +175,8 @@ define i32 @reassociate_xors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: xorl %ecx, %edx
; CHECK-NEXT: xorl %edi, %edx
-; CHECK_NEXT: movl %edx, %eax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i32 %x0, %x1
%t1 = xor i32 %x2, %t0
%t2 = xor i32 %x3, %t1
@@ -185,7 +190,7 @@ define i64 @reassociate_xors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: xorq %rdi, %rdx
; CHECK-NEXT: movq %rdx, %rax
-; CHECK_NEXT: retq
+; CHECK-NEXT: retq
%t0 = sub i64 %x0, %x1
%t1 = xor i64 %x2, %t0
%t2 = xor i64 %x3, %t1
diff --git a/test/CodeGen/X86/machine-copy-prop.mir b/test/CodeGen/X86/machine-copy-prop.mir
new file mode 100644
index 000000000000..c2cb4ceb7fbe
--- /dev/null
+++ b/test/CodeGen/X86/machine-copy-prop.mir
@@ -0,0 +1,227 @@
+# RUN: llc -march=x86 -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s
+
+--- |
+ declare void @foo()
+ define void @copyprop_remove_kill0() { ret void }
+ define void @copyprop_remove_kill1() { ret void }
+ define void @copyprop_remove_kill2() { ret void }
+ define void @copyprop0() { ret void }
+ define void @copyprop1() { ret void }
+ define void @copyprop2() { ret void }
+ define void @nocopyprop0() { ret void }
+ define void @nocopyprop1() { ret void }
+ define void @nocopyprop2() { ret void }
+ define void @nocopyprop3() { ret void }
+ define void @nocopyprop4() { ret void }
+ define void @nocopyprop5() { ret void }
+...
+---
+# The second copy is redundant and will be removed, check that we also remove
+# the kill flag of intermediate instructions.
+# CHECK-LABEL: name: copyprop_remove_kill0
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %rdi
+# CHECK-NOT: COPY
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
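+# After the redundant '%rdi = COPY %rax' is erased, %rdi stays live up to the
+# final NOOP, so the 'killed' flag on %rdi at the intermediate NOOP would be
+# stale; the pass has to clear it, which the checks above verify.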
+name: copyprop_remove_kill0
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ NOOP implicit killed %rdi
+ %rdi = COPY %rax
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is redundant and will be removed, check that we also remove
+# the kill flag of intermediate instructions.
+# CHECK-LABEL: name: copyprop_remove_kill1
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %edi
+# CHECK-NOT: COPY
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop_remove_kill1
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ NOOP implicit killed %edi
+ %rdi = COPY %rax
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is redundant and will be removed, check that we also remove
+# the kill flag of intermediate instructions.
+# CHECK-LABEL: name: copyprop_remove_kill2
+# CHECK: bb.0:
+# CHECK-NEXT: %ax = COPY %di
+# CHECK-NEXT: NOOP implicit %rdi
+# CHECK-NOT: COPY
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop_remove_kill2
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %ax = COPY %di
+ NOOP implicit killed %rdi
+ %di = COPY %ax
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is redundant; the call preserves the source and dest register.
+# CHECK-LABEL: name: copyprop0
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64_rt_mostregs
+# CHECK-NEXT: NOOP implicit %edi
+# CHECK-NOT: COPY
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop0
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ CALL64pcrel32 @foo, csr_64_rt_mostregs
+ NOOP implicit killed %edi
+ %rdi = COPY %rax
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is redundant; nothing in between clobbers the source or the dest register.
+# CHECK-LABEL: name: copyprop1
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %rax
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop1
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ NOOP implicit killed %rax
+ %rax = COPY %rdi
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# CHECK-LABEL: name: copyprop2
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %ax
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64_rt_mostregs
+# CHECK-NOT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop2
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ NOOP implicit killed %ax
+ CALL64pcrel32 @foo, csr_64_rt_mostregs
+ %rax = COPY %rdi
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is not redundant if the source register (%rax) is clobbered
+# even if the dest (%rbp) is not.
+# CHECK-LABEL: name: nocopyprop0
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rbp
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+# CHECK-NEXT: %rbp = COPY %rax
+# CHECK-NEXT: NOOP implicit %rax, implicit %rbp
+name: nocopyprop0
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rbp
+ CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+ %rbp = COPY %rax
+ NOOP implicit %rax, implicit %rbp
+...
+---
+# The second copy is not redundant if the dest register (%rax) is clobbered
+# even if the source (%rbp) is not.
+# CHECK-LABEL: name: nocopyprop1
+# CHECK: bb.0:
+# CHECK-NEXT: %rbp = COPY %rax
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+# CHECK-NEXT: %rax = COPY %rbp
+# CHECK-NEXT: NOOP implicit %rax, implicit %rbp
+name: nocopyprop1
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rbp = COPY %rax
+ CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+ %rax = COPY %rbp
+ NOOP implicit %rax, implicit %rbp
+...
+---
+# The second copy is not redundant if the source register (%rax) is clobbered
+# even if the dest (%rbp) is not.
+# CHECK-LABEL: name: nocopyprop2
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rbp
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+# CHECK-NEXT: %rax = COPY %rbp
+# CHECK-NEXT: NOOP implicit %rax, implicit %rbp
+name: nocopyprop2
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rbp
+ CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+ %rax = COPY %rbp
+ NOOP implicit %rax, implicit %rbp
+...
+---
+# The second copy is not redundant if the dest register (%rax) is clobbered
+# even if the source (%rbp) is not.
+# CHECK-LABEL: name: nocopyprop3
+# CHECK: bb.0:
+# CHECK-NEXT: %rbp = COPY %rax
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+# CHECK-NEXT: %rbp = COPY %rax
+# CHECK-NEXT: NOOP implicit %rax, implicit %rbp
+name: nocopyprop3
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rbp = COPY %rax
+ CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+ %rbp = COPY %rax
+ NOOP implicit %rax, implicit %rbp
+...
+---
+# A reserved register may change its value, so the second copy is not redundant.
+# CHECK-LABEL: name: nocopyprop4
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rip
+# CHECK-NEXT: NOOP implicit %rax
+# CHECK-NEXT: %rax = COPY %rip
+# CHECK-NEXT: NOOP implicit %rax
+name: nocopyprop4
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rip
+ NOOP implicit %rax
+ %rax = COPY %rip
+ NOOP implicit %rax
+...
+---
+# Writing to a reserved register may have additional effects (slightly illegal
+# testcase because writing to %rip like this should make the instruction a jump)
+# CHECK-LABEL: name: nocopyprop5
+# CHECK: bb.0:
+# CHECK-NEXT: %rip = COPY %rax
+# CHECK-NEXT: %rip = COPY %rax
+name: nocopyprop5
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rip = COPY %rax
+ %rip = COPY %rax
+...
diff --git a/test/CodeGen/X86/machine-cp.ll b/test/CodeGen/X86/machine-cp.ll
index 143a1c3787a0..57663a011f10 100644
--- a/test/CodeGen/X86/machine-cp.ll
+++ b/test/CodeGen/X86/machine-cp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+sse2 -verify-machineinstrs < %s | FileCheck %s
; After tail duplication, two copies in an early exit BB can be cancelled out.
; rdar://10640363
diff --git a/test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll b/test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll
new file mode 100644
index 000000000000..16ee6ebbbcdb
--- /dev/null
+++ b/test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -O3 -enable-implicit-null-checks -o - < %s 2>&1 | FileCheck %s
+
+declare void @throw0()
+declare void @throw1()
+
+define i1 @f(i8* %p0, i8* %p1) {
+ entry:
+ %c0 = icmp eq i8* %p0, null
+ br i1 %c0, label %throw0, label %continue0, !make.implicit !0
+
+ continue0:
+ %v0 = load i8, i8* %p0
+ %c1 = icmp eq i8* %p1, null
+ br i1 %c1, label %throw1, label %continue1, !make.implicit !0
+
+ continue1:
+ %v1 = load i8, i8* %p1
+ %v = icmp eq i8 %v0, %v1
+ ret i1 %v
+
+ throw0:
+ call void @throw0()
+ unreachable
+
+ throw1:
+ call void @throw1()
+ unreachable
+}
+
+declare void @foo()
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) nounwind readonly
+
+; Check for a crash. The crash is not specific to statepoints, but
+; gc.statepoint is an easy way to generate a fill instruction in
+; %continue0 (which causes the llc crash).
+define i1 @g(i8 addrspace(1)* %p0, i8* %p1) gc "statepoint-example" {
+ entry:
+ %c0 = icmp eq i8 addrspace(1)* %p0, null
+ %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %p0)
+ %p0.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 7, i32 7) ; (%p0, %p0)
+ br i1 %c0, label %throw0, label %continue0, !make.implicit !0
+
+ continue0:
+ %c1 = icmp eq i8* %p1, null
+ br i1 %c1, label %throw1, label %continue1, !make.implicit !0
+
+ continue1:
+ %v0 = load i8, i8 addrspace(1)* %p0.relocated
+ %v1 = load i8, i8* %p1
+ %v = icmp eq i8 %v0, %v1
+ ret i1 %v
+
+ throw0:
+ call void @throw0()
+ unreachable
+
+ throw1:
+ call void @throw1()
+ unreachable
+}
+
+; Check that we have two implicit null checks in @f
+
+; CHECK: __LLVM_FaultMaps:
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 1
+
+; FunctionInfo[0] =
+
+; FunctionAddress =
+; CHECK-NEXT: .quad _f
+
+; NumFaultingPCs =
+; CHECK-NEXT: .long 2
+
+; Reserved =
+; CHECK-NEXT: .long 0
+
+!0 = !{}
diff --git a/test/CodeGen/X86/machine-trace-metrics-crash.ll b/test/CodeGen/X86/machine-trace-metrics-crash.ll
index 048260c51fe3..5b7c5445316c 100644
--- a/test/CodeGen/X86/machine-trace-metrics-crash.ll
+++ b/test/CodeGen/X86/machine-trace-metrics-crash.ll
@@ -51,10 +51,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: 1)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!1 = !DIFile(filename: "24199.cpp", directory: "/bin")
!2 = !{i32 2, !"Debug Info Version", i32 3}
-!3 = distinct !DISubprogram(linkageName: "foo", file: !1, line: 18, isLocal: false, isDefinition: true, scopeLine: 18)
+!3 = distinct !DISubprogram(linkageName: "foo", file: !1, line: 18, isLocal: false, isDefinition: true, scopeLine: 18, unit: !0)
!4 = !DIExpression()
!5 = !DILocalVariable(name: "this", arg: 1, scope: !3, flags: DIFlagArtificial | DIFlagObjectPointer)
!6 = !DILocation(line: 0, scope: !3)
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index b7280d87d3b7..3b748eeb2e5a 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1,10 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
-
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -266,7 +265,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %ymm2, %ymm0
; SKX-NEXT: retq
%a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
@@ -279,8 +278,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
; KNL_64: # BB#0:
-; KNL_64-NEXT: movzbl %sil, %eax
-; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
@@ -292,7 +290,8 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; KNL_32-LABEL: test7:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
@@ -306,7 +305,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; SKX-NEXT: kmovb %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
-; SKX-NEXT: vmovaps %zmm1, %zmm2
+; SKX-NEXT: vmovaps %ymm1, %ymm2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; SKX-NEXT: retq
@@ -405,9 +404,9 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
-; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
-; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -421,10 +420,10 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -434,10 +433,10 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -467,9 +466,9 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
-; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
-; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -483,10 +482,10 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -496,10 +495,10 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -638,8 +637,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
-; SKX-NEXT: vmovd %esi, %xmm1
-; SKX-NEXT: vpbroadcastd %xmm1, %ymm1
+; SKX-NEXT: vpbroadcastd %esi, %ymm1
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
@@ -677,42 +675,42 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
;
; KNL_64-LABEL: test15:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL_64: vpxor %ymm2, %ymm2, %ymm2
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0
-; KNL_64-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
+; KNL_64-NEXT: # kill
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL_32: vpxor %ymm2, %ymm2, %ymm2
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0
-; KNL_32-NEXT: vpsllvq .LCPI14_0, %zmm0, %zmm0
-; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
+; KNL_32-NEXT: # kill
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test15:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <4 x i32> %ind to <4 x i64>
@@ -726,7 +724,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
;
; KNL_64-LABEL: test16:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
@@ -740,7 +738,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
;
; KNL_32-LABEL: test16:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
@@ -756,18 +754,18 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; SKX-LABEL: test16:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %ymm2, %ymm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test16:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %ymm2, %ymm0
; SKX_32-NEXT: retl
%sext_ind = sext <4 x i32> %ind to <4 x i64>
@@ -780,7 +778,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
;
; KNL_64-LABEL: test17:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -790,7 +788,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
;
; KNL_32-LABEL: test17:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1
@@ -802,18 +800,18 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
; SKX-LABEL: test17:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test17:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
@@ -832,36 +830,34 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
;
; KNL_64-LABEL: test18:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_64: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
-; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_32: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_32-NEXT: vpsllvq .LCPI17_0, %zmm2, %zmm2
-; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
+; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX-NEXT: vpmovd2m %xmm2, %k1
+; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test18:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX_32-NEXT: vpmovd2m %xmm2, %k1
+; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
@@ -872,7 +868,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
;
; KNL_64-LABEL: test19:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
@@ -884,7 +880,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
;
; KNL_32-LABEL: test19:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
@@ -898,14 +894,14 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
; SKX-LABEL: test19:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT: retl
@@ -919,36 +915,34 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
;
; KNL_64-LABEL: test20:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
-; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test20:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_32-NEXT: vpsllvq .LCPI19_0, %zmm2, %zmm2
-; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
+; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
; SKX: # BB#0:
-; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vpmovq2m %xmm2, %k0
-; SKX-NEXT: kshiftlw $2, %k0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX: vpsllq $63, %xmm2, %xmm2
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
;
@@ -956,9 +950,9 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; SKX_32: # BB#0:
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vpmovq2m %xmm2, %k0
-; SKX_32-NEXT: kshiftlw $2, %k0, %k0
-; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
+; SKX_32-NEXT: kshiftlb $6, %k0, %k0
+; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
@@ -970,7 +964,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; KNL_64-LABEL: test21:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
@@ -980,7 +974,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; KNL_32-LABEL: test21:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2
@@ -990,20 +984,20 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; SKX-LABEL: test21:
; SKX: # BB#0:
-; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vpmovq2m %xmm2, %k0
-; SKX-NEXT: kshiftlw $2, %k0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX: vpsllq $63, %xmm2, %xmm2
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test21:
; SKX_32: # BB#0:
-; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vpmovq2m %xmm2, %k0
-; SKX_32-NEXT: kshiftlw $2, %k0, %k0
-; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32: vpsllq $63, %xmm2, %xmm2
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
+; SKX_32-NEXT: kshiftlb $6, %k0, %k0
+; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX_32-NEXT: retl
@@ -1019,31 +1013,29 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
;
; KNL_64-LABEL: test22:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_64: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vpsllvq .LCPI21_0, %zmm1, %zmm1
-; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
@@ -1052,23 +1044,23 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; SKX: # BB#0:
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vpmovq2m %xmm1, %k0
-; SKX-NEXT: kshiftlw $2, %k0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test22:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovq2m %xmm1, %k0
-; SKX_32-NEXT: kshiftlw $2, %k0, %k0
-; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0
+; SKX_32-NEXT: kshiftlb $6, %k0, %k0
+; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
@@ -1083,7 +1075,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
;
; KNL_64-LABEL: test23:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -1093,7 +1085,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
;
; KNL_32-LABEL: test23:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1
@@ -1105,18 +1097,18 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
; SKX-LABEL: test23:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test23:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
@@ -1127,8 +1119,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test24:
; KNL_64: # BB#0:
-; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
@@ -1136,7 +1127,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
;
; KNL_32-LABEL: test24:
; KNL_32: # BB#0:
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1
@@ -1149,7 +1140,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test24:
@@ -1157,7 +1148,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
-; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
@@ -1169,7 +1160,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
;
; KNL_64-LABEL: test25:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -1179,7 +1170,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
;
; KNL_32-LABEL: test25:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1
@@ -1191,18 +1182,18 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
; SKX-LABEL: test25:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test25:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
@@ -1214,8 +1205,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
;
; KNL_64-LABEL: test26:
; KNL_64: # BB#0:
-; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
@@ -1223,7 +1213,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
;
; KNL_32-LABEL: test26:
; KNL_32: # BB#0:
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2
@@ -1236,7 +1226,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test26:
@@ -1244,7 +1234,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
-; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
@@ -1260,9 +1250,9 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT: # kill
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
@@ -1271,9 +1261,9 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: movb $3, %cl
-; KNL_32-NEXT: movzbl %cl, %ecx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT: # kill
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
@@ -1295,16 +1285,15 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
;
; KNL_64-LABEL: test28:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test28:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2
@@ -1314,7 +1303,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
;
; SKX-LABEL: test28:
; SKX: # BB#0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
@@ -1322,7 +1311,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
;
; SKX_32-LABEL: test28:
; SKX_32: # BB#0:
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: movb $3, %al
; SKX_32-NEXT: kmovb %eax, %k1
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
@@ -1381,12 +1370,9 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; KNL_64-LABEL: test30:
; KNL_64: # BB#0:
; KNL_64-NEXT: andl $1, %edx
-; KNL_64-NEXT: kmovw %edx, %k1
; KNL_64-NEXT: andl $1, %esi
-; KNL_64-NEXT: kmovw %esi, %k2
; KNL_64-NEXT: movl %edi, %eax
; KNL_64-NEXT: andl $1, %eax
-; KNL_64-NEXT: kmovw %eax, %k0
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
@@ -1394,102 +1380,97 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; KNL_64-NEXT: testb $1, %dil
; KNL_64-NEXT: je .LBB29_2
; KNL_64-NEXT: # BB#1: # %cond.load
-; KNL_64-NEXT: vmovq %xmm1, %rax
-; KNL_64-NEXT: vmovd (%rax), %xmm0
+; KNL_64-NEXT: vmovq %xmm1, %rcx
+; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_64-NEXT: .LBB29_2: # %else
-; KNL_64-NEXT: kmovw %k2, %eax
-; KNL_64-NEXT: movl %eax, %ecx
-; KNL_64-NEXT: andl $1, %ecx
-; KNL_64-NEXT: testb %cl, %cl
+; KNL_64-NEXT: testb %sil, %sil
; KNL_64-NEXT: je .LBB29_4
; KNL_64-NEXT: # BB#3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT: .LBB29_4: # %else2
-; KNL_64-NEXT: kmovw %k1, %ecx
-; KNL_64-NEXT: movl %ecx, %edx
-; KNL_64-NEXT: andl $1, %edx
; KNL_64-NEXT: testb %dl, %dl
; KNL_64-NEXT: je .LBB29_6
; KNL_64-NEXT: # BB#5: # %cond.load4
; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
-; KNL_64-NEXT: vmovq %xmm1, %rdx
-; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
+; KNL_64-NEXT: vmovq %xmm1, %rcx
+; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT: .LBB29_6: # %else5
-; KNL_64-NEXT: kmovw %k0, %edx
-; KNL_64-NEXT: vmovd %edx, %xmm1
-; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; KNL_64-NEXT: vmovd %eax, %xmm1
+; KNL_64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
+; KNL_64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test30:
; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebx
+; KNL_32-NEXT: .Ltmp0:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: pushl %esi
+; KNL_32-NEXT: .Ltmp1:
+; KNL_32-NEXT: .cfi_def_cfa_offset 12
+; KNL_32-NEXT: .Ltmp2:
+; KNL_32-NEXT: .cfi_offset %esi, -12
+; KNL_32-NEXT: .Ltmp3:
+; KNL_32-NEXT: .cfi_offset %ebx, -8
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: andl $1, %eax
-; KNL_32-NEXT: kmovw %eax, %k1
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: andl $1, %eax
-; KNL_32-NEXT: kmovw %eax, %k2
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: movl %eax, %ecx
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: andl $1, %ecx
-; KNL_32-NEXT: kmovw %ecx, %k0
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; KNL_32-NEXT: movl %ebx, %edx
+; KNL_32-NEXT: andl $1, %edx
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; KNL_32-NEXT: # implicit-def: %XMM0
-; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: testb $1, %bl
; KNL_32-NEXT: je .LBB29_2
; KNL_32-NEXT: # BB#1: # %cond.load
-; KNL_32-NEXT: vmovd %xmm1, %eax
-; KNL_32-NEXT: vmovd (%eax), %xmm0
+; KNL_32-NEXT: vmovd %xmm1, %esi
+; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_32-NEXT: .LBB29_2: # %else
-; KNL_32-NEXT: kmovw %k2, %eax
-; KNL_32-NEXT: movl %eax, %ecx
-; KNL_32-NEXT: andl $1, %ecx
; KNL_32-NEXT: testb %cl, %cl
; KNL_32-NEXT: je .LBB29_4
; KNL_32-NEXT: # BB#3: # %cond.load1
-; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
-; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: vpextrd $1, %xmm1, %esi
+; KNL_32-NEXT: vpinsrd $1, (%esi), %xmm0, %xmm0
; KNL_32-NEXT: .LBB29_4: # %else2
-; KNL_32-NEXT: kmovw %k1, %ecx
-; KNL_32-NEXT: movl %ecx, %edx
-; KNL_32-NEXT: andl $1, %edx
-; KNL_32-NEXT: testb %dl, %dl
+; KNL_32-NEXT: testb %al, %al
; KNL_32-NEXT: je .LBB29_6
; KNL_32-NEXT: # BB#5: # %cond.load4
-; KNL_32-NEXT: vpextrd $2, %xmm1, %edx
-; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0
+; KNL_32-NEXT: vpextrd $2, %xmm1, %esi
+; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0
; KNL_32-NEXT: .LBB29_6: # %else5
-; KNL_32-NEXT: kmovw %k0, %edx
; KNL_32-NEXT: vmovd %edx, %xmm1
-; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; KNL_32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; KNL_32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_32-NEXT: popl %esi
+; KNL_32-NEXT: popl %ebx
; KNL_32-NEXT: retl
;
; SKX-LABEL: test30:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX-NEXT: vpmovd2m %xmm2, %k1
+; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT: # implicit-def: %XMM0
-; SKX-NEXT: andb $1, %al
+; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_2
; SKX-NEXT: # BB#1: # %cond.load
; SKX-NEXT: vmovq %xmm1, %rax
-; SKX-NEXT: vmovd (%rax), %xmm0
+; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: .LBB29_2: # %else
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SKX-NEXT: andb $1, %al
+; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_4
; SKX-NEXT: # BB#3: # %cond.load1
; SKX-NEXT: vpextrq $1, %xmm1, %rax
@@ -1497,15 +1478,14 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; SKX-NEXT: .LBB29_4: # %else2
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SKX-NEXT: andb $1, %al
+; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_6
; SKX-NEXT: # BB#5: # %cond.load4
-; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm1
; SKX-NEXT: vmovq %xmm1, %rax
; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
; SKX-NEXT: .LBB29_6: # %else5
-; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
-; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vpblendmd %xmm0, %xmm3, %xmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test30:
@@ -1514,36 +1494,36 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; SKX_32-NEXT: .Ltmp0:
; SKX_32-NEXT: .cfi_def_cfa_offset 16
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX_32-NEXT: vpmovd2m %xmm2, %k1
+; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
-; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; SKX_32-NEXT: # implicit-def: %XMM1
-; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: # implicit-def: %XMM0
+; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_2
; SKX_32-NEXT: # BB#1: # %cond.load
-; SKX_32-NEXT: vmovd %xmm2, %eax
-; SKX_32-NEXT: vmovd (%eax), %xmm1
+; SKX_32-NEXT: vmovd %xmm1, %eax
+; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX_32-NEXT: .LBB29_2: # %else
; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_4
; SKX_32-NEXT: # BB#3: # %cond.load1
-; SKX_32-NEXT: vpextrd $1, %xmm2, %eax
-; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
+; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: .LBB29_4: # %else2
-; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm0
+; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm2
; SKX_32-NEXT: kmovb %k1, (%esp)
; SKX_32-NEXT: movb (%esp), %al
-; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_6
; SKX_32-NEXT: # BB#5: # %cond.load4
-; SKX_32-NEXT: vpextrd $2, %xmm2, %eax
-; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: vpextrd $2, %xmm1, %eax
+; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: .LBB29_6: # %else5
-; SKX_32-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; SKX_32-NEXT: vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
; SKX_32-NEXT: addl $12, %esp
; SKX_32-NEXT: retl
@@ -1660,12 +1640,12 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-LABEL: test_gather_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Ltmp0:
+; KNL_32-NEXT: .Ltmp4:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Ltmp1:
+; KNL_32-NEXT: .Ltmp5:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Ltmp2:
+; KNL_32-NEXT: .Ltmp6:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -1783,12 +1763,12 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-LABEL: test_gather_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Ltmp3:
+; KNL_32-NEXT: .Ltmp7:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Ltmp4:
+; KNL_32-NEXT: .Ltmp8:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Ltmp5:
+; KNL_32-NEXT: .Ltmp9:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -1900,12 +1880,12 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Ltmp6:
+; KNL_32-NEXT: .Ltmp10:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Ltmp7:
+; KNL_32-NEXT: .Ltmp11:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Ltmp8:
+; KNL_32-NEXT: .Ltmp12:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -2014,12 +1994,12 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Ltmp9:
+; KNL_32-NEXT: .Ltmp13:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Ltmp10:
+; KNL_32-NEXT: .Ltmp14:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Ltmp11:
+; KNL_32-NEXT: .Ltmp15:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index c29933e266b2..e3657d67ad0e 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -1,379 +1,1562 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2
-; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s --check-prefix=SKX
-
-; AVX512-LABEL: test1
-; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-
-; AVX2-LABEL: test1
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2-NOT: blend
-
-; AVX_SCALAR-LABEL: test1
-; AVX_SCALAR-NOT: masked
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: insertelement
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: insertelement
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX
+
+; To test for the case where masked load/store is not legal, we should add a run with a target
+; that does not have AVX, but that case should probably be a separate test file using fewer tests
+; because it takes over 1.2 seconds to codegen these tests on a 4 GHz Haswell if there's no maskmov.
+
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
+; AVX1-LABEL: test1:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test1:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test1:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
+ %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
ret <16 x i32> %res
}
-; AVX512-LABEL: test2
-; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-
-; AVX2-LABEL: test2
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2-NOT: blend
define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
+; AVX1-LABEL: test2:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test2:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test2:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
+ %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
ret <16 x i32> %res
}
-; AVX512-LABEL: test3
-; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}
-
-; AVX_SCALAR-LABEL: test3
-; AVX_SCALAR-NOT: masked
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: store
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: store
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: store
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
+; AVX1-LABEL: test3:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps %ymm3, %ymm1, 32(%rdi)
+; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test3:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd %ymm3, %ymm1, 32(%rdi)
+; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test3:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
+ call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
ret void
}
-; AVX512-LABEL: test4
-; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}}
-
-; AVX2-LABEL: test4
-; AVX2: vmaskmovps {{.*}}(%rdi)
-; AVX2: vmaskmovps {{.*}}(%rdi)
-; AVX2: blend
define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
+; AVX1-LABEL: test4:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm4
+; AVX1-NEXT: vblendvps %ymm0, %ymm4, %ymm2, %ymm0
+; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm2
+; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test4:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vblendvps %ymm0, %ymm4, %ymm2, %ymm0
+; AVX2-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm2
+; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vmovups (%rdi), %zmm1 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
+ %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
ret <16 x float> %res
}
-; AVX512-LABEL: test5
-; AVX512: vmovupd (%rdi), %zmm1 {%k1}
-
-; AVX2-LABEL: test5
-; AVX2: vmaskmovpd
-; AVX2: vblendvpd
-; AVX2: vmaskmovpd
-; AVX2: vblendvpd
define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
+; AVX1-LABEL: test5:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
+; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
+; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test5:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test5:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test5:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
- %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
+ %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
ret <8 x double> %res
}
-; AVX2-LABEL: test6
-; AVX2: vmaskmovpd
-; AVX2: vblendvpd
-
-; SKX-LABEL: test6
-; SKX: vmovupd {{.*}}{%k1}
define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
+; AVX-LABEL: test6:
+; AVX: ## BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test6:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test6:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovupd (%rdi), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
- %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
ret <2 x double> %res
}
-; AVX2-LABEL: test7
-; AVX2: vmaskmovps {{.*}}(%rdi)
-; AVX2: blend
-
-; SKX-LABEL: test7
-; SKX: vmovups (%rdi){{.*}}{%k1}
define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
+; AVX-LABEL: test7:
+; AVX: ## BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test7:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test7:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
+ %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
ret <4 x float> %res
}
-; AVX2-LABEL: test8
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2: blend
-
-; SKX-LABEL: test8
-; SKX: vmovdqu32 (%rdi){{.*}}{%k1}
define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
+; AVX1-LABEL: test8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
ret <4 x i32> %res
}
-; AVX2-LABEL: test9
-; AVX2: vpmaskmovd %xmm
-
-; SKX-LABEL: test9
-; SKX: vmovdqu32 %xmm{{.*}}{%k1}
define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+; AVX1-LABEL: test9:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test9:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test9:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test9:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
ret void
}
-; AVX2-LABEL: test10
-; AVX2: vmaskmovpd (%rdi), %ymm
-; AVX2: blend
-
-; SKX-LABEL: test10
-; SKX: vmovapd {{.*}}{%k1}
define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
+; AVX1-LABEL: test10:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test10:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test10:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test10:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovapd (%rdi), %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
ret <4 x double> %res
}
-; AVX2-LABEL: test11a
-; AVX2: vmaskmovps
-; AVX2: vblendvps
+define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
+; AVX1-LABEL: test10b:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test10b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test10b:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test10b:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
+ ret <4 x double> %res
+}
-; SKX-LABEL: test11a
-; SKX: vmovaps (%rdi), %ymm1 {%k1}
-; AVX512-LABEL: test11a
-; AVX512: kshiftlw $8
-; AVX512: kshiftrw $8
-; AVX512: vmovups (%rdi), %zmm1 {%k1}
define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
+; AVX1-LABEL: test11a:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11a:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test11a:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test11a:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT: vmovaps (%rdi), %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
- %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
ret <8 x float> %res
}
-; SKX-LABEL: test11b
-; SKX: vmovdqu32 (%rdi), %ymm1 {%k1}
-; AVX512-LABEL: test11b
-; AVX512: kshiftlw $8
-; AVX512: kshiftrw $8
-; AVX512: vmovdqu32 (%rdi), %zmm1 {%k1}
define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
- %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
+; AVX1-LABEL: test11b:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test11b:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test11b:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: retq
+ %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
ret <8 x i32> %res
}
-; SKX-LABEL: test11c
-; SKX: vmovaps (%rdi), %ymm0 {%k1} {z}
-; AVX512-LABEL: test11c
-; AVX512: kshiftlw $8
-; AVX512: kshiftrw $8
-; AVX512: vmovups (%rdi), %zmm0 {%k1} {z}
define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
- %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
+; AVX1-LABEL: test11c:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11c:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test11c:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test11c:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
ret <8 x float> %res
}
-; SKX-LABEL: test11d
-; SKX: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
-; AVX512-LABEL: test11d
-; AVX512: kshiftlw $8
-; AVX512: kshiftrw $8
-; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
- %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
+; AVX1-LABEL: test11d:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test11d:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test11d:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
ret <8 x i32> %res
}
-; AVX2-LABEL: test12
-; AVX2: vpmaskmovd %ymm
-
-; SKX-LABEL: test12
-; SKX: vmovdqu32 {{.*}}{%k1}
define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
+; AVX1-LABEL: test12:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test12:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test12:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test12:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
+ call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
ret void
}
-; AVX512-LABEL: test13
-; AVX512: vmovups %zmm1, (%rdi) {%k1}
-
define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
+; AVX1-LABEL: test13:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps %ymm3, %ymm1, 32(%rdi)
+; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test13:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vmaskmovps %ymm3, %ymm1, 32(%rdi)
+; AVX2-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test13:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
+ call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
ret void
}
-; AVX2-LABEL: test14
-; AVX2: vpshufd
-; AVX2: vmovq
-; AVX2: vmaskmovps
-
-; SKX-LABEL: test14
-; SKX: kshiftl
-; SKX: kshiftr
-; SKX: vmovups {{.*}}{%k1}
-
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
+; AVX1-LABEL: test14:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test14:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test14:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test14:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
+ call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
ret void
}
-; AVX2-LABEL: test15
-; AVX2: vpmaskmovd
-
+define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
+; AVX1-LABEL: test15:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test15:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test15:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test15:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
-define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
+ call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
ret void
}
-; AVX2-LABEL: test16
-; AVX2: vmaskmovps
-; AVX2: vblendvps
-
-; SKX-LABEL: test16
-; SKX: kshiftl
-; SKX: kshiftr
-; SKX: vmovups {{.*}}{%k1}
define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
+; AVX1-LABEL: test16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
ret <2 x float> %res
}
-; AVX2-LABEL: test17
-; AVX2: vpmaskmovd
-; AVX2: vblendvps
-; AVX2: vpmovsxdq
-
-; SKX-LABEL: test17
-; SKX: kshiftl
-; SKX: kshiftr
-; SKX: vmovdqu32 {{.*}}{%k1}
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
+; AVX1-LABEL: test17:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test17:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test17:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test17:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
+; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
ret <2 x i32> %res
}
-; AVX2-LABEL: test18
-; AVX2: vmaskmovps
-; AVX2-NOT: blend
-; AVX2: ret
define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
+; AVX1-LABEL: test18:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test18:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test18:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test18:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; SKX-NEXT: kshiftlw $2, %k0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $14, %k0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
ret <2 x float> %res
}
-; AVX_SCALAR-LABEL: test19
-; AVX_SCALAR: load <4 x float>, <4 x float>* %addr, align 4
-
-define <4 x float> @test19(<4 x i32> %trigger, <4 x float>* %addr) {
+define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
+; AVX-LABEL: load_all:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: load_all:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: load_all:
+; SKX: ## BB#0:
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
+ %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
ret <4 x float> %res
}
-; AVX_SCALAR-LABEL: test20
-; AVX_SCALAR: load float, {{.*}}, align 4
-; AVX_SCALAR: insertelement <4 x float> undef, float
-; AVX_SCALAR: select <4 x i1> <i1 true, i1 false, i1 true, i1 true>
+;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
-define <4 x float> @test20(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %src0) {
- %mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 16, <4 x i1><i1 true, i1 false, i1 true, i1 true>, <4 x float> %src0)
+; 128-bit FP vectors are supported with AVX.
+
+define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
+; AVX-LABEL: mload_constmask_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4f32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [4294967295,0,4294967295,4294967295]
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4f32:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $13, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
ret <4 x float> %res
}
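+
+; Editorial note (assumption, not from the original test): the SKX constant-mask
+; immediate puts element 0 in the least significant bit, so the mask
+; <i1 1, i1 0, i1 1, i1 1> encodes as 0b1101 = 13, which is the "movb $13" above.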
-; AVX_SCALAR-LABEL: test21
-; AVX_SCALAR: store <4 x i32> %val
+; 128-bit integer vectors are supported with AVX2.
+
+define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
+; AVX1-LABEL: mload_constmask_v4i32:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v4i32:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4i32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
+; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4i32:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $14, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
+ ret <4 x i32> %res
+}
+
+; 256-bit FP vectors are supported with AVX.
+
+define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
+; AVX-LABEL: mload_constmask_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
+; AVX-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v8f32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: movw $7, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v8f32:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovups (%rdi), %ymm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
+ ret <8 x float> %res
+}
+
+define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
+; AVX-LABEL: mload_constmask_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4f64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm2
+; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4f64:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
+ ret <4 x double> %res
+}
+
+; 256-bit integer vectors are supported with AVX2.
+
+define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
+; AVX1-LABEL: mload_constmask_v8i32:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v8i32:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v8i32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: movw $135, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v8i32:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $-121, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
+ ret <8 x i32> %res
+}
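+
+; Editorial note (assumption, not from the original test): the 8-bit constant mask
+; <1,1,1,0,0,0,0,1> encodes as 0b10000111 = 135 (AVX512F: "movw $135"); SKX moves
+; the same value as a signed byte, 135 - 256 = -121, hence "movb $-121".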
+
+define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
+; AVX1-LABEL: mload_constmask_v4i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = mem[0],ymm0[1,2],mem[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v4i64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4i64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]
+; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm2
+; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4i64:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
+ ret <4 x i64> %res
+}
+
+; 512-bit FP vectors are supported with AVX512.
+
+define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
+; AVX-LABEL: mload_constmask_v8f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],mem[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3]
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: mload_constmask_v8f64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: movb $-121, %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovupd (%rdi), %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
+ ret <8 x double> %res
+}
+
+; If the pass-through operand is undef, no blend is needed.
+
+define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
+; AVX-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
+ ret <4 x double> %res
+}
+
+define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
+; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $6, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
+ ret <4 x i64> %res
+}
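+
+; Editorial note (assumption, not from the original test): because the pass-through
+; operand is undef, SKX can use zero-masking ({z}) instead of blending into a
+; destination register. The immediates follow the usual encoding:
+; <1,1,1,0> -> 0b0111 = 7 and <0,1,1,0> -> 0b0110 = 6.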
+
define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+; AVX1-LABEL: test21:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test21:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test21:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test21:
+; SKX: ## BB#0:
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
ret void
}
-; AVX_SCALAR-LABEL: test22
-; AVX_SCALAR: extractelement <4 x i32> %val, i32 0
-; AVX_SCALAR: store i32
-define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
- %mask = icmp eq <4 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+; When only one element of the mask is set, reduce to a scalar store.
+
+define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+; AVX-LABEL: one_mask_bit_set1:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: one_mask_bit_set1:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vmovd %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+ ret void
+}
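+
+; Editorial sketch (not part of the original test; the function name is invented
+; for illustration): with the single-bit mask <1,0,0,0>, the masked store above is
+; roughly equivalent to this scalar form, which the "vmovd %xmm0, (%rdi)" implements.
+define void @one_mask_bit_set1_scalar_sketch(<4 x i32>* %addr, <4 x i32> %val) {
+  %ptr = bitcast <4 x i32>* %addr to i32*       ; address of element 0
+  %elt = extractelement <4 x i32> %val, i32 0   ; the only element that is stored
+  store i32 %elt, i32* %ptr, align 4
+  ret void
+}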
+
+; Choose a different element to show that the correct address offset is produced.
+
+define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+; AVX-LABEL: one_mask_bit_set2:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: one_mask_bit_set2:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi)
+; AVX512-NEXT: retq
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
+ ret void
+}
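+
+; Editorial note (assumption, not from the original test): the offset comes from
+; the position of the set mask bit: element 2 of <4 x float> at 4 bytes per element
+; gives 2 * 4 = 8, hence "vextractps $2, %xmm0, 8(%rdi)".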
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
+; AVX-LABEL: one_mask_bit_set3:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: one_mask_bit_set3:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, 16(%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: one_mask_bit_set3:
+; SKX: ## BB#0:
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; SKX-NEXT: vmovq %xmm0, 16(%rdi)
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
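+
+; Editorial note (assumption, not from the original test): element 2 of <4 x i64>
+; lives at byte offset 2 * 8 = 16 and sits in the upper 128-bit lane, so every
+; target first extracts lane 1 and then stores its low quadword to 16(%rdi).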
-declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
-declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
-declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
-declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
-declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
-declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
-declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
-declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
-declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
-declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
-declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
-declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
-declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
-declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
-declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
-declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
-
-declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
-
-; AVX512-LABEL: test23
-; AVX512: vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
-; AVX512: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
+; AVX-LABEL: one_mask_bit_set4:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: one_mask_bit_set4:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vmovhpd %xmm0, 24(%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: one_mask_bit_set4:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; SKX-NEXT: vmovhpd %xmm0, 24(%rdi)
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
+
+define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
+; AVX-LABEL: one_mask_bit_set5:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-NEXT: vmovlps %xmm0, 48(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: one_mask_bit_set5:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi)
+; AVX512-NEXT: retq
+ call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
+ ret void
+}
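+
+; Editorial note (assumption, not from the original test): element 6 of <8 x double>
+; is at byte offset 6 * 8 = 48; it is the low element of 128-bit lane 3, hence the
+; "vextractf32x4 $3" plus a low-quadword store on AVX512, and the extract from the
+; upper half of %ymm1 (elements 4-7) on AVX.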
+
+; When only one element of the mask is set, reduce to a scalar load.
+
+define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+; AVX-LABEL: load_one_mask_bit_set1:
+; AVX: ## BB#0:
+; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: load_one_mask_bit_set1:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
+ ret <4 x i32> %res
+}
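+
+; Editorial sketch (not part of the original test; the function name is invented
+; for illustration): with the single-bit mask <1,0,0,0>, the masked load above is
+; roughly a scalar load inserted into the pass-through value, which is what
+; "vpinsrd $0, (%rdi), %xmm0, %xmm0" implements.
+define <4 x i32> @load_one_mask_bit_set1_scalar_sketch(<4 x i32>* %addr, <4 x i32> %val) {
+  %ptr = bitcast <4 x i32>* %addr to i32*        ; address of element 0
+  %elt = load i32, i32* %ptr, align 4            ; the only element that is loaded
+  %res = insertelement <4 x i32> %val, i32 %elt, i32 0
+  ret <4 x i32> %res
+}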
+
+; Choose a different element to show that the correct address offset is produced.
+
+define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+; AVX-LABEL: load_one_mask_bit_set2:
+; AVX: ## BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: load_one_mask_bit_set2:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512-NEXT: retq
+ %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
+ ret <4 x float> %res
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
+; AVX1-LABEL: load_one_mask_bit_set3:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_one_mask_bit_set3:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: load_one_mask_bit_set3:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: load_one_mask_bit_set3:
+; SKX: ## BB#0:
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
+ ret <4 x i64> %res
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
+; AVX-LABEL: load_one_mask_bit_set4:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: load_one_mask_bit_set4:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: load_one_mask_bit_set4:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
+ ret <4 x double> %res
+}
+
+; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
+
+define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
+; AVX-LABEL: load_one_mask_bit_set5:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: load_one_mask_bit_set5:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
+ ret <8 x double> %res
+}
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
+declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
+declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
+
+declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
+; AVX1-LABEL: test23:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm3, %ymm3
+; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm2
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test23:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm3, %ymm3
+; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm2, %ymm2
+; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test23:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; AVX512-NEXT: vpcmpeqq %zmm2, %zmm1, %k2
+; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32*> %trigger, zeroinitializer
- %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
+ %res = call <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
ret <16 x i32*> %res
}
%mystruct = type { i16, i16, [1 x i8*] }
-declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
+declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
-; AVX512-LABEL: test24:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX1-LABEL: test24:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm1, %ymm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm1, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm0, %ymm1
+; AVX1-NEXT: vmovapd %ymm4, %ymm0
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test24:
; AVX2: ## BB#0:
@@ -403,6 +1586,16 @@ define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; AVX2-NEXT: vmovdqa %ymm4, %ymm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test24:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test24:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -411,20 +1604,50 @@ define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
; SKX-NEXT: retq
- %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
+ %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
ret <16 x %mystruct*> %res
}
define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
-; AVX512-LABEL: test_store_16i64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_store_16i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmaskmovpd %ymm1, %ymm5, (%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm4, %ymm1, 96(%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm3, %ymm1, 64(%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd %ymm2, %ymm0, 32(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_store_16i64:
; AVX2: ## BB#0:
@@ -454,6 +1677,16 @@ define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %sr
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_store_16i64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_store_16i64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -462,20 +1695,51 @@ define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %sr
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
; SKX-NEXT: retq
- call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
+ call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
+
define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
-; AVX512-LABEL: test_store_16f64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovupd %zmm1, (%rdi) {%k1}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_store_16f64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmaskmovpd %ymm1, %ymm5, (%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm4, %ymm1, 96(%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm3, %ymm1, 64(%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd %ymm2, %ymm0, 32(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_store_16f64:
; AVX2: ## BB#0:
@@ -505,6 +1769,16 @@ define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x doubl
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_store_16f64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_store_16f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -513,22 +1787,55 @@ define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x doubl
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
; SKX-NEXT: retq
- call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
+ call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
+
define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
-; AVX512-LABEL: test_load_16i64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
-; AVX512-NEXT: vmovaps %zmm2, %zmm1
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_load_16i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX1-NEXT: vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX1-NEXT: vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX1-NEXT: vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vmovapd %ymm5, %ymm0
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_16i64:
; AVX2: ## BB#0:
@@ -536,22 +1843,22 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
-; AVX2-NEXT: vpmaskmovq (%rdi), %ymm5, %ymm9
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
-; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
-; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
-; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm7, %ymm8
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
-; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
-; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
-; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm6, %ymm10
-; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1
-; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm5, %ymm6
+; AVX2-NEXT: vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm6
+; AVX2-NEXT: vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm2, %ymm6
+; AVX2-NEXT: vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -562,6 +1869,18 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; AVX2-NEXT: vmovapd %ymm5, %ymm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_load_16i64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps %zmm2, %zmm1
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_load_16i64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -572,22 +1891,55 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vmovaps %zmm2, %zmm1
; SKX-NEXT: retq
- %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
+ %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
}
-declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+
define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
-; AVX512-LABEL: test_load_16f64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k1}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
-; AVX512-NEXT: vmovaps %zmm2, %zmm1
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_load_16f64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX1-NEXT: vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX1-NEXT: vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX1-NEXT: vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vmovapd %ymm5, %ymm0
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_16f64:
; AVX2: ## BB#0:
@@ -595,22 +1947,22 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
-; AVX2-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm9
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
-; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
-; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
-; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm7, %ymm8
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
-; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
-; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
-; AVX2-NEXT: vmaskmovpd 64(%rdi), %ymm6, %ymm10
-; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1
-; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX2-NEXT: vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX2-NEXT: vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX2-NEXT: vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -621,6 +1973,18 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; AVX2-NEXT: vmovapd %ymm5, %ymm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_load_16f64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps %zmm2, %zmm1
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_load_16f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -631,32 +1995,117 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vmovaps %zmm2, %zmm1
; SKX-NEXT: retq
- %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
+ %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
}
-declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
+declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
-; AVX512-LABEL: test_load_32f64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5
-; AVX512-NEXT: vpslld $31, %zmm5, %zmm5
-; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k1
-; AVX512-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k2}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; AVX512-NEXT: kshiftrw $8, %k2, %k1
-; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
-; AVX512-NEXT: vmovaps %zmm2, %zmm1
-; AVX512-NEXT: vmovaps %zmm3, %zmm2
-; AVX512-NEXT: vmovaps %zmm4, %zmm3
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_load_32f64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: Ltmp0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: Ltmp1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: Ltmp2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vmovapd 16(%rbp), %ymm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm9, %xmm9
+; AVX1-NEXT: vpsrad $31, %xmm9, %xmm9
+; AVX1-NEXT: vpmovsxdq %xmm9, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm9, %xmm9
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT: vmaskmovpd 32(%rsi), %ymm9, %ymm10
+; AVX1-NEXT: vblendvpd %ymm9, %ymm10, %ymm2, %ymm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2
+; AVX1-NEXT: vmaskmovpd 64(%rsi), %ymm2, %ymm10
+; AVX1-NEXT: vblendvpd %ymm2, %ymm10, %ymm3, %ymm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2
+; AVX1-NEXT: vmaskmovpd 96(%rsi), %ymm2, %ymm10
+; AVX1-NEXT: vblendvpd %ymm2, %ymm10, %ymm4, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT: vmaskmovpd 160(%rsi), %ymm3, %ymm10
+; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT: vmaskmovpd 192(%rsi), %ymm3, %ymm10
+; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm7, %ymm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT: vmaskmovpd 224(%rsi), %ymm3, %ymm10
+; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm8, %ymm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm8
+; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm1, %ymm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd 128(%rsi), %ymm1, %ymm2
+; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
+; AVX1-NEXT: vmovapd %ymm1, 128(%rdi)
+; AVX1-NEXT: vmovapd %ymm0, (%rdi)
+; AVX1-NEXT: vmovapd %ymm3, 224(%rdi)
+; AVX1-NEXT: vmovapd %ymm7, 192(%rdi)
+; AVX1-NEXT: vmovapd %ymm6, 160(%rdi)
+; AVX1-NEXT: vmovapd %ymm4, 96(%rdi)
+; AVX1-NEXT: vmovapd %ymm11, 64(%rdi)
+; AVX1-NEXT: vmovapd %ymm9, 32(%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_32f64:
; AVX2: ## BB#0:
@@ -670,27 +2119,28 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX2-NEXT: .cfi_def_cfa_register %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $32, %rsp
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm8, %xmm8
-; AVX2-NEXT: vpsrad $31, %xmm8, %xmm8
-; AVX2-NEXT: vpmovsxdq %xmm8, %ymm8
-; AVX2-NEXT: vmaskmovpd 32(%rsi), %ymm8, %ymm9
-; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm10, %xmm10
-; AVX2-NEXT: vpsrad $31, %xmm10, %xmm10
-; AVX2-NEXT: vpmovsxdq %xmm10, %ymm10
-; AVX2-NEXT: vmaskmovpd 64(%rsi), %ymm10, %ymm11
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[3,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm12, %xmm12
-; AVX2-NEXT: vpsrad $31, %xmm12, %xmm12
-; AVX2-NEXT: vpmovsxdq %xmm12, %ymm12
-; AVX2-NEXT: vmaskmovpd 96(%rsi), %ymm12, %ymm13
-; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm2, %ymm8
-; AVX2-NEXT: vblendvpd %ymm10, %ymm11, %ymm3, %ymm9
-; AVX2-NEXT: vblendvpd %ymm12, %ymm13, %ymm4, %ymm11
+; AVX2-NEXT: vmovapd 16(%rbp), %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm9, %xmm9
+; AVX2-NEXT: vpsrad $31, %xmm9, %xmm9
+; AVX2-NEXT: vpmovsxdq %xmm9, %ymm9
+; AVX2-NEXT: vmaskmovpd 32(%rsi), %ymm9, %ymm10
+; AVX2-NEXT: vblendvpd %ymm9, %ymm10, %ymm2, %ymm9
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vmaskmovpd 64(%rsi), %ymm2, %ymm10
+; AVX2-NEXT: vblendvpd %ymm2, %ymm10, %ymm3, %ymm11
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vmaskmovpd 96(%rsi), %ymm2, %ymm10
+; AVX2-NEXT: vblendvpd %ymm2, %ymm10, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
@@ -698,28 +2148,27 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vmaskmovpd 160(%rsi), %ymm3, %ymm10
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4
-; AVX2-NEXT: vmaskmovpd 192(%rsi), %ymm4, %ymm12
; AVX2-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm3
-; AVX2-NEXT: vmovapd 16(%rbp), %ymm6
-; AVX2-NEXT: vblendvpd %ymm4, %ymm12, %ymm7, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
+; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
+; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
+; AVX2-NEXT: vmaskmovpd 192(%rsi), %ymm6, %ymm10
+; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm7, %ymm6
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
; AVX2-NEXT: vmaskmovpd 224(%rsi), %ymm7, %ymm10
-; AVX2-NEXT: vblendvpd %ymm7, %ymm10, %ymm6, %ymm6
+; AVX2-NEXT: vblendvpd %ymm7, %ymm10, %ymm8, %ymm7
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm7
-; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm1, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm8
+; AVX2-NEXT: vblendvpd %ymm0, %ymm8, %ymm1, %ymm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
@@ -728,18 +2177,39 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
; AVX2-NEXT: vmovapd %ymm1, 128(%rdi)
; AVX2-NEXT: vmovapd %ymm0, (%rdi)
-; AVX2-NEXT: vmovapd %ymm6, 224(%rdi)
-; AVX2-NEXT: vmovapd %ymm4, 192(%rdi)
+; AVX2-NEXT: vmovapd %ymm7, 224(%rdi)
+; AVX2-NEXT: vmovapd %ymm6, 192(%rdi)
; AVX2-NEXT: vmovapd %ymm3, 160(%rdi)
-; AVX2-NEXT: vmovapd %ymm11, 96(%rdi)
-; AVX2-NEXT: vmovapd %ymm9, 64(%rdi)
-; AVX2-NEXT: vmovapd %ymm8, 32(%rdi)
+; AVX2-NEXT: vmovapd %ymm4, 96(%rdi)
+; AVX2-NEXT: vmovapd %ymm11, 64(%rdi)
+; AVX2-NEXT: vmovapd %ymm9, 32(%rdi)
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_load_32f64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
+; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
+; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
+; AVX512F-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
+; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k2}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k2, %k1
+; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps %zmm2, %zmm1
+; AVX512F-NEXT: vmovaps %zmm3, %zmm2
+; AVX512F-NEXT: vmovaps %zmm4, %zmm3
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_load_32f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -756,7 +2226,8181 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; SKX-NEXT: vmovaps %zmm3, %zmm2
; SKX-NEXT: vmovaps %zmm4, %zmm3
; SKX-NEXT: retq
- %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
+ %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
ret <32 x double> %res
}
-declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
+
+declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
+
+define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; AVX-LABEL: test_mask_load_16xi8:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: ## implicit-def: %XMM1
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_2
+; AVX-NEXT: ## BB#1: ## %cond.load
+; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: LBB50_2: ## %else
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_4
+; AVX-NEXT: ## BB#3: ## %cond.load1
+; AVX-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_4: ## %else2
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_6
+; AVX-NEXT: ## BB#5: ## %cond.load4
+; AVX-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_6: ## %else5
+; AVX-NEXT: vpextrb $3, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_8
+; AVX-NEXT: ## BB#7: ## %cond.load7
+; AVX-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_8: ## %else8
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_10
+; AVX-NEXT: ## BB#9: ## %cond.load10
+; AVX-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_10: ## %else11
+; AVX-NEXT: vpextrb $5, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_12
+; AVX-NEXT: ## BB#11: ## %cond.load13
+; AVX-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_12: ## %else14
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_14
+; AVX-NEXT: ## BB#13: ## %cond.load16
+; AVX-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_14: ## %else17
+; AVX-NEXT: vpextrb $7, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_16
+; AVX-NEXT: ## BB#15: ## %cond.load19
+; AVX-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_16: ## %else20
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_18
+; AVX-NEXT: ## BB#17: ## %cond.load22
+; AVX-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_18: ## %else23
+; AVX-NEXT: vpextrb $9, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_20
+; AVX-NEXT: ## BB#19: ## %cond.load25
+; AVX-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_20: ## %else26
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_22
+; AVX-NEXT: ## BB#21: ## %cond.load28
+; AVX-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_22: ## %else29
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_24
+; AVX-NEXT: ## BB#23: ## %cond.load31
+; AVX-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_24: ## %else32
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_26
+; AVX-NEXT: ## BB#25: ## %cond.load34
+; AVX-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_26: ## %else35
+; AVX-NEXT: vpextrb $13, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_28
+; AVX-NEXT: ## BB#27: ## %cond.load37
+; AVX-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_28: ## %else38
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_30
+; AVX-NEXT: ## BB#29: ## %cond.load40
+; AVX-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_30: ## %else41
+; AVX-NEXT: vpextrb $15, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_32
+; AVX-NEXT: ## BB#31: ## %cond.load43
+; AVX-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_32: ## %else44
+; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_16xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %XMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB50_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_26: ## %else35
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_28: ## %else38
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_30: ## %else41
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_32: ## %else44
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_16xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+
+define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; AVX1-LABEL: test_mask_load_32xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: ## implicit-def: %YMM1
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: LBB51_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_32: ## %else44
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_34: ## %else47
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_36: ## %else50
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_38: ## %else53
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_40: ## %else56
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_42: ## %else59
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_44: ## %else62
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_46: ## %else65
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_48: ## %else68
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_50: ## %else71
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_52: ## %else74
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_54: ## %else77
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_56: ## %else80
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_58: ## %else83
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_60: ## %else86
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_62: ## %else89
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_64: ## %else92
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_32xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: ## implicit-def: %YMM1
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzbl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: LBB51_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_32: ## %else44
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_34: ## %else47
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_36: ## %else50
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_38: ## %else53
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_40: ## %else56
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_42: ## %else59
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_44: ## %else62
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_46: ## %else65
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_48: ## %else68
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_50: ## %else71
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_52: ## %else74
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_54: ## %else77
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_56: ## %else80
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_58: ## %else83
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_60: ## %else86
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_62: ## %else89
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_64: ## %else92
+; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_32xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: ## implicit-def: %YMM1
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: LBB51_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_6: ## %else5
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_8: ## %else8
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_10: ## %else11
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_12: ## %else14
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_14: ## %else17
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_16: ## %else20
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_18: ## %else23
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_20: ## %else26
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_22: ## %else29
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_24: ## %else32
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_26: ## %else35
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_28: ## %else38
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_30: ## %else41
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_32: ## %else44
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_34: ## %else47
+; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_36: ## %else50
+; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_38: ## %else53
+; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_40: ## %else56
+; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_42: ## %else59
+; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_44: ## %else62
+; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_46: ## %else65
+; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_48: ## %else68
+; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_50: ## %else71
+; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_52: ## %else74
+; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_54: ## %else77
+; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_56: ## %else80
+; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_58: ## %else83
+; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_60: ## %else86
+; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_62: ## %else89
+; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_64: ## %else92
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_32xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
+
+define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
+; AVX1-LABEL: test_mask_load_64xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: Ltmp3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: Ltmp4:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: Ltmp5:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: Ltmp6:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: Ltmp7:
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: Ltmp8:
+; AVX1-NEXT: .cfi_def_cfa_offset 56
+; AVX1-NEXT: pushq %rax
+; AVX1-NEXT: Ltmp9:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: Ltmp10:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: Ltmp11:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: Ltmp12:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: Ltmp13:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: Ltmp14:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: Ltmp15:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl %edi, %r13d
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB52_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzbl (%rax), %ebp
+; AVX1-NEXT: vmovd %ebp, %xmm9
+; AVX1-NEXT: LBB52_2: ## %else
+; AVX1-NEXT: testb $1, %sil
+; AVX1-NEXT: je LBB52_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrb $1, 1(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_4: ## %else2
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB52_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrb $2, 2(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_6: ## %else5
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB52_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrb $3, 3(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_8: ## %else8
+; AVX1-NEXT: testb $1, %r8b
+; AVX1-NEXT: je LBB52_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrb $4, 4(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_10: ## %else11
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; AVX1-NEXT: testb $1, %r9b
+; AVX1-NEXT: je LBB52_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrb $5, 5(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_12: ## %else14
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; AVX1-NEXT: testb $1, %r10b
+; AVX1-NEXT: je LBB52_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrb $6, 6(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_14: ## %else17
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; AVX1-NEXT: testb $1, %r11b
+; AVX1-NEXT: je LBB52_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrb $7, 7(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_16: ## %else20
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; AVX1-NEXT: testb $1, %r14b
+; AVX1-NEXT: je LBB52_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vpinsrb $8, 8(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_18: ## %else23
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; AVX1-NEXT: testb $1, %r15b
+; AVX1-NEXT: je LBB52_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vpinsrb $9, 9(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_20: ## %else26
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dil
+; AVX1-NEXT: testb $1, %r12b
+; AVX1-NEXT: je LBB52_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vpinsrb $10, 10(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_22: ## %else29
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB52_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vpinsrb $11, 11(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_24: ## %else32
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; AVX1-NEXT: testb $1, %bpl
+; AVX1-NEXT: je LBB52_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vpinsrb $12, 12(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_26: ## %else35
+; AVX1-NEXT: testb $1, %bl
+; AVX1-NEXT: je LBB52_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vpinsrb $13, 13(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_28: ## %else38
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vpinsrb $14, 14(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_30: ## %else41
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vpinsrb $15, 15(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_32: ## %else44
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $0, 16(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_34: ## %else47
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $1, 17(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_36: ## %else50
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $2, 18(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_38: ## %else53
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $3, 19(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_40: ## %else56
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $4, 20(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_42: ## %else59
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $5, 21(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_44: ## %else62
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $6, 22(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_46: ## %else65
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $7, 23(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_48: ## %else68
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $8, 24(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_50: ## %else71
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $9, 25(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_52: ## %else74
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $10, 26(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_54: ## %else77
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $11, 27(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_56: ## %else80
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $12, 28(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_58: ## %else83
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $13, 29(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_60: ## %else86
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $14, 30(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_62: ## %else89
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $15, 31(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_64: ## %else92
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_66
+; AVX1-NEXT: ## BB#65: ## %cond.load94
+; AVX1-NEXT: vpinsrb $0, 32(%rax), %xmm0, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: LBB52_66: ## %else95
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_68
+; AVX1-NEXT: ## BB#67: ## %cond.load97
+; AVX1-NEXT: vpinsrb $1, 33(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_68: ## %else98
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_70
+; AVX1-NEXT: ## BB#69: ## %cond.load100
+; AVX1-NEXT: vpinsrb $2, 34(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_70: ## %else101
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_72
+; AVX1-NEXT: ## BB#71: ## %cond.load103
+; AVX1-NEXT: vpinsrb $3, 35(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_72: ## %else104
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_74
+; AVX1-NEXT: ## BB#73: ## %cond.load106
+; AVX1-NEXT: vpinsrb $4, 36(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_74: ## %else107
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_76
+; AVX1-NEXT: ## BB#75: ## %cond.load109
+; AVX1-NEXT: vpinsrb $5, 37(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_76: ## %else110
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_78
+; AVX1-NEXT: ## BB#77: ## %cond.load112
+; AVX1-NEXT: vpinsrb $6, 38(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_78: ## %else113
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_80
+; AVX1-NEXT: ## BB#79: ## %cond.load115
+; AVX1-NEXT: vpinsrb $7, 39(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_80: ## %else116
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_82
+; AVX1-NEXT: ## BB#81: ## %cond.load118
+; AVX1-NEXT: vpinsrb $8, 40(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_82: ## %else119
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_84
+; AVX1-NEXT: ## BB#83: ## %cond.load121
+; AVX1-NEXT: vpinsrb $9, 41(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_84: ## %else122
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_86
+; AVX1-NEXT: ## BB#85: ## %cond.load124
+; AVX1-NEXT: vpinsrb $10, 42(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_86: ## %else125
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_88
+; AVX1-NEXT: ## BB#87: ## %cond.load127
+; AVX1-NEXT: vpinsrb $11, 43(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_88: ## %else128
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_90
+; AVX1-NEXT: ## BB#89: ## %cond.load130
+; AVX1-NEXT: vpinsrb $12, 44(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_90: ## %else131
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_92
+; AVX1-NEXT: ## BB#91: ## %cond.load133
+; AVX1-NEXT: vpinsrb $13, 45(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_92: ## %else134
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_94
+; AVX1-NEXT: ## BB#93: ## %cond.load136
+; AVX1-NEXT: vpinsrb $14, 46(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_94: ## %else137
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_96
+; AVX1-NEXT: ## BB#95: ## %cond.load139
+; AVX1-NEXT: vpinsrb $15, 47(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_96: ## %else140
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_98
+; AVX1-NEXT: ## BB#97: ## %cond.load142
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $0, 48(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_98: ## %else143
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_100
+; AVX1-NEXT: ## BB#99: ## %cond.load145
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $1, 49(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_100: ## %else146
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_102
+; AVX1-NEXT: ## BB#101: ## %cond.load148
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $2, 50(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_102: ## %else149
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_104
+; AVX1-NEXT: ## BB#103: ## %cond.load151
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $3, 51(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_104: ## %else152
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_106
+; AVX1-NEXT: ## BB#105: ## %cond.load154
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $4, 52(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_106: ## %else155
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_108
+; AVX1-NEXT: ## BB#107: ## %cond.load157
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $5, 53(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_108: ## %else158
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_110
+; AVX1-NEXT: ## BB#109: ## %cond.load160
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $6, 54(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_110: ## %else161
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_112
+; AVX1-NEXT: ## BB#111: ## %cond.load163
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $7, 55(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_112: ## %else164
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_114
+; AVX1-NEXT: ## BB#113: ## %cond.load166
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $8, 56(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_114: ## %else167
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_116
+; AVX1-NEXT: ## BB#115: ## %cond.load169
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $9, 57(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_116: ## %else170
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_118
+; AVX1-NEXT: ## BB#117: ## %cond.load172
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $10, 58(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_118: ## %else173
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_120
+; AVX1-NEXT: ## BB#119: ## %cond.load175
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $11, 59(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_120: ## %else176
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_122
+; AVX1-NEXT: ## BB#121: ## %cond.load178
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $12, 60(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_122: ## %else179
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_124
+; AVX1-NEXT: ## BB#123: ## %cond.load181
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $13, 61(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_124: ## %else182
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_126
+; AVX1-NEXT: ## BB#125: ## %cond.load184
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $14, 62(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_126: ## %else185
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl %r9d, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %r8d, (%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: je LBB52_128
+; AVX1-NEXT: ## BB#127: ## %cond.load187
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $15, 63(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_128: ## %else188
+; AVX1-NEXT: movzbl %r10b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r11b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r14b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r15b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r12b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %dil, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %bpl, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %bl, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX1-NEXT: movzbl %r13b, %r13d
+; AVX1-NEXT: vmovd %r13d, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %edi ## 4-byte Reload
+; AVX1-NEXT: movzbl %dil, %ebp
+; AVX1-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl (%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $5, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: ## xmm5 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm5, %xmm8 ## 4-byte Folded Reload
+; AVX1-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm6 ## 4-byte Folded Reload
+; AVX1-NEXT: ## xmm6 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $3, %r15d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $4, %r14d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $5, %r11d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $6, %r8d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $7, %edx, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX1-NEXT: vpinsrb $10, %esi, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX1-NEXT: vpinsrb $11, %r9d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX1-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX1-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: vpinsrb $14, %r13d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX1-NEXT: vpinsrb $15, %r14d, %xmm6, %xmm10
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX1-NEXT: vmovd %edi, %xmm7
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; AVX1-NEXT: vpinsrb $1, %r11d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $2, %r15d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $3, %r12d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $4, %r8d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $6, %r9d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $7, %esi, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $10, %r13d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $11, %edx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $12, %r14d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $13, %ebx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $14, %edi, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $15, %ebp, %xmm7, %xmm7
+; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $7, %xmm8, %xmm6
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT: vandnps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vandps %ymm4, %ymm9, %ymm4
+; AVX1-NEXT: vorps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vpsllw $7, %xmm10, %xmm4
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $7, %xmm7, %xmm6
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm2
+; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: addq $8, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_64xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: Ltmp3:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: Ltmp4:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: Ltmp5:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: Ltmp6:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: Ltmp7:
+; AVX2-NEXT: .cfi_def_cfa_offset 48
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: Ltmp8:
+; AVX2-NEXT: .cfi_def_cfa_offset 56
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: Ltmp9:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: Ltmp10:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: Ltmp11:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: Ltmp12:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: Ltmp13:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: Ltmp14:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: Ltmp15:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: je LBB52_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzbl (%rax), %ebp
+; AVX2-NEXT: vmovd %ebp, %xmm2
+; AVX2-NEXT: LBB52_2: ## %else
+; AVX2-NEXT: testb $1, %sil
+; AVX2-NEXT: je LBB52_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrb $1, 1(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_4: ## %else2
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB52_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrb $2, 2(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_6: ## %else5
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB52_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrb $3, 3(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_8: ## %else8
+; AVX2-NEXT: testb $1, %r8b
+; AVX2-NEXT: je LBB52_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrb $4, 4(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_10: ## %else11
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; AVX2-NEXT: testb $1, %r9b
+; AVX2-NEXT: je LBB52_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrb $5, 5(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_12: ## %else14
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; AVX2-NEXT: testb $1, %r10b
+; AVX2-NEXT: je LBB52_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrb $6, 6(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_14: ## %else17
+; AVX2-NEXT: testb $1, %r11b
+; AVX2-NEXT: je LBB52_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrb $7, 7(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_16: ## %else20
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vpinsrb $8, 8(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_18: ## %else23
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vpinsrb $9, 9(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_20: ## %else26
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vpinsrb $10, 10(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_22: ## %else29
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vpinsrb $11, 11(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_24: ## %else32
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; AVX2-NEXT: testb $1, %bpl
+; AVX2-NEXT: je LBB52_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vpinsrb $12, 12(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_26: ## %else35
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; AVX2-NEXT: testb $1, %bl
+; AVX2-NEXT: je LBB52_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vpinsrb $13, 13(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_28: ## %else38
+; AVX2-NEXT: testb $1, %r14b
+; AVX2-NEXT: je LBB52_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vpinsrb $14, 14(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_30: ## %else41
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r13b
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vpinsrb $15, 15(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_32: ## %else44
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; AVX2-NEXT: testb $1, %r13b
+; AVX2-NEXT: je LBB52_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $0, 16(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_34: ## %else47
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; AVX2-NEXT: testb $1, %r12b
+; AVX2-NEXT: je LBB52_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $1, 17(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_36: ## %else50
+; AVX2-NEXT: testb $1, %r15b
+; AVX2-NEXT: je LBB52_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $2, 18(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_38: ## %else53
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $3, 19(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_40: ## %else56
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $4, 20(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_42: ## %else59
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $5, 21(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_44: ## %else62
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $6, 22(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_46: ## %else65
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $7, 23(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_48: ## %else68
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $8, 24(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_50: ## %else71
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $9, 25(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_52: ## %else74
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $10, 26(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_54: ## %else77
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $11, 27(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_56: ## %else80
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $12, 28(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_58: ## %else83
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $13, 29(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_60: ## %else86
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $14, 30(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_62: ## %else89
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $15, 31(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_64: ## %else92
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_66
+; AVX2-NEXT: ## BB#65: ## %cond.load94
+; AVX2-NEXT: vpinsrb $0, 32(%rax), %xmm0, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: LBB52_66: ## %else95
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_68
+; AVX2-NEXT: ## BB#67: ## %cond.load97
+; AVX2-NEXT: vpinsrb $1, 33(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_68: ## %else98
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_70
+; AVX2-NEXT: ## BB#69: ## %cond.load100
+; AVX2-NEXT: vpinsrb $2, 34(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_70: ## %else101
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_72
+; AVX2-NEXT: ## BB#71: ## %cond.load103
+; AVX2-NEXT: vpinsrb $3, 35(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_72: ## %else104
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_74
+; AVX2-NEXT: ## BB#73: ## %cond.load106
+; AVX2-NEXT: vpinsrb $4, 36(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_74: ## %else107
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_76
+; AVX2-NEXT: ## BB#75: ## %cond.load109
+; AVX2-NEXT: vpinsrb $5, 37(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_76: ## %else110
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_78
+; AVX2-NEXT: ## BB#77: ## %cond.load112
+; AVX2-NEXT: vpinsrb $6, 38(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_78: ## %else113
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_80
+; AVX2-NEXT: ## BB#79: ## %cond.load115
+; AVX2-NEXT: vpinsrb $7, 39(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_80: ## %else116
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_82
+; AVX2-NEXT: ## BB#81: ## %cond.load118
+; AVX2-NEXT: vpinsrb $8, 40(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_82: ## %else119
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_84
+; AVX2-NEXT: ## BB#83: ## %cond.load121
+; AVX2-NEXT: vpinsrb $9, 41(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_84: ## %else122
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_86
+; AVX2-NEXT: ## BB#85: ## %cond.load124
+; AVX2-NEXT: vpinsrb $10, 42(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_86: ## %else125
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_88
+; AVX2-NEXT: ## BB#87: ## %cond.load127
+; AVX2-NEXT: vpinsrb $11, 43(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_88: ## %else128
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_90
+; AVX2-NEXT: ## BB#89: ## %cond.load130
+; AVX2-NEXT: vpinsrb $12, 44(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_90: ## %else131
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_92
+; AVX2-NEXT: ## BB#91: ## %cond.load133
+; AVX2-NEXT: vpinsrb $13, 45(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_92: ## %else134
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_94
+; AVX2-NEXT: ## BB#93: ## %cond.load136
+; AVX2-NEXT: vpinsrb $14, 46(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_94: ## %else137
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_96
+; AVX2-NEXT: ## BB#95: ## %cond.load139
+; AVX2-NEXT: vpinsrb $15, 47(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_96: ## %else140
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_98
+; AVX2-NEXT: ## BB#97: ## %cond.load142
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $0, 48(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_98: ## %else143
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_100
+; AVX2-NEXT: ## BB#99: ## %cond.load145
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $1, 49(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_100: ## %else146
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_102
+; AVX2-NEXT: ## BB#101: ## %cond.load148
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $2, 50(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_102: ## %else149
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_104
+; AVX2-NEXT: ## BB#103: ## %cond.load151
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $3, 51(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_104: ## %else152
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_106
+; AVX2-NEXT: ## BB#105: ## %cond.load154
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $4, 52(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_106: ## %else155
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_108
+; AVX2-NEXT: ## BB#107: ## %cond.load157
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $5, 53(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_108: ## %else158
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_110
+; AVX2-NEXT: ## BB#109: ## %cond.load160
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $6, 54(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_110: ## %else161
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_112
+; AVX2-NEXT: ## BB#111: ## %cond.load163
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $7, 55(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_112: ## %else164
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_114
+; AVX2-NEXT: ## BB#113: ## %cond.load166
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $8, 56(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_114: ## %else167
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_116
+; AVX2-NEXT: ## BB#115: ## %cond.load169
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $9, 57(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_116: ## %else170
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_118
+; AVX2-NEXT: ## BB#117: ## %cond.load172
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $10, 58(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_118: ## %else173
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_120
+; AVX2-NEXT: ## BB#119: ## %cond.load175
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $11, 59(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_120: ## %else176
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_122
+; AVX2-NEXT: ## BB#121: ## %cond.load178
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $12, 60(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_122: ## %else179
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_124
+; AVX2-NEXT: ## BB#123: ## %cond.load181
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $13, 61(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_124: ## %else182
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: jne LBB52_126
+; AVX2-NEXT: ## BB#125:
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: jmp LBB52_127
+; AVX2-NEXT: LBB52_126: ## %cond.load184
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: vpinsrb $14, 62(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_127: ## %else185
+; AVX2-NEXT: movl %ebp, %eax
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %r9d, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %r8d, (%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %esi, %ebp
+; AVX2-NEXT: je LBB52_129
+; AVX2-NEXT: ## BB#128: ## %cond.load187
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $15, 63(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_129: ## %else188
+; AVX2-NEXT: movzbl %r10b, %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r11b, %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %bl, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r14b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r12b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r13b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r15b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %edi ## 4-byte Reload
+; AVX2-NEXT: movzbl %dil, %r13d
+; AVX2-NEXT: vmovd %r13d, %xmm4
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl (%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $5, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: ## xmm5 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vmovd %r12d, %xmm6
+; AVX2-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $2, %r15d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $3, %r14d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $4, %ebx, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $5, %r11d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $6, %r9d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $7, %esi, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: vpinsrb $10, %edx, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX2-NEXT: vpinsrb $11, %r8d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX2-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX2-NEXT: vmovd %r12d, %xmm7
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX2-NEXT: vpinsrb $1, %r9d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $2, %r11d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $3, %r14d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $6, %r8d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $7, %ebx, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $9, %ebp, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $11, %edi, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $12, %r15d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $13, %esi, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm7, %xmm7
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX2-NEXT: vpsllw $7, %ymm4, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm2
+; AVX2-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_64xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: Ltmp0:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: pushq %r15
+; AVX512F-NEXT: Ltmp1:
+; AVX512F-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: Ltmp2:
+; AVX512F-NEXT: .cfi_def_cfa_offset 32
+; AVX512F-NEXT: pushq %r13
+; AVX512F-NEXT: Ltmp3:
+; AVX512F-NEXT: .cfi_def_cfa_offset 40
+; AVX512F-NEXT: pushq %r12
+; AVX512F-NEXT: Ltmp4:
+; AVX512F-NEXT: .cfi_def_cfa_offset 48
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: Ltmp5:
+; AVX512F-NEXT: .cfi_def_cfa_offset 56
+; AVX512F-NEXT: subq $76, %rsp
+; AVX512F-NEXT: Ltmp6:
+; AVX512F-NEXT: .cfi_def_cfa_offset 132
+; AVX512F-NEXT: Ltmp7:
+; AVX512F-NEXT: .cfi_offset %rbx, -56
+; AVX512F-NEXT: Ltmp8:
+; AVX512F-NEXT: .cfi_offset %r12, -48
+; AVX512F-NEXT: Ltmp9:
+; AVX512F-NEXT: .cfi_offset %r13, -40
+; AVX512F-NEXT: Ltmp10:
+; AVX512F-NEXT: .cfi_offset %r14, -32
+; AVX512F-NEXT: Ltmp11:
+; AVX512F-NEXT: .cfi_offset %r15, -24
+; AVX512F-NEXT: Ltmp12:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB52_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, (%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_26: ## %else35
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_28: ## %else38
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_30: ## %else41
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_32: ## %else44
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_34: ## %else47
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_36: ## %else50
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_38: ## %else53
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_40: ## %else56
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_42: ## %else59
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_44: ## %else62
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_46: ## %else65
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_48: ## %else68
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_50: ## %else71
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_52: ## %else74
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_54: ## %else77
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_56: ## %else80
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_58: ## %else83
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm1
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_60: ## %else86
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_62: ## %else89
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_64: ## %else92
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_66
+; AVX512F-NEXT: ## BB#65: ## %cond.load94
+; AVX512F-NEXT: vpinsrb $0, 32(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_66: ## %else95
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_68
+; AVX512F-NEXT: ## BB#67: ## %cond.load97
+; AVX512F-NEXT: vpinsrb $1, 33(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_68: ## %else98
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_70
+; AVX512F-NEXT: ## BB#69: ## %cond.load100
+; AVX512F-NEXT: vpinsrb $2, 34(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_70: ## %else101
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_72
+; AVX512F-NEXT: ## BB#71: ## %cond.load103
+; AVX512F-NEXT: vpinsrb $3, 35(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_72: ## %else104
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_74
+; AVX512F-NEXT: ## BB#73: ## %cond.load106
+; AVX512F-NEXT: vpinsrb $4, 36(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_74: ## %else107
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_76
+; AVX512F-NEXT: ## BB#75: ## %cond.load109
+; AVX512F-NEXT: vpinsrb $5, 37(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_76: ## %else110
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_78
+; AVX512F-NEXT: ## BB#77: ## %cond.load112
+; AVX512F-NEXT: vpinsrb $6, 38(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_78: ## %else113
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_80
+; AVX512F-NEXT: ## BB#79: ## %cond.load115
+; AVX512F-NEXT: vpinsrb $7, 39(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_80: ## %else116
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_82
+; AVX512F-NEXT: ## BB#81: ## %cond.load118
+; AVX512F-NEXT: vpinsrb $8, 40(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_82: ## %else119
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_84
+; AVX512F-NEXT: ## BB#83: ## %cond.load121
+; AVX512F-NEXT: vpinsrb $9, 41(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_84: ## %else122
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_86
+; AVX512F-NEXT: ## BB#85: ## %cond.load124
+; AVX512F-NEXT: vpinsrb $10, 42(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_86: ## %else125
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_88
+; AVX512F-NEXT: ## BB#87: ## %cond.load127
+; AVX512F-NEXT: vpinsrb $11, 43(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_88: ## %else128
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_90
+; AVX512F-NEXT: ## BB#89: ## %cond.load130
+; AVX512F-NEXT: vpinsrb $12, 44(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_90: ## %else131
+; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm2
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_92
+; AVX512F-NEXT: ## BB#91: ## %cond.load133
+; AVX512F-NEXT: vpinsrb $13, 45(%rdi), %xmm1, %xmm3
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_92: ## %else134
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_94
+; AVX512F-NEXT: ## BB#93: ## %cond.load136
+; AVX512F-NEXT: vpinsrb $14, 46(%rdi), %xmm1, %xmm3
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_94: ## %else137
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_96
+; AVX512F-NEXT: ## BB#95: ## %cond.load139
+; AVX512F-NEXT: vpinsrb $15, 47(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_96: ## %else140
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_98
+; AVX512F-NEXT: ## BB#97: ## %cond.load142
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $0, 48(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_98: ## %else143
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_100
+; AVX512F-NEXT: ## BB#99: ## %cond.load145
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $1, 49(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_100: ## %else146
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_102
+; AVX512F-NEXT: ## BB#101: ## %cond.load148
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $2, 50(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_102: ## %else149
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_104
+; AVX512F-NEXT: ## BB#103: ## %cond.load151
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $3, 51(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_104: ## %else152
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_106
+; AVX512F-NEXT: ## BB#105: ## %cond.load154
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $4, 52(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_106: ## %else155
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_108
+; AVX512F-NEXT: ## BB#107: ## %cond.load157
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $5, 53(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_108: ## %else158
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_110
+; AVX512F-NEXT: ## BB#109: ## %cond.load160
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $6, 54(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_110: ## %else161
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_112
+; AVX512F-NEXT: ## BB#111: ## %cond.load163
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $7, 55(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_112: ## %else164
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_114
+; AVX512F-NEXT: ## BB#113: ## %cond.load166
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $8, 56(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_114: ## %else167
+; AVX512F-NEXT: kshiftlw $6, %k1, %k2
+; AVX512F-NEXT: kshiftrw $15, %k2, %k2
+; AVX512F-NEXT: kmovw %k2, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_116
+; AVX512F-NEXT: ## BB#115: ## %cond.load169
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $9, 57(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_116: ## %else170
+; AVX512F-NEXT: kshiftlw $5, %k1, %k3
+; AVX512F-NEXT: kshiftrw $15, %k3, %k3
+; AVX512F-NEXT: kmovw %k3, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_118
+; AVX512F-NEXT: ## BB#117: ## %cond.load172
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $10, 58(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_118: ## %else173
+; AVX512F-NEXT: kshiftlw $4, %k1, %k4
+; AVX512F-NEXT: kshiftrw $15, %k4, %k4
+; AVX512F-NEXT: kmovw %k4, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_120
+; AVX512F-NEXT: ## BB#119: ## %cond.load175
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $11, 59(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_120: ## %else176
+; AVX512F-NEXT: kshiftlw $3, %k1, %k5
+; AVX512F-NEXT: kshiftrw $15, %k5, %k5
+; AVX512F-NEXT: kmovw %k5, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_122
+; AVX512F-NEXT: ## BB#121: ## %cond.load178
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $12, 60(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_122: ## %else179
+; AVX512F-NEXT: kshiftlw $2, %k1, %k6
+; AVX512F-NEXT: kshiftrw $15, %k6, %k6
+; AVX512F-NEXT: kmovw %k6, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_124
+; AVX512F-NEXT: ## BB#123: ## %cond.load181
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $13, 61(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_124: ## %else182
+; AVX512F-NEXT: kshiftlw $1, %k1, %k7
+; AVX512F-NEXT: kshiftrw $15, %k7, %k7
+; AVX512F-NEXT: kmovw %k7, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_126
+; AVX512F-NEXT: ## BB#125: ## %cond.load184
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $14, 62(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_126: ## %else185
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_128
+; AVX512F-NEXT: ## BB#127: ## %cond.load187
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $15, 63(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_128: ## %else188
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw (%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, (%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw %k2, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw %k3, %r12d
+; AVX512F-NEXT: kmovw %k4, %r15d
+; AVX512F-NEXT: kmovw %k5, %r14d
+; AVX512F-NEXT: kmovw %k6, %ebx
+; AVX512F-NEXT: kmovw %k7, %r11d
+; AVX512F-NEXT: kmovw %k1, %r10d
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r8d
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r9d
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %edi
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %esi
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl -{{[0-9]+}}(%rsp), %r13d ## 4-byte Reload
+; AVX512F-NEXT: vmovd %r13d, %xmm2
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX512F-NEXT: vmovd %ebp, %xmm3
+; AVX512F-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, (%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX512F-NEXT: vmovd %ebp, %xmm6
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r13d
+; AVX512F-NEXT: vpinsrb $10, %r12d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r12d
+; AVX512F-NEXT: vpinsrb $11, %r15d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r15d
+; AVX512F-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r14d
+; AVX512F-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %ebx
+; AVX512F-NEXT: vpinsrb $14, %r11d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r11d
+; AVX512F-NEXT: vpinsrb $15, %r10d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r10d
+; AVX512F-NEXT: vmovd %r8d, %xmm7
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r8d
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm7, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, %r9d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $3, %edi, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $8, %r13d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $9, %r12d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $10, %r15d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $11, %r14d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $12, %ebx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $13, %r11d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $14, %r10d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $15, %r8d, %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: addq $76, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r12
+; AVX512F-NEXT: popq %r13
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_64xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k1
+; SKX-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
+
+define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; AVX-LABEL: test_mask_load_8xi16:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: ## implicit-def: %XMM1
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_2
+; AVX-NEXT: ## BB#1: ## %cond.load
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: LBB53_2: ## %else
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_4
+; AVX-NEXT: ## BB#3: ## %cond.load1
+; AVX-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_4: ## %else2
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_6
+; AVX-NEXT: ## BB#5: ## %cond.load4
+; AVX-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_6: ## %else5
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_8
+; AVX-NEXT: ## BB#7: ## %cond.load7
+; AVX-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_8: ## %else8
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_10
+; AVX-NEXT: ## BB#9: ## %cond.load10
+; AVX-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_10: ## %else11
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_12
+; AVX-NEXT: ## BB#11: ## %cond.load13
+; AVX-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_12: ## %else14
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_14
+; AVX-NEXT: ## BB#13: ## %cond.load16
+; AVX-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_14: ## %else17
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_16
+; AVX-NEXT: ## BB#15: ## %cond.load19
+; AVX-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_16: ## %else20
+; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_8xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %XMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB53_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_16: ## %else20
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_8xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+
+define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; AVX1-LABEL: test_mask_load_16xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: ## implicit-def: %YMM1
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: LBB54_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_32: ## %else44
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_16xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: ## implicit-def: %YMM1
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: LBB54_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_32: ## %else44
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_16xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %YMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB54_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_26: ## %else35
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_28: ## %else38
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_30: ## %else41
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_32: ## %else44
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_16xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
+
+define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
+; AVX1-LABEL: test_mask_load_32xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: LBB55_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_32: ## %else44
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: LBB55_34: ## %else47
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_36: ## %else50
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_38: ## %else53
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_40: ## %else56
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_42: ## %else59
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_44: ## %else62
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_46: ## %else65
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_48: ## %else68
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_50: ## %else71
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_52: ## %else74
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_54: ## %else77
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_56: ## %else80
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_58: ## %else83
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_60: ## %else86
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_62: ## %else89
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_64: ## %else92
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm6, %xmm6
+; AVX1-NEXT: vpsraw $15, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm1
+; AVX1-NEXT: vandps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $15, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandnps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vandps %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_32xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: LBB55_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_32: ## %else44
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpextrb $0, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: LBB55_34: ## %else47
+; AVX2-NEXT: vpextrb $1, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_36: ## %else50
+; AVX2-NEXT: vpextrb $2, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_38: ## %else53
+; AVX2-NEXT: vpextrb $3, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_40: ## %else56
+; AVX2-NEXT: vpextrb $4, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_42: ## %else59
+; AVX2-NEXT: vpextrb $5, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_44: ## %else62
+; AVX2-NEXT: vpextrb $6, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_46: ## %else65
+; AVX2-NEXT: vpextrb $7, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_48: ## %else68
+; AVX2-NEXT: vpextrb $8, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_50: ## %else71
+; AVX2-NEXT: vpextrb $9, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_52: ## %else74
+; AVX2-NEXT: vpextrb $10, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_54: ## %else77
+; AVX2-NEXT: vpextrb $11, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_56: ## %else80
+; AVX2-NEXT: vpextrb $12, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_58: ## %else83
+; AVX2-NEXT: vpextrb $13, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_60: ## %else86
+; AVX2-NEXT: vpextrb $14, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_62: ## %else89
+; AVX2-NEXT: vpextrb $15, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_64: ## %else92
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_32xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: LBB55_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_6: ## %else5
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_8: ## %else8
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_10: ## %else11
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_12: ## %else14
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_14: ## %else17
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_16: ## %else20
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_18: ## %else23
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_20: ## %else26
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_22: ## %else29
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_24: ## %else32
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_26: ## %else35
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_28: ## %else38
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_30: ## %else41
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_32: ## %else44
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpextrb $0, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB55_34: ## %else47
+; AVX512F-NEXT: vpextrb $1, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_36: ## %else50
+; AVX512F-NEXT: vpextrb $2, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_38: ## %else53
+; AVX512F-NEXT: vpextrb $3, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_40: ## %else56
+; AVX512F-NEXT: vpextrb $4, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_42: ## %else59
+; AVX512F-NEXT: vpextrb $5, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_44: ## %else62
+; AVX512F-NEXT: vpextrb $6, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_46: ## %else65
+; AVX512F-NEXT: vpextrb $7, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_48: ## %else68
+; AVX512F-NEXT: vpextrb $8, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_50: ## %else71
+; AVX512F-NEXT: vpextrb $9, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_52: ## %else74
+; AVX512F-NEXT: vpextrb $10, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_54: ## %else77
+; AVX512F-NEXT: vpextrb $11, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_56: ## %else80
+; AVX512F-NEXT: vpextrb $12, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_58: ## %else83
+; AVX512F-NEXT: vpextrb $13, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_60: ## %else86
+; AVX512F-NEXT: vpextrb $14, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_62: ## %else89
+; AVX512F-NEXT: vpextrb $15, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_64: ## %else92
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_32xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
+ ret <32 x i16> %res
+}
+declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
+
+define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; AVX-LABEL: test_mask_store_16xi8:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_2
+; AVX-NEXT: ## BB#1: ## %cond.store
+; AVX-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX-NEXT: LBB56_2: ## %else
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_4
+; AVX-NEXT: ## BB#3: ## %cond.store1
+; AVX-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX-NEXT: LBB56_4: ## %else2
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_6
+; AVX-NEXT: ## BB#5: ## %cond.store3
+; AVX-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX-NEXT: LBB56_6: ## %else4
+; AVX-NEXT: vpextrb $3, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_8
+; AVX-NEXT: ## BB#7: ## %cond.store5
+; AVX-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX-NEXT: LBB56_8: ## %else6
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_10
+; AVX-NEXT: ## BB#9: ## %cond.store7
+; AVX-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX-NEXT: LBB56_10: ## %else8
+; AVX-NEXT: vpextrb $5, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_12
+; AVX-NEXT: ## BB#11: ## %cond.store9
+; AVX-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX-NEXT: LBB56_12: ## %else10
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_14
+; AVX-NEXT: ## BB#13: ## %cond.store11
+; AVX-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX-NEXT: LBB56_14: ## %else12
+; AVX-NEXT: vpextrb $7, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_16
+; AVX-NEXT: ## BB#15: ## %cond.store13
+; AVX-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX-NEXT: LBB56_16: ## %else14
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_18
+; AVX-NEXT: ## BB#17: ## %cond.store15
+; AVX-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX-NEXT: LBB56_18: ## %else16
+; AVX-NEXT: vpextrb $9, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_20
+; AVX-NEXT: ## BB#19: ## %cond.store17
+; AVX-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX-NEXT: LBB56_20: ## %else18
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_22
+; AVX-NEXT: ## BB#21: ## %cond.store19
+; AVX-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX-NEXT: LBB56_22: ## %else20
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_24
+; AVX-NEXT: ## BB#23: ## %cond.store21
+; AVX-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX-NEXT: LBB56_24: ## %else22
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_26
+; AVX-NEXT: ## BB#25: ## %cond.store23
+; AVX-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX-NEXT: LBB56_26: ## %else24
+; AVX-NEXT: vpextrb $13, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_28
+; AVX-NEXT: ## BB#27: ## %cond.store25
+; AVX-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX-NEXT: LBB56_28: ## %else26
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_30
+; AVX-NEXT: ## BB#29: ## %cond.store27
+; AVX-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX-NEXT: LBB56_30: ## %else28
+; AVX-NEXT: vpextrb $15, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_32
+; AVX-NEXT: ## BB#31: ## %cond.store29
+; AVX-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX-NEXT: LBB56_32: ## %else30
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_16xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX512F-NEXT: LBB56_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX512F-NEXT: LBB56_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB56_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX512F-NEXT: LBB56_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB56_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX512F-NEXT: LBB56_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB56_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX512F-NEXT: LBB56_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB56_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX512F-NEXT: LBB56_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB56_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX512F-NEXT: LBB56_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB56_26: ## %else24
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX512F-NEXT: LBB56_28: ## %else26
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB56_30: ## %else28
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX512F-NEXT: LBB56_32: ## %else30
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_16xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu8 %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+
+define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; AVX1-LABEL: test_mask_store_32xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX1-NEXT: LBB57_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX1-NEXT: LBB57_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB57_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX1-NEXT: LBB57_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB57_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX1-NEXT: LBB57_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB57_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX1-NEXT: LBB57_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB57_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX1-NEXT: LBB57_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB57_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX1-NEXT: LBB57_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB57_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX1-NEXT: LBB57_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB57_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX1-NEXT: LBB57_32: ## %else30
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX1-NEXT: LBB57_34: ## %else32
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX1-NEXT: LBB57_36: ## %else34
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX1-NEXT: LBB57_38: ## %else36
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX1-NEXT: LBB57_40: ## %else38
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX1-NEXT: LBB57_42: ## %else40
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX1-NEXT: LBB57_44: ## %else42
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX1-NEXT: LBB57_46: ## %else44
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX1-NEXT: LBB57_48: ## %else46
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX1-NEXT: LBB57_50: ## %else48
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX1-NEXT: LBB57_52: ## %else50
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX1-NEXT: LBB57_54: ## %else52
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX1-NEXT: LBB57_56: ## %else54
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX1-NEXT: LBB57_58: ## %else56
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX1-NEXT: LBB57_60: ## %else58
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX1-NEXT: LBB57_62: ## %else60
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX1-NEXT: LBB57_64: ## %else62
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_32xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX2-NEXT: LBB57_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX2-NEXT: LBB57_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB57_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX2-NEXT: LBB57_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB57_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX2-NEXT: LBB57_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB57_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX2-NEXT: LBB57_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB57_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX2-NEXT: LBB57_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB57_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX2-NEXT: LBB57_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB57_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX2-NEXT: LBB57_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB57_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX2-NEXT: LBB57_32: ## %else30
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX2-NEXT: LBB57_34: ## %else32
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX2-NEXT: LBB57_36: ## %else34
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX2-NEXT: LBB57_38: ## %else36
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX2-NEXT: LBB57_40: ## %else38
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX2-NEXT: LBB57_42: ## %else40
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX2-NEXT: LBB57_44: ## %else42
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX2-NEXT: LBB57_46: ## %else44
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX2-NEXT: LBB57_48: ## %else46
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX2-NEXT: LBB57_50: ## %else48
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX2-NEXT: LBB57_52: ## %else50
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX2-NEXT: LBB57_54: ## %else52
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX2-NEXT: LBB57_56: ## %else54
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX2-NEXT: LBB57_58: ## %else56
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX2-NEXT: LBB57_60: ## %else58
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX2-NEXT: LBB57_62: ## %else60
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX2-NEXT: LBB57_64: ## %else62
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_32xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX512F-NEXT: LBB57_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX512F-NEXT: LBB57_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB57_6: ## %else4
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX512F-NEXT: LBB57_8: ## %else6
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB57_10: ## %else8
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX512F-NEXT: LBB57_12: ## %else10
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB57_14: ## %else12
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX512F-NEXT: LBB57_16: ## %else14
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB57_18: ## %else16
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX512F-NEXT: LBB57_20: ## %else18
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB57_22: ## %else20
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX512F-NEXT: LBB57_24: ## %else22
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB57_26: ## %else24
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX512F-NEXT: LBB57_28: ## %else26
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB57_30: ## %else28
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX512F-NEXT: LBB57_32: ## %else30
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX512F-NEXT: LBB57_34: ## %else32
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX512F-NEXT: LBB57_36: ## %else34
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX512F-NEXT: LBB57_38: ## %else36
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX512F-NEXT: LBB57_40: ## %else38
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX512F-NEXT: LBB57_42: ## %else40
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX512F-NEXT: LBB57_44: ## %else42
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX512F-NEXT: LBB57_46: ## %else44
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX512F-NEXT: LBB57_48: ## %else46
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX512F-NEXT: LBB57_50: ## %else48
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX512F-NEXT: LBB57_52: ## %else50
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX512F-NEXT: LBB57_54: ## %else52
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX512F-NEXT: LBB57_56: ## %else54
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX512F-NEXT: LBB57_58: ## %else56
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX512F-NEXT: LBB57_60: ## %else58
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX512F-NEXT: LBB57_62: ## %else60
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX512F-NEXT: LBB57_64: ## %else62
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_32xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v32i8.p0v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
+
+define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
+; AVX1-LABEL: test_mask_store_64xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB58_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm0, (%rax)
+; AVX1-NEXT: LBB58_2: ## %else
+; AVX1-NEXT: testb $1, %sil
+; AVX1-NEXT: je LBB58_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rax)
+; AVX1-NEXT: LBB58_4: ## %else2
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rax)
+; AVX1-NEXT: LBB58_6: ## %else4
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rax)
+; AVX1-NEXT: LBB58_8: ## %else6
+; AVX1-NEXT: testb $1, %r8b
+; AVX1-NEXT: je LBB58_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rax)
+; AVX1-NEXT: LBB58_10: ## %else8
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %r9b
+; AVX1-NEXT: je LBB58_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rax)
+; AVX1-NEXT: LBB58_12: ## %else10
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rax)
+; AVX1-NEXT: LBB58_14: ## %else12
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rax)
+; AVX1-NEXT: LBB58_16: ## %else14
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rax)
+; AVX1-NEXT: LBB58_18: ## %else16
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rax)
+; AVX1-NEXT: LBB58_20: ## %else18
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rax)
+; AVX1-NEXT: LBB58_22: ## %else20
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rax)
+; AVX1-NEXT: LBB58_24: ## %else22
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rax)
+; AVX1-NEXT: LBB58_26: ## %else24
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rax)
+; AVX1-NEXT: LBB58_28: ## %else26
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rax)
+; AVX1-NEXT: LBB58_30: ## %else28
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rax)
+; AVX1-NEXT: LBB58_32: ## %else30
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, 16(%rax)
+; AVX1-NEXT: LBB58_34: ## %else32
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $1, %xmm2, 17(%rax)
+; AVX1-NEXT: LBB58_36: ## %else34
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $2, %xmm2, 18(%rax)
+; AVX1-NEXT: LBB58_38: ## %else36
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $3, %xmm2, 19(%rax)
+; AVX1-NEXT: LBB58_40: ## %else38
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $4, %xmm2, 20(%rax)
+; AVX1-NEXT: LBB58_42: ## %else40
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $5, %xmm2, 21(%rax)
+; AVX1-NEXT: LBB58_44: ## %else42
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $6, %xmm2, 22(%rax)
+; AVX1-NEXT: LBB58_46: ## %else44
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $7, %xmm2, 23(%rax)
+; AVX1-NEXT: LBB58_48: ## %else46
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $8, %xmm2, 24(%rax)
+; AVX1-NEXT: LBB58_50: ## %else48
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $9, %xmm2, 25(%rax)
+; AVX1-NEXT: LBB58_52: ## %else50
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $10, %xmm2, 26(%rax)
+; AVX1-NEXT: LBB58_54: ## %else52
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $11, %xmm2, 27(%rax)
+; AVX1-NEXT: LBB58_56: ## %else54
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $12, %xmm2, 28(%rax)
+; AVX1-NEXT: LBB58_58: ## %else56
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $13, %xmm2, 29(%rax)
+; AVX1-NEXT: LBB58_60: ## %else58
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $14, %xmm2, 30(%rax)
+; AVX1-NEXT: LBB58_62: ## %else60
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rax)
+; AVX1-NEXT: LBB58_64: ## %else62
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_66
+; AVX1-NEXT: ## BB#65: ## %cond.store63
+; AVX1-NEXT: vpextrb $0, %xmm1, 32(%rax)
+; AVX1-NEXT: LBB58_66: ## %else64
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_68
+; AVX1-NEXT: ## BB#67: ## %cond.store65
+; AVX1-NEXT: vpextrb $1, %xmm1, 33(%rax)
+; AVX1-NEXT: LBB58_68: ## %else66
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_70
+; AVX1-NEXT: ## BB#69: ## %cond.store67
+; AVX1-NEXT: vpextrb $2, %xmm1, 34(%rax)
+; AVX1-NEXT: LBB58_70: ## %else68
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_72
+; AVX1-NEXT: ## BB#71: ## %cond.store69
+; AVX1-NEXT: vpextrb $3, %xmm1, 35(%rax)
+; AVX1-NEXT: LBB58_72: ## %else70
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_74
+; AVX1-NEXT: ## BB#73: ## %cond.store71
+; AVX1-NEXT: vpextrb $4, %xmm1, 36(%rax)
+; AVX1-NEXT: LBB58_74: ## %else72
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_76
+; AVX1-NEXT: ## BB#75: ## %cond.store73
+; AVX1-NEXT: vpextrb $5, %xmm1, 37(%rax)
+; AVX1-NEXT: LBB58_76: ## %else74
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_78
+; AVX1-NEXT: ## BB#77: ## %cond.store75
+; AVX1-NEXT: vpextrb $6, %xmm1, 38(%rax)
+; AVX1-NEXT: LBB58_78: ## %else76
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_80
+; AVX1-NEXT: ## BB#79: ## %cond.store77
+; AVX1-NEXT: vpextrb $7, %xmm1, 39(%rax)
+; AVX1-NEXT: LBB58_80: ## %else78
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_82
+; AVX1-NEXT: ## BB#81: ## %cond.store79
+; AVX1-NEXT: vpextrb $8, %xmm1, 40(%rax)
+; AVX1-NEXT: LBB58_82: ## %else80
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_84
+; AVX1-NEXT: ## BB#83: ## %cond.store81
+; AVX1-NEXT: vpextrb $9, %xmm1, 41(%rax)
+; AVX1-NEXT: LBB58_84: ## %else82
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_86
+; AVX1-NEXT: ## BB#85: ## %cond.store83
+; AVX1-NEXT: vpextrb $10, %xmm1, 42(%rax)
+; AVX1-NEXT: LBB58_86: ## %else84
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_88
+; AVX1-NEXT: ## BB#87: ## %cond.store85
+; AVX1-NEXT: vpextrb $11, %xmm1, 43(%rax)
+; AVX1-NEXT: LBB58_88: ## %else86
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_90
+; AVX1-NEXT: ## BB#89: ## %cond.store87
+; AVX1-NEXT: vpextrb $12, %xmm1, 44(%rax)
+; AVX1-NEXT: LBB58_90: ## %else88
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_92
+; AVX1-NEXT: ## BB#91: ## %cond.store89
+; AVX1-NEXT: vpextrb $13, %xmm1, 45(%rax)
+; AVX1-NEXT: LBB58_92: ## %else90
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_94
+; AVX1-NEXT: ## BB#93: ## %cond.store91
+; AVX1-NEXT: vpextrb $14, %xmm1, 46(%rax)
+; AVX1-NEXT: LBB58_94: ## %else92
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_96
+; AVX1-NEXT: ## BB#95: ## %cond.store93
+; AVX1-NEXT: vpextrb $15, %xmm1, 47(%rax)
+; AVX1-NEXT: LBB58_96: ## %else94
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_98
+; AVX1-NEXT: ## BB#97: ## %cond.store95
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, 48(%rax)
+; AVX1-NEXT: LBB58_98: ## %else96
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_100
+; AVX1-NEXT: ## BB#99: ## %cond.store97
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $1, %xmm0, 49(%rax)
+; AVX1-NEXT: LBB58_100: ## %else98
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_102
+; AVX1-NEXT: ## BB#101: ## %cond.store99
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $2, %xmm0, 50(%rax)
+; AVX1-NEXT: LBB58_102: ## %else100
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_104
+; AVX1-NEXT: ## BB#103: ## %cond.store101
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $3, %xmm0, 51(%rax)
+; AVX1-NEXT: LBB58_104: ## %else102
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_106
+; AVX1-NEXT: ## BB#105: ## %cond.store103
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $4, %xmm0, 52(%rax)
+; AVX1-NEXT: LBB58_106: ## %else104
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_108
+; AVX1-NEXT: ## BB#107: ## %cond.store105
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $5, %xmm0, 53(%rax)
+; AVX1-NEXT: LBB58_108: ## %else106
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_110
+; AVX1-NEXT: ## BB#109: ## %cond.store107
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $6, %xmm0, 54(%rax)
+; AVX1-NEXT: LBB58_110: ## %else108
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_112
+; AVX1-NEXT: ## BB#111: ## %cond.store109
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $7, %xmm0, 55(%rax)
+; AVX1-NEXT: LBB58_112: ## %else110
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_114
+; AVX1-NEXT: ## BB#113: ## %cond.store111
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $8, %xmm0, 56(%rax)
+; AVX1-NEXT: LBB58_114: ## %else112
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_116
+; AVX1-NEXT: ## BB#115: ## %cond.store113
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $9, %xmm0, 57(%rax)
+; AVX1-NEXT: LBB58_116: ## %else114
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_118
+; AVX1-NEXT: ## BB#117: ## %cond.store115
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $10, %xmm0, 58(%rax)
+; AVX1-NEXT: LBB58_118: ## %else116
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_120
+; AVX1-NEXT: ## BB#119: ## %cond.store117
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $11, %xmm0, 59(%rax)
+; AVX1-NEXT: LBB58_120: ## %else118
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_122
+; AVX1-NEXT: ## BB#121: ## %cond.store119
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $12, %xmm0, 60(%rax)
+; AVX1-NEXT: LBB58_122: ## %else120
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_124
+; AVX1-NEXT: ## BB#123: ## %cond.store121
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $13, %xmm0, 61(%rax)
+; AVX1-NEXT: LBB58_124: ## %else122
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_126
+; AVX1-NEXT: ## BB#125: ## %cond.store123
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $14, %xmm0, 62(%rax)
+; AVX1-NEXT: LBB58_126: ## %else124
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_128
+; AVX1-NEXT: ## BB#127: ## %cond.store125
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 63(%rax)
+; AVX1-NEXT: LBB58_128: ## %else126
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_64xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: je LBB58_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rax)
+; AVX2-NEXT: LBB58_2: ## %else
+; AVX2-NEXT: testb $1, %sil
+; AVX2-NEXT: je LBB58_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rax)
+; AVX2-NEXT: LBB58_4: ## %else2
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rax)
+; AVX2-NEXT: LBB58_6: ## %else4
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rax)
+; AVX2-NEXT: LBB58_8: ## %else6
+; AVX2-NEXT: testb $1, %r8b
+; AVX2-NEXT: je LBB58_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rax)
+; AVX2-NEXT: LBB58_10: ## %else8
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %r9b
+; AVX2-NEXT: je LBB58_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rax)
+; AVX2-NEXT: LBB58_12: ## %else10
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rax)
+; AVX2-NEXT: LBB58_14: ## %else12
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rax)
+; AVX2-NEXT: LBB58_16: ## %else14
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rax)
+; AVX2-NEXT: LBB58_18: ## %else16
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rax)
+; AVX2-NEXT: LBB58_20: ## %else18
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rax)
+; AVX2-NEXT: LBB58_22: ## %else20
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rax)
+; AVX2-NEXT: LBB58_24: ## %else22
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rax)
+; AVX2-NEXT: LBB58_26: ## %else24
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rax)
+; AVX2-NEXT: LBB58_28: ## %else26
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rax)
+; AVX2-NEXT: LBB58_30: ## %else28
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rax)
+; AVX2-NEXT: LBB58_32: ## %else30
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, 16(%rax)
+; AVX2-NEXT: LBB58_34: ## %else32
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm2, 17(%rax)
+; AVX2-NEXT: LBB58_36: ## %else34
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm2, 18(%rax)
+; AVX2-NEXT: LBB58_38: ## %else36
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm2, 19(%rax)
+; AVX2-NEXT: LBB58_40: ## %else38
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm2, 20(%rax)
+; AVX2-NEXT: LBB58_42: ## %else40
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm2, 21(%rax)
+; AVX2-NEXT: LBB58_44: ## %else42
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm2, 22(%rax)
+; AVX2-NEXT: LBB58_46: ## %else44
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm2, 23(%rax)
+; AVX2-NEXT: LBB58_48: ## %else46
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm2, 24(%rax)
+; AVX2-NEXT: LBB58_50: ## %else48
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm2, 25(%rax)
+; AVX2-NEXT: LBB58_52: ## %else50
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm2, 26(%rax)
+; AVX2-NEXT: LBB58_54: ## %else52
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm2, 27(%rax)
+; AVX2-NEXT: LBB58_56: ## %else54
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm2, 28(%rax)
+; AVX2-NEXT: LBB58_58: ## %else56
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm2, 29(%rax)
+; AVX2-NEXT: LBB58_60: ## %else58
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm2, 30(%rax)
+; AVX2-NEXT: LBB58_62: ## %else60
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rax)
+; AVX2-NEXT: LBB58_64: ## %else62
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_66
+; AVX2-NEXT: ## BB#65: ## %cond.store63
+; AVX2-NEXT: vpextrb $0, %xmm1, 32(%rax)
+; AVX2-NEXT: LBB58_66: ## %else64
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_68
+; AVX2-NEXT: ## BB#67: ## %cond.store65
+; AVX2-NEXT: vpextrb $1, %xmm1, 33(%rax)
+; AVX2-NEXT: LBB58_68: ## %else66
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_70
+; AVX2-NEXT: ## BB#69: ## %cond.store67
+; AVX2-NEXT: vpextrb $2, %xmm1, 34(%rax)
+; AVX2-NEXT: LBB58_70: ## %else68
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_72
+; AVX2-NEXT: ## BB#71: ## %cond.store69
+; AVX2-NEXT: vpextrb $3, %xmm1, 35(%rax)
+; AVX2-NEXT: LBB58_72: ## %else70
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_74
+; AVX2-NEXT: ## BB#73: ## %cond.store71
+; AVX2-NEXT: vpextrb $4, %xmm1, 36(%rax)
+; AVX2-NEXT: LBB58_74: ## %else72
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_76
+; AVX2-NEXT: ## BB#75: ## %cond.store73
+; AVX2-NEXT: vpextrb $5, %xmm1, 37(%rax)
+; AVX2-NEXT: LBB58_76: ## %else74
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_78
+; AVX2-NEXT: ## BB#77: ## %cond.store75
+; AVX2-NEXT: vpextrb $6, %xmm1, 38(%rax)
+; AVX2-NEXT: LBB58_78: ## %else76
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_80
+; AVX2-NEXT: ## BB#79: ## %cond.store77
+; AVX2-NEXT: vpextrb $7, %xmm1, 39(%rax)
+; AVX2-NEXT: LBB58_80: ## %else78
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_82
+; AVX2-NEXT: ## BB#81: ## %cond.store79
+; AVX2-NEXT: vpextrb $8, %xmm1, 40(%rax)
+; AVX2-NEXT: LBB58_82: ## %else80
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_84
+; AVX2-NEXT: ## BB#83: ## %cond.store81
+; AVX2-NEXT: vpextrb $9, %xmm1, 41(%rax)
+; AVX2-NEXT: LBB58_84: ## %else82
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_86
+; AVX2-NEXT: ## BB#85: ## %cond.store83
+; AVX2-NEXT: vpextrb $10, %xmm1, 42(%rax)
+; AVX2-NEXT: LBB58_86: ## %else84
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_88
+; AVX2-NEXT: ## BB#87: ## %cond.store85
+; AVX2-NEXT: vpextrb $11, %xmm1, 43(%rax)
+; AVX2-NEXT: LBB58_88: ## %else86
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_90
+; AVX2-NEXT: ## BB#89: ## %cond.store87
+; AVX2-NEXT: vpextrb $12, %xmm1, 44(%rax)
+; AVX2-NEXT: LBB58_90: ## %else88
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_92
+; AVX2-NEXT: ## BB#91: ## %cond.store89
+; AVX2-NEXT: vpextrb $13, %xmm1, 45(%rax)
+; AVX2-NEXT: LBB58_92: ## %else90
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_94
+; AVX2-NEXT: ## BB#93: ## %cond.store91
+; AVX2-NEXT: vpextrb $14, %xmm1, 46(%rax)
+; AVX2-NEXT: LBB58_94: ## %else92
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_96
+; AVX2-NEXT: ## BB#95: ## %cond.store93
+; AVX2-NEXT: vpextrb $15, %xmm1, 47(%rax)
+; AVX2-NEXT: LBB58_96: ## %else94
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_98
+; AVX2-NEXT: ## BB#97: ## %cond.store95
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, 48(%rax)
+; AVX2-NEXT: LBB58_98: ## %else96
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_100
+; AVX2-NEXT: ## BB#99: ## %cond.store97
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm0, 49(%rax)
+; AVX2-NEXT: LBB58_100: ## %else98
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_102
+; AVX2-NEXT: ## BB#101: ## %cond.store99
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $2, %xmm0, 50(%rax)
+; AVX2-NEXT: LBB58_102: ## %else100
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_104
+; AVX2-NEXT: ## BB#103: ## %cond.store101
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $3, %xmm0, 51(%rax)
+; AVX2-NEXT: LBB58_104: ## %else102
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_106
+; AVX2-NEXT: ## BB#105: ## %cond.store103
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $4, %xmm0, 52(%rax)
+; AVX2-NEXT: LBB58_106: ## %else104
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_108
+; AVX2-NEXT: ## BB#107: ## %cond.store105
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $5, %xmm0, 53(%rax)
+; AVX2-NEXT: LBB58_108: ## %else106
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_110
+; AVX2-NEXT: ## BB#109: ## %cond.store107
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $6, %xmm0, 54(%rax)
+; AVX2-NEXT: LBB58_110: ## %else108
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_112
+; AVX2-NEXT: ## BB#111: ## %cond.store109
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $7, %xmm0, 55(%rax)
+; AVX2-NEXT: LBB58_112: ## %else110
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_114
+; AVX2-NEXT: ## BB#113: ## %cond.store111
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $8, %xmm0, 56(%rax)
+; AVX2-NEXT: LBB58_114: ## %else112
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_116
+; AVX2-NEXT: ## BB#115: ## %cond.store113
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $9, %xmm0, 57(%rax)
+; AVX2-NEXT: LBB58_116: ## %else114
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_118
+; AVX2-NEXT: ## BB#117: ## %cond.store115
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $10, %xmm0, 58(%rax)
+; AVX2-NEXT: LBB58_118: ## %else116
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_120
+; AVX2-NEXT: ## BB#119: ## %cond.store117
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $11, %xmm0, 59(%rax)
+; AVX2-NEXT: LBB58_120: ## %else118
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_122
+; AVX2-NEXT: ## BB#121: ## %cond.store119
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $12, %xmm0, 60(%rax)
+; AVX2-NEXT: LBB58_122: ## %else120
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_124
+; AVX2-NEXT: ## BB#123: ## %cond.store121
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $13, %xmm0, 61(%rax)
+; AVX2-NEXT: LBB58_124: ## %else122
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_126
+; AVX2-NEXT: ## BB#125: ## %cond.store123
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $14, %xmm0, 62(%rax)
+; AVX2-NEXT: LBB58_126: ## %else124
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_128
+; AVX2-NEXT: ## BB#127: ## %cond.store125
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 63(%rax)
+; AVX2-NEXT: LBB58_128: ## %else126
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_64xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm4, (%rdi)
+; AVX512F-NEXT: LBB58_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm4, 1(%rdi)
+; AVX512F-NEXT: LBB58_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm4, 2(%rdi)
+; AVX512F-NEXT: LBB58_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm4, 3(%rdi)
+; AVX512F-NEXT: LBB58_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm4, 4(%rdi)
+; AVX512F-NEXT: LBB58_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm4, 5(%rdi)
+; AVX512F-NEXT: LBB58_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm4, 6(%rdi)
+; AVX512F-NEXT: LBB58_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm4, 7(%rdi)
+; AVX512F-NEXT: LBB58_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm4, 8(%rdi)
+; AVX512F-NEXT: LBB58_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm4, 9(%rdi)
+; AVX512F-NEXT: LBB58_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm4, 10(%rdi)
+; AVX512F-NEXT: LBB58_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm4, 11(%rdi)
+; AVX512F-NEXT: LBB58_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm4, 12(%rdi)
+; AVX512F-NEXT: LBB58_26: ## %else24
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm4, 13(%rdi)
+; AVX512F-NEXT: LBB58_28: ## %else26
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm4, 14(%rdi)
+; AVX512F-NEXT: LBB58_30: ## %else28
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm4, 15(%rdi)
+; AVX512F-NEXT: LBB58_32: ## %else30
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi)
+; AVX512F-NEXT: LBB58_34: ## %else32
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi)
+; AVX512F-NEXT: LBB58_36: ## %else34
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi)
+; AVX512F-NEXT: LBB58_38: ## %else36
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi)
+; AVX512F-NEXT: LBB58_40: ## %else38
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi)
+; AVX512F-NEXT: LBB58_42: ## %else40
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi)
+; AVX512F-NEXT: LBB58_44: ## %else42
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi)
+; AVX512F-NEXT: LBB58_46: ## %else44
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi)
+; AVX512F-NEXT: LBB58_48: ## %else46
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi)
+; AVX512F-NEXT: LBB58_50: ## %else48
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi)
+; AVX512F-NEXT: LBB58_52: ## %else50
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm0, 26(%rdi)
+; AVX512F-NEXT: LBB58_54: ## %else52
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi)
+; AVX512F-NEXT: LBB58_56: ## %else54
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi)
+; AVX512F-NEXT: LBB58_58: ## %else56
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512F-NEXT: vpextrb $13, %xmm1, 29(%rdi)
+; AVX512F-NEXT: LBB58_60: ## %else58
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512F-NEXT: vpextrb $14, %xmm1, 30(%rdi)
+; AVX512F-NEXT: LBB58_62: ## %else60
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX512F-NEXT: LBB58_64: ## %else62
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_66
+; AVX512F-NEXT: ## BB#65: ## %cond.store63
+; AVX512F-NEXT: vpextrb $0, %xmm5, 32(%rdi)
+; AVX512F-NEXT: LBB58_66: ## %else64
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_68
+; AVX512F-NEXT: ## BB#67: ## %cond.store65
+; AVX512F-NEXT: vpextrb $1, %xmm5, 33(%rdi)
+; AVX512F-NEXT: LBB58_68: ## %else66
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_70
+; AVX512F-NEXT: ## BB#69: ## %cond.store67
+; AVX512F-NEXT: vpextrb $2, %xmm5, 34(%rdi)
+; AVX512F-NEXT: LBB58_70: ## %else68
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_72
+; AVX512F-NEXT: ## BB#71: ## %cond.store69
+; AVX512F-NEXT: vpextrb $3, %xmm5, 35(%rdi)
+; AVX512F-NEXT: LBB58_72: ## %else70
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_74
+; AVX512F-NEXT: ## BB#73: ## %cond.store71
+; AVX512F-NEXT: vpextrb $4, %xmm5, 36(%rdi)
+; AVX512F-NEXT: LBB58_74: ## %else72
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_76
+; AVX512F-NEXT: ## BB#75: ## %cond.store73
+; AVX512F-NEXT: vpextrb $5, %xmm5, 37(%rdi)
+; AVX512F-NEXT: LBB58_76: ## %else74
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_78
+; AVX512F-NEXT: ## BB#77: ## %cond.store75
+; AVX512F-NEXT: vpextrb $6, %xmm5, 38(%rdi)
+; AVX512F-NEXT: LBB58_78: ## %else76
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_80
+; AVX512F-NEXT: ## BB#79: ## %cond.store77
+; AVX512F-NEXT: vpextrb $7, %xmm5, 39(%rdi)
+; AVX512F-NEXT: LBB58_80: ## %else78
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_82
+; AVX512F-NEXT: ## BB#81: ## %cond.store79
+; AVX512F-NEXT: vpextrb $8, %xmm5, 40(%rdi)
+; AVX512F-NEXT: LBB58_82: ## %else80
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_84
+; AVX512F-NEXT: ## BB#83: ## %cond.store81
+; AVX512F-NEXT: vpextrb $9, %xmm5, 41(%rdi)
+; AVX512F-NEXT: LBB58_84: ## %else82
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_86
+; AVX512F-NEXT: ## BB#85: ## %cond.store83
+; AVX512F-NEXT: vpextrb $10, %xmm5, 42(%rdi)
+; AVX512F-NEXT: LBB58_86: ## %else84
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_88
+; AVX512F-NEXT: ## BB#87: ## %cond.store85
+; AVX512F-NEXT: vpextrb $11, %xmm5, 43(%rdi)
+; AVX512F-NEXT: LBB58_88: ## %else86
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_90
+; AVX512F-NEXT: ## BB#89: ## %cond.store87
+; AVX512F-NEXT: vpextrb $12, %xmm5, 44(%rdi)
+; AVX512F-NEXT: LBB58_90: ## %else88
+; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_92
+; AVX512F-NEXT: ## BB#91: ## %cond.store89
+; AVX512F-NEXT: vpextrb $13, %xmm5, 45(%rdi)
+; AVX512F-NEXT: LBB58_92: ## %else90
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_94
+; AVX512F-NEXT: ## BB#93: ## %cond.store91
+; AVX512F-NEXT: vpextrb $14, %xmm5, 46(%rdi)
+; AVX512F-NEXT: LBB58_94: ## %else92
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_96
+; AVX512F-NEXT: ## BB#95: ## %cond.store93
+; AVX512F-NEXT: vpextrb $15, %xmm5, 47(%rdi)
+; AVX512F-NEXT: LBB58_96: ## %else94
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_98
+; AVX512F-NEXT: ## BB#97: ## %cond.store95
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, 48(%rdi)
+; AVX512F-NEXT: LBB58_98: ## %else96
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_100
+; AVX512F-NEXT: ## BB#99: ## %cond.store97
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm0, 49(%rdi)
+; AVX512F-NEXT: LBB58_100: ## %else98
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_102
+; AVX512F-NEXT: ## BB#101: ## %cond.store99
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm0, 50(%rdi)
+; AVX512F-NEXT: LBB58_102: ## %else100
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_104
+; AVX512F-NEXT: ## BB#103: ## %cond.store101
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm0, 51(%rdi)
+; AVX512F-NEXT: LBB58_104: ## %else102
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_106
+; AVX512F-NEXT: ## BB#105: ## %cond.store103
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm0, 52(%rdi)
+; AVX512F-NEXT: LBB58_106: ## %else104
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_108
+; AVX512F-NEXT: ## BB#107: ## %cond.store105
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm0, 53(%rdi)
+; AVX512F-NEXT: LBB58_108: ## %else106
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_110
+; AVX512F-NEXT: ## BB#109: ## %cond.store107
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm0, 54(%rdi)
+; AVX512F-NEXT: LBB58_110: ## %else108
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_112
+; AVX512F-NEXT: ## BB#111: ## %cond.store109
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm0, 55(%rdi)
+; AVX512F-NEXT: LBB58_112: ## %else110
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_114
+; AVX512F-NEXT: ## BB#113: ## %cond.store111
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm0, 56(%rdi)
+; AVX512F-NEXT: LBB58_114: ## %else112
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_116
+; AVX512F-NEXT: ## BB#115: ## %cond.store113
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm0, 57(%rdi)
+; AVX512F-NEXT: LBB58_116: ## %else114
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_118
+; AVX512F-NEXT: ## BB#117: ## %cond.store115
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm0, 58(%rdi)
+; AVX512F-NEXT: LBB58_118: ## %else116
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_120
+; AVX512F-NEXT: ## BB#119: ## %cond.store117
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm0, 59(%rdi)
+; AVX512F-NEXT: LBB58_120: ## %else118
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_122
+; AVX512F-NEXT: ## BB#121: ## %cond.store119
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm0, 60(%rdi)
+; AVX512F-NEXT: LBB58_122: ## %else120
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_124
+; AVX512F-NEXT: ## BB#123: ## %cond.store121
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $13, %xmm0, 61(%rdi)
+; AVX512F-NEXT: LBB58_124: ## %else122
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_126
+; AVX512F-NEXT: ## BB#125: ## %cond.store123
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $14, %xmm0, 62(%rdi)
+; AVX512F-NEXT: LBB58_126: ## %else124
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_128
+; AVX512F-NEXT: ## BB#127: ## %cond.store125
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 63(%rdi)
+; AVX512F-NEXT: LBB58_128: ## %else126
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_64xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k1
+; SKX-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)
+
+define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; AVX-LABEL: test_mask_store_8xi16:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_2
+; AVX-NEXT: ## BB#1: ## %cond.store
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: movw %ax, (%rdi)
+; AVX-NEXT: LBB59_2: ## %else
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_4
+; AVX-NEXT: ## BB#3: ## %cond.store1
+; AVX-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX-NEXT: LBB59_4: ## %else2
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_6
+; AVX-NEXT: ## BB#5: ## %cond.store3
+; AVX-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX-NEXT: LBB59_6: ## %else4
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_8
+; AVX-NEXT: ## BB#7: ## %cond.store5
+; AVX-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX-NEXT: LBB59_8: ## %else6
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_10
+; AVX-NEXT: ## BB#9: ## %cond.store7
+; AVX-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX-NEXT: LBB59_10: ## %else8
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_12
+; AVX-NEXT: ## BB#11: ## %cond.store9
+; AVX-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX-NEXT: LBB59_12: ## %else10
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_14
+; AVX-NEXT: ## BB#13: ## %cond.store11
+; AVX-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX-NEXT: LBB59_14: ## %else12
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_16
+; AVX-NEXT: ## BB#15: ## %cond.store13
+; AVX-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX-NEXT: LBB59_16: ## %else14
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_8xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB59_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB59_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB59_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB59_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB59_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB59_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB59_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB59_16: ## %else14
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_8xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+
+define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; AVX1-LABEL: test_mask_store_16xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, (%rdi)
+; AVX1-NEXT: LBB60_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB60_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB60_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB60_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB60_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB60_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB60_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB60_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: movw %ax, 16(%rdi)
+; AVX1-NEXT: LBB60_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $1, %xmm2, 18(%rdi)
+; AVX1-NEXT: LBB60_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $2, %xmm2, 20(%rdi)
+; AVX1-NEXT: LBB60_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $3, %xmm2, 22(%rdi)
+; AVX1-NEXT: LBB60_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $4, %xmm2, 24(%rdi)
+; AVX1-NEXT: LBB60_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $5, %xmm2, 26(%rdi)
+; AVX1-NEXT: LBB60_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $6, %xmm2, 28(%rdi)
+; AVX1-NEXT: LBB60_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX1-NEXT: LBB60_32: ## %else30
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_16xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, (%rdi)
+; AVX2-NEXT: LBB60_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB60_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB60_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB60_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB60_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB60_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB60_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB60_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: movw %ax, 16(%rdi)
+; AVX2-NEXT: LBB60_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $1, %xmm2, 18(%rdi)
+; AVX2-NEXT: LBB60_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $2, %xmm2, 20(%rdi)
+; AVX2-NEXT: LBB60_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $3, %xmm2, 22(%rdi)
+; AVX2-NEXT: LBB60_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $4, %xmm2, 24(%rdi)
+; AVX2-NEXT: LBB60_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $5, %xmm2, 26(%rdi)
+; AVX2-NEXT: LBB60_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $6, %xmm2, 28(%rdi)
+; AVX2-NEXT: LBB60_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX2-NEXT: LBB60_32: ## %else30
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_16xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB60_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB60_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB60_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB60_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB60_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB60_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB60_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB60_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: movw %ax, 16(%rdi)
+; AVX512F-NEXT: LBB60_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi)
+; AVX512F-NEXT: LBB60_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi)
+; AVX512F-NEXT: LBB60_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi)
+; AVX512F-NEXT: LBB60_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $4, %xmm0, 24(%rdi)
+; AVX512F-NEXT: LBB60_26: ## %else24
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi)
+; AVX512F-NEXT: LBB60_28: ## %else26
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi)
+; AVX512F-NEXT: LBB60_30: ## %else28
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX512F-NEXT: LBB60_32: ## %else30
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_16xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
+
+define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
+; AVX1-LABEL: test_mask_store_32xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, (%rdi)
+; AVX1-NEXT: LBB61_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB61_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB61_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB61_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB61_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB61_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB61_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB61_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: movw %ax, 16(%rdi)
+; AVX1-NEXT: LBB61_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX1-NEXT: LBB61_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX1-NEXT: LBB61_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX1-NEXT: LBB61_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX1-NEXT: LBB61_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX1-NEXT: LBB61_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX1-NEXT: LBB61_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX1-NEXT: LBB61_32: ## %else30
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: movw %ax, 32(%rdi)
+; AVX1-NEXT: LBB61_34: ## %else32
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX1-NEXT: LBB61_36: ## %else34
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX1-NEXT: LBB61_38: ## %else36
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX1-NEXT: LBB61_40: ## %else38
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX1-NEXT: LBB61_42: ## %else40
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX1-NEXT: LBB61_44: ## %else42
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX1-NEXT: LBB61_46: ## %else44
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX1-NEXT: LBB61_48: ## %else46
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, 48(%rdi)
+; AVX1-NEXT: LBB61_50: ## %else48
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX1-NEXT: LBB61_52: ## %else50
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX1-NEXT: LBB61_54: ## %else52
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX1-NEXT: LBB61_56: ## %else54
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX1-NEXT: LBB61_58: ## %else56
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX1-NEXT: LBB61_60: ## %else58
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX1-NEXT: LBB61_62: ## %else60
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX1-NEXT: LBB61_64: ## %else62
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_32xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, (%rdi)
+; AVX2-NEXT: LBB61_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB61_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB61_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB61_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB61_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB61_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB61_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB61_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: movw %ax, 16(%rdi)
+; AVX2-NEXT: LBB61_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX2-NEXT: LBB61_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX2-NEXT: LBB61_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX2-NEXT: LBB61_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX2-NEXT: LBB61_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX2-NEXT: LBB61_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX2-NEXT: LBB61_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX2-NEXT: LBB61_32: ## %else30
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: movw %ax, 32(%rdi)
+; AVX2-NEXT: LBB61_34: ## %else32
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX2-NEXT: LBB61_36: ## %else34
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX2-NEXT: LBB61_38: ## %else36
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX2-NEXT: LBB61_40: ## %else38
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX2-NEXT: LBB61_42: ## %else40
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX2-NEXT: LBB61_44: ## %else42
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX2-NEXT: LBB61_46: ## %else44
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX2-NEXT: LBB61_48: ## %else46
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, 48(%rdi)
+; AVX2-NEXT: LBB61_50: ## %else48
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX2-NEXT: LBB61_52: ## %else50
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX2-NEXT: LBB61_54: ## %else52
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX2-NEXT: LBB61_56: ## %else54
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX2-NEXT: LBB61_58: ## %else56
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX2-NEXT: LBB61_60: ## %else58
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX2-NEXT: LBB61_62: ## %else60
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX2-NEXT: LBB61_64: ## %else62
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_32xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB61_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB61_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB61_6: ## %else4
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB61_8: ## %else6
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB61_10: ## %else8
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB61_12: ## %else10
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB61_14: ## %else12
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB61_16: ## %else14
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vmovd %xmm3, %eax
+; AVX512F-NEXT: movw %ax, 16(%rdi)
+; AVX512F-NEXT: LBB61_18: ## %else16
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX512F-NEXT: LBB61_20: ## %else18
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX512F-NEXT: LBB61_22: ## %else20
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX512F-NEXT: LBB61_24: ## %else22
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX512F-NEXT: LBB61_26: ## %else24
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX512F-NEXT: LBB61_28: ## %else26
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX512F-NEXT: LBB61_30: ## %else28
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX512F-NEXT: LBB61_32: ## %else30
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: movw %ax, 32(%rdi)
+; AVX512F-NEXT: LBB61_34: ## %else32
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX512F-NEXT: LBB61_36: ## %else34
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX512F-NEXT: LBB61_38: ## %else36
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX512F-NEXT: LBB61_40: ## %else38
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX512F-NEXT: LBB61_42: ## %else40
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX512F-NEXT: LBB61_44: ## %else42
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX512F-NEXT: LBB61_46: ## %else44
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX512F-NEXT: LBB61_48: ## %else46
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, 48(%rdi)
+; AVX512F-NEXT: LBB61_50: ## %else48
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX512F-NEXT: LBB61_52: ## %else50
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX512F-NEXT: LBB61_54: ## %else52
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX512F-NEXT: LBB61_56: ## %else54
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX512F-NEXT: LBB61_58: ## %else56
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX512F-NEXT: LBB61_60: ## %else58
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX512F-NEXT: LBB61_62: ## %else60
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX512F-NEXT: LBB61_64: ## %else62
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_32xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
+ ret void
+}
+
+declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
diff --git a/test/CodeGen/X86/materialize-one.ll b/test/CodeGen/X86/materialize-one.ll
deleted file mode 100644
index 49da8008b88c..000000000000
--- a/test/CodeGen/X86/materialize-one.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
-
-define i32 @one32() optsize {
-entry:
- ret i32 1
-
-; CHECK32-LABEL: one32
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: ret
-
-; FIXME: Figure out the best approach in 64-bit mode.
-; CHECK64-LABEL: one32
-; CHECK64: movl $1, %eax
-; CHECK64-NEXT: retq
-}
-
-define i32 @minus_one32() optsize {
-entry:
- ret i32 -1
-
-; CHECK32-LABEL: minus_one32
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: ret
-}
-
-define i16 @one16() optsize {
-entry:
- ret i16 1
-
-; CHECK32-LABEL: one16
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: retl
-}
-
-define i16 @minus_one16() optsize {
-entry:
- ret i16 -1
-
-; CHECK32-LABEL: minus_one16
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: retl
-}
-
-define i32 @test_rematerialization() optsize {
-entry:
- ; Materialize -1 (thiscall forces it into %ecx).
- tail call x86_thiscallcc void @f(i32 -1)
-
- ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
- ; spilling it to the stack.
- tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
- ; -1 should be re-materialized here instead of getting spilled above.
- ret i32 -1
-
-; CHECK32-LABEL: test_rematerialization
-; CHECK32: xorl %ecx, %ecx
-; CHECK32-NEXT: decl %ecx
-; CHECK32: calll
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NOT: %eax
-; CHECK32: retl
-}
-
-define i32 @test_rematerialization2(i32 %x) optsize {
-entry:
- ; Materialize -1 (thiscall forces it into %ecx).
- tail call x86_thiscallcc void @f(i32 -1)
-
- ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
- ; spilling it to the stack.
- tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
- ; Define eflags.
- %a = icmp ne i32 %x, 123
- %b = zext i1 %a to i32
- ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
- ; It must therefore not use the xor-dec lowering.
- %c = select i1 %a, i32 %b, i32 -1
- ret i32 %c
-
-; CHECK32-LABEL: test_rematerialization2
-; CHECK32: xorl %ecx, %ecx
-; CHECK32-NEXT: decl %ecx
-; CHECK32: calll
-; CHECK32: cmpl
-; CHECK32: setne
-; CHECK32-NOT: xorl
-; CHECK32: movl $-1
-; CHECK32: cmov
-; CHECK32: retl
-}
-
-declare x86_thiscallcc void @f(i32)
diff --git a/test/CodeGen/X86/materialize.ll b/test/CodeGen/X86/materialize.ll
new file mode 100644
index 000000000000..6e1264b4fd43
--- /dev/null
+++ b/test/CodeGen/X86/materialize.ll
@@ -0,0 +1,216 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
+; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64
+
+define i32 @one32_nooptsize() {
+entry:
+ ret i32 1
+
+; When not optimizing for size, use mov.
+; CHECK32-LABEL: one32_nooptsize:
+; CHECK32: movl $1, %eax
+; CHECK32-NEXT: retl
+; CHECK64-LABEL: one32_nooptsize:
+; CHECK64: movl $1, %eax
+; CHECK64-NEXT: retq
+}
+
+define i32 @one32() optsize {
+entry:
+ ret i32 1
+
+; CHECK32-LABEL: one32:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: retl
+
+; FIXME: Figure out the best approach in 64-bit mode.
+; CHECK64-LABEL: one32:
+; CHECK64: movl $1, %eax
+; CHECK64-NEXT: retq
+}
+
+define i32 @one32_minsize() minsize {
+entry:
+ ret i32 1
+
+; On 32-bit, xor-inc is preferred over push-pop.
+; CHECK32-LABEL: one32_minsize:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: retl
+
+; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to
+; pop into a 64-bit register even when we just need 32 bits.
+; CHECK64-LABEL: one32_minsize:
+; CHECK64: pushq $1
+; CHECK64: .cfi_adjust_cfa_offset 8
+; CHECK64: popq %rax
+; CHECK64: .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT: retq
+
+; On Win64 we can't adjust the stack unless there's a frame pointer.
+; CHECKWIN64-LABEL: one32_minsize:
+; CHECKWIN64: movl $1, %eax
+; CHECKWIN64-NEXT: retq
+}
+
+define i32 @pr26023() minsize {
+entry:
+ %x = alloca [120 x i8]
+ %0 = getelementptr inbounds [120 x i8], [120 x i8]* %x, i64 0, i64 0
+ call void asm sideeffect "", "imr,~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %0)
+ %arrayidx = getelementptr inbounds [120 x i8], [120 x i8]* %x, i64 0, i64 119
+ store volatile i8 -2, i8* %arrayidx
+ call void asm sideeffect "", "r,~{dirflag},~{fpsr},~{flags}"(i32 5)
+ %1 = load volatile i8, i8* %arrayidx
+ %conv = sext i8 %1 to i32
+ ret i32 %conv
+
+; The function writes to the redzone, so push/pop cannot be used.
+; CHECK64-LABEL: pr26023:
+; CHECK64: movl $5, %ecx
+; CHECK64: retq
+
+; 32-bit X86 doesn't have a redzone.
+; CHECK32-LABEL: pr26023:
+; CHECK32: pushl $5
+; CHECK32: popl %ecx
+; CHECK32: retl
+}
+
+
+define i64 @one64_minsize() minsize {
+entry:
+ ret i64 1
+; On 64-bit we don't do xor-inc yet, so push-pop it is.
+; CHECK64-LABEL: one64_minsize:
+; CHECK64: pushq $1
+; CHECK64: .cfi_adjust_cfa_offset 8
+; CHECK64: popq %rax
+; CHECK64: .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT: retq
+
+; On Win64 we can't adjust the stack unless there's a frame pointer.
+; CHECKWIN64-LABEL: one64_minsize:
+; CHECKWIN64: movl $1, %eax
+; CHECKWIN64-NEXT: retq
+}
+
+define i32 @minus_one32() optsize {
+entry:
+ ret i32 -1
+
+; CHECK32-LABEL: minus_one32:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: retl
+}
+
+define i32 @minus_one32_minsize() minsize {
+entry:
+ ret i32 -1
+
+; xor-dec is preferred over push-pop.
+; CHECK32-LABEL: minus_one32_minsize:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: retl
+}
+
+define i16 @one16() optsize {
+entry:
+ ret i16 1
+
+; CHECK32-LABEL: one16:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: # kill
+; CHECK32-NEXT: retl
+}
+
+define i16 @minus_one16() optsize {
+entry:
+ ret i16 -1
+
+; CHECK32-LABEL: minus_one16:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: # kill
+; CHECK32-NEXT: retl
+}
+
+define i32 @minus_five32() minsize {
+entry:
+ ret i32 -5
+
+; CHECK32-LABEL: minus_five32:
+; CHECK32: pushl $-5
+; CHECK32: popl %eax
+; CHECK32: retl
+}
+
+define i64 @minus_five64() minsize {
+entry:
+ ret i64 -5
+
+; CHECK64-LABEL: minus_five64:
+; CHECK64: pushq $-5
+; CHECK64: .cfi_adjust_cfa_offset 8
+; CHECK64: popq %rax
+; CHECK64: .cfi_adjust_cfa_offset -8
+; CHECK64: retq
+}
+
+define i32 @rematerialize_minus_one() optsize {
+entry:
+ ; Materialize -1 (thiscall forces it into %ecx).
+ tail call x86_thiscallcc void @f(i32 -1)
+
+ ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+ ; spilling it to the stack.
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+ ; -1 should be re-materialized here instead of getting spilled above.
+ ret i32 -1
+
+; CHECK32-LABEL: rematerialize_minus_one
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NOT: %eax
+; CHECK32: retl
+}
+
+define i32 @rematerialize_minus_one_eflags(i32 %x) optsize {
+entry:
+ ; Materialize -1 (thiscall forces it into %ecx).
+ tail call x86_thiscallcc void @f(i32 -1)
+
+ ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+ ; spilling it to the stack.
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+ ; Define eflags.
+ %a = icmp ne i32 %x, 123
+ %b = zext i1 %a to i32
+ ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
+ ; It must therefore not use the xor-dec lowering.
+ %c = select i1 %a, i32 %b, i32 -1
+ ret i32 %c
+
+; CHECK32-LABEL: rematerialize_minus_one_eflags
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: cmpl
+; CHECK32: setne
+; CHECK32-NOT: xorl
+; CHECK32: movl $-1
+; CHECK32: cmov
+; CHECK32: retl
+}
+
+declare x86_thiscallcc void @f(i32)
diff --git a/test/CodeGen/X86/mbp-false-cfg-break.ll b/test/CodeGen/X86/mbp-false-cfg-break.ll
new file mode 100644
index 000000000000..bc8b0de3eef0
--- /dev/null
+++ b/test/CodeGen/X86/mbp-false-cfg-break.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define void @test(i1 %cnd) !prof !{!"function_entry_count", i64 1024} {
+; CHECK-LABEL: @test
+; Using the assembly comments to indicate block order.
+; CHECK: # %loop
+; CHECK: # %backedge
+; CHECK: # %exit
+; CHECK: # %rare
+; CHECK: # %rare.1
+
+ br i1 undef, label %rare.1, label %preheader, !prof !{!"branch_weights", i32 0, i32 1000}
+rare.1:
+ call void @foo()
+ br label %preheader
+
+preheader:
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %preheader], [%iv.next, %backedge]
+ call void @foo()
+ br i1 %cnd, label %backedge, label %rare, !prof !{!"branch_weights", i32 1000000, i32 1}
+rare:
+ call void @foo()
+ br label %backedge
+backedge:
+ call void @foo()
+ %iv.next = add i32 %iv, 1
+ %cmp = icmp eq i32 %iv.next, 200
+ br i1 %cmp, label %loop, label %exit, !prof !{!"branch_weights", i32 1000, i32 1}
+
+exit:
+ ret void
+
+}
+
+
+declare void @foo()
diff --git a/test/CodeGen/X86/mcinst-lowering.ll b/test/CodeGen/X86/mcinst-lowering.ll
index 51b2895f1c78..7b16d7616fe5 100644
--- a/test/CodeGen/X86/mcinst-lowering.ll
+++ b/test/CodeGen/X86/mcinst-lowering.ll
@@ -3,26 +3,17 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
+declare i32 @foo();
+
define i32 @f0(i32* nocapture %x) nounwind readonly ssp {
entry:
- %tmp1 = load i32, i32* %x ; <i32> [#uses=2]
- %tobool = icmp eq i32 %tmp1, 0 ; <i1> [#uses=1]
- br i1 %tobool, label %if.end, label %return
-
-if.end: ; preds = %entry
-
-; Check that we lower to the short form of cmpl, which has a fixed %eax
-; register.
-;
+ %tmp1 = call i32 @foo()
; CHECK: cmpl $16777216, %eax
; CHECK: # encoding: [0x3d,0x00,0x00,0x00,0x01]
%cmp = icmp eq i32 %tmp1, 16777216 ; <i1> [#uses=1]
%conv = zext i1 %cmp to i32 ; <i32> [#uses=1]
ret i32 %conv
-
-return: ; preds = %entry
- ret i32 0
}
define i32 @f1() nounwind {
diff --git a/test/CodeGen/X86/mcu-abi.ll b/test/CodeGen/X86/mcu-abi.ll
index 966fd4521f2d..1cc277c863f0 100644
--- a/test/CodeGen/X86/mcu-abi.ll
+++ b/test/CodeGen/X86/mcu-abi.ll
@@ -82,6 +82,8 @@ entry:
ret i32 %i1
}
+%struct.S = type { i8 }
+
; CHECK-LABEL: test_lib_args:
; CHECK: movl %edx, %eax
; CHECK: calll __fixsfsi
@@ -91,14 +93,10 @@ define i32 @test_lib_args(float %a, float %b) #0 {
}
; CHECK-LABEL: test_fp128:
-; CHECK: movl (%eax), %e[[CX:..]]
-; CHECK-NEXT: movl 4(%eax), %e[[DX:..]]
-; CHECK-NEXT: movl 8(%eax), %e[[SI:..]]
-; CHECK-NEXT: movl 12(%eax), %e[[AX:..]]
-; CHECK-NEXT: movl %e[[AX]], 12(%esp)
-; CHECK-NEXT: movl %e[[SI]], 8(%esp)
-; CHECK-NEXT: movl %e[[DX]], 4(%esp)
-; CHECK-NEXT: movl %e[[CX]], (%esp)
+; CHECK: pushl 12(%eax)
+; CHECK-NEXT: pushl 8(%eax)
+; CHECK-NEXT: pushl 4(%eax)
+; CHECK-NEXT: pushl (%eax)
; CHECK-NEXT: calll __fixtfsi
define i32 @test_fp128(fp128* %ptr) #0 {
%v = load fp128, fp128* %ptr
@@ -108,5 +106,50 @@ define i32 @test_fp128(fp128* %ptr) #0 {
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
+; CHECK-LABEL: test_alignment_d:
+; CHECK-NOT: andl {{.+}}, %esp
+define void @test_alignment_d() #0 {
+entry:
+ %d = alloca double
+ store double 2.000000e+00, double* %d
+ call void @food(double* inreg %d)
+ ret void
+}
+
+; CHECK-LABEL: test_alignment_i:
+; CHECK-NOT: andl {{.+}}, %esp
+define void @test_alignment_i() #0 {
+entry:
+ %i = alloca i64
+ store i64 2, i64* %i
+ call void @fooi(i64* inreg %i)
+ ret void
+}
+
+
+; CHECK-LABEL: test_alignment_s:
+; CHECK-NOT: andl {{.+}}, %esp
+define void @test_alignment_s() #0 {
+ %s = alloca %struct.S, align 4
+ call void @foos(%struct.S* inreg %s)
+ ret void
+}
+
+
+; CHECK-LABEL: test_alignment_fp:
+; CHECK-NOT: andl {{.+}}, %esp
+define void @test_alignment_fp() #0 {
+entry:
+ %f = alloca fp128
+ store fp128 0xL00000000000000004000000000000000, fp128* %f
+ call void @foofp(fp128* inreg %f)
+ ret void
+}
+
+declare void @food(double* inreg)
+declare void @fooi(i64* inreg)
+declare void @foos(%struct.S* inreg)
+declare void @foofp(fp128* inreg)
+
attributes #0 = { nounwind "use-soft-float"="true"}
attributes #1 = { nounwind argmemonly }
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index e5f1f526b467..6a51d60f636c 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -23,8 +23,7 @@ return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp2:
; CHECK: movzwl
-; CHECK-NEXT: movzwl
-; CHECK-NEXT: cmpl
+; CHECK-NEXT: cmpw
; NOBUILTIN-LABEL: memcmp2:
; NOBUILTIN: callq
}
@@ -46,6 +45,21 @@ return: ; preds = %entry
; CHECK-NEXT: cmpl $28527,
}
+define void @memcmp2nb(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
+entry:
+ %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin ; <i32> [#uses=1]
+ %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1]
+ br i1 %1, label %return, label %bb
+
+bb: ; preds = %entry
+ store i32 4, i32* %P, align 4
+ ret void
+
+return: ; preds = %entry
+ ret void
+; CHECK-LABEL: memcmp2nb:
+; CHECK: callq
+}
define void @memcmp4(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
entry:
diff --git a/test/CodeGen/X86/memcpy-from-string.ll b/test/CodeGen/X86/memcpy-from-string.ll
new file mode 100644
index 000000000000..d62d9e20254a
--- /dev/null
+++ b/test/CodeGen/X86/memcpy-from-string.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%0 = type { %1, i64, %2 }
+%1 = type { i8* }
+%2 = type { i64, [8 x i8] }
+
+@0 = internal constant [10 x i8] c"asdf jkl;\00", align 1
+
+; Memcpy lowering should emit stores of immediates containing string data from
+; the correct offsets.
+
+; CHECK-LABEL: foo:
+; CHECK: movb $0, 6(%rdi)
+; CHECK: movw $15212, 4(%rdi)
+; CHECK: movl $1802117222, (%rdi)
+define void @foo(i8* %tmp2) {
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* getelementptr inbounds ([10 x i8], [10 x i8]* @0, i64 0, i64 3), i64 7, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index a87ef2e15a5a..a02ef29ca6b3 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -1,19 +1,30 @@
-; RUN: llc -mtriple=i386-apple-darwin -mcpu=yonah < %s | FileCheck %s
-
-declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
+; NOTE: Assertions have been autogenerated by update_test_checks.py
+; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s
define fastcc void @t1() nounwind {
-entry:
; CHECK-LABEL: t1:
-; CHECK: calll L_memset$stub
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: subl $16, %esp
+; CHECK-NEXT: pushl $188
+; CHECK-NEXT: pushl $0
+; CHECK-NEXT: pushl $0
+; CHECK-NEXT: calll _memset
+; CHECK-NEXT: addl $16, %esp
+;
+entry:
call void @llvm.memset.p0i8.i32(i8* null, i8 0, i32 188, i32 1, i1 false)
unreachable
}
define fastcc void @t2(i8 signext %c) nounwind {
-entry:
; CHECK-LABEL: t2:
-; CHECK: calll L_memset$stub
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $76, {{[0-9]+}}(%esp)
+; CHECK-NEXT: calll _memset
+;
+entry:
call void @llvm.memset.p0i8.i32(i8* undef, i8 %c, i32 76, i32 1, i1 false)
unreachable
}
@@ -21,19 +32,34 @@ entry:
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
define void @t3(i8* nocapture %s, i8 %a) nounwind {
+; CHECK-LABEL: t3:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101
+; CHECK-NEXT: movl %ecx, 4(%eax)
+; CHECK-NEXT: movl %ecx, (%eax)
+; CHECK-NEXT: retl
+;
entry:
tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 8, i32 1, i1 false)
ret void
-; CHECK-LABEL: t3:
-; CHECK: imull $16843009
}
define void @t4(i8* nocapture %s, i8 %a) nounwind {
+; CHECK-LABEL: t4:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101
+; CHECK-NEXT: movl %ecx, 8(%eax)
+; CHECK-NEXT: movl %ecx, 4(%eax)
+; CHECK-NEXT: movl %ecx, (%eax)
+; CHECK-NEXT: movw %cx, 12(%eax)
+; CHECK-NEXT: movb %cl, 14(%eax)
+; CHECK-NEXT: retl
+;
entry:
tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 15, i32 1, i1 false)
ret void
-; CHECK-LABEL: t4:
-; CHECK: imull $16843009
-; CHECK-NOT: imul
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
new file mode 100644
index 000000000000..29fee0710405
--- /dev/null
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -0,0 +1,470 @@
+; NOTE: Assertions have been autogenerated by update_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; https://llvm.org/bugs/show_bug.cgi?id=27100
+
+define void @memset_16_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_16_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE-NEXT: movq %rax, 8(%rdi)
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_16_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_16_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
+ ret void
+}
+
+define void @memset_32_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_32_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE-NEXT: movq %rax, 24(%rdi)
+; SSE-NEXT: movq %rax, 16(%rdi)
+; SSE-NEXT: movq %rax, 8(%rdi)
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_32_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_32_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
+ ret void
+}
+
+define void @memset_64_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_64_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE-NEXT: movq %rax, 56(%rdi)
+; SSE-NEXT: movq %rax, 48(%rdi)
+; SSE-NEXT: movq %rax, 40(%rdi)
+; SSE-NEXT: movq %rax, 32(%rdi)
+; SSE-NEXT: movq %rax, 24(%rdi)
+; SSE-NEXT: movq %rax, 16(%rdi)
+; SSE-NEXT: movq %rax, 8(%rdi)
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_64_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_64_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
+ ret void
+}
+
+define void @memset_128_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_128_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE-NEXT: movq %rax, 120(%rdi)
+; SSE-NEXT: movq %rax, 112(%rdi)
+; SSE-NEXT: movq %rax, 104(%rdi)
+; SSE-NEXT: movq %rax, 96(%rdi)
+; SSE-NEXT: movq %rax, 88(%rdi)
+; SSE-NEXT: movq %rax, 80(%rdi)
+; SSE-NEXT: movq %rax, 72(%rdi)
+; SSE-NEXT: movq %rax, 64(%rdi)
+; SSE-NEXT: movq %rax, 56(%rdi)
+; SSE-NEXT: movq %rax, 48(%rdi)
+; SSE-NEXT: movq %rax, 40(%rdi)
+; SSE-NEXT: movq %rax, 32(%rdi)
+; SSE-NEXT: movq %rax, 24(%rdi)
+; SSE-NEXT: movq %rax, 16(%rdi)
+; SSE-NEXT: movq %rax, 8(%rdi)
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_128_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_128_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
+ ret void
+}
+
+define void @memset_256_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_256_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: .Ltmp0:
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: movl $42, %esi
+; SSE-NEXT: movl $256, %edx # imm = 0x100
+; SSE-NEXT: callq memset
+; SSE-NEXT: popq %rax
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_256_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, 240(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 224(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 208(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 192(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 176(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 160(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 144(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 128(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_256_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %ymm0, 224(%rdi)
+; AVX-NEXT: vmovups %ymm0, 192(%rdi)
+; AVX-NEXT: vmovups %ymm0, 160(%rdi)
+; AVX-NEXT: vmovups %ymm0, 128(%rdi)
+; AVX-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
+ ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+; Repeat with a non-constant value for the stores.
+
+define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_16_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %sil, %eax
+; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE-NEXT: imulq %rax, %rcx
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_16_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_16_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_16_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
+ ret void
+}
+
+define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_32_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %sil, %eax
+; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE-NEXT: imulq %rax, %rcx
+; SSE-NEXT: movq %rcx, 24(%rdi)
+; SSE-NEXT: movq %rcx, 16(%rdi)
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_32_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_32_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_32_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i32 1, i1 false)
+ ret void
+}
+
+define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_64_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %sil, %eax
+; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE-NEXT: imulq %rax, %rcx
+; SSE-NEXT: movq %rcx, 56(%rdi)
+; SSE-NEXT: movq %rcx, 48(%rdi)
+; SSE-NEXT: movq %rcx, 40(%rdi)
+; SSE-NEXT: movq %rcx, 32(%rdi)
+; SSE-NEXT: movq %rcx, 24(%rdi)
+; SSE-NEXT: movq %rcx, 16(%rdi)
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_64_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_64_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_64_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i32 1, i1 false)
+ ret void
+}
+
+define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_128_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %sil, %eax
+; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE-NEXT: imulq %rax, %rcx
+; SSE-NEXT: movq %rcx, 120(%rdi)
+; SSE-NEXT: movq %rcx, 112(%rdi)
+; SSE-NEXT: movq %rcx, 104(%rdi)
+; SSE-NEXT: movq %rcx, 96(%rdi)
+; SSE-NEXT: movq %rcx, 88(%rdi)
+; SSE-NEXT: movq %rcx, 80(%rdi)
+; SSE-NEXT: movq %rcx, 72(%rdi)
+; SSE-NEXT: movq %rcx, 64(%rdi)
+; SSE-NEXT: movq %rcx, 56(%rdi)
+; SSE-NEXT: movq %rcx, 48(%rdi)
+; SSE-NEXT: movq %rcx, 40(%rdi)
+; SSE-NEXT: movq %rcx, 32(%rdi)
+; SSE-NEXT: movq %rcx, 24(%rdi)
+; SSE-NEXT: movq %rcx, 16(%rdi)
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_128_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_128_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_128_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i32 1, i1 false)
+ ret void
+}
+
+define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_256_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movl $256, %edx # imm = 0x100
+; SSE-NEXT: jmp memset # TAILCALL
+;
+; SSE2FAST-LABEL: memset_256_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 192(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 176(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 160(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 144(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 128(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_256_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 128(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_256_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 192(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 160(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 128(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll
index 8cfa032797f7..861cb88b0f57 100644
--- a/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,12 +1,60 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
-; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core2 | grep movl | count 20
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10
+; NOTE: Assertions have been autogenerated by update_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_64
define void @bork() nounwind {
-entry:
+; FAST-LABEL: bork:
+; FAST: # BB#0:
+; FAST-NEXT: xorps %xmm0, %xmm0
+; FAST-NEXT: movups %xmm0, 64
+; FAST-NEXT: movups %xmm0, 48
+; FAST-NEXT: movups %xmm0, 32
+; FAST-NEXT: movups %xmm0, 16
+; FAST-NEXT: movups %xmm0, 0
+; FAST-NEXT: retl
+;
+; SLOW_32-LABEL: bork:
+; SLOW_32: # BB#0:
+; SLOW_32-NEXT: movl $0, 4
+; SLOW_32-NEXT: movl $0, 0
+; SLOW_32-NEXT: movl $0, 12
+; SLOW_32-NEXT: movl $0, 8
+; SLOW_32-NEXT: movl $0, 20
+; SLOW_32-NEXT: movl $0, 16
+; SLOW_32-NEXT: movl $0, 28
+; SLOW_32-NEXT: movl $0, 24
+; SLOW_32-NEXT: movl $0, 36
+; SLOW_32-NEXT: movl $0, 32
+; SLOW_32-NEXT: movl $0, 44
+; SLOW_32-NEXT: movl $0, 40
+; SLOW_32-NEXT: movl $0, 52
+; SLOW_32-NEXT: movl $0, 48
+; SLOW_32-NEXT: movl $0, 60
+; SLOW_32-NEXT: movl $0, 56
+; SLOW_32-NEXT: movl $0, 68
+; SLOW_32-NEXT: movl $0, 64
+; SLOW_32-NEXT: movl $0, 76
+; SLOW_32-NEXT: movl $0, 72
+; SLOW_32-NEXT: retl
+;
+; SLOW_64-LABEL: bork:
+; SLOW_64: # BB#0:
+; SLOW_64-NEXT: movq $0, 72
+; SLOW_64-NEXT: movq $0, 64
+; SLOW_64-NEXT: movq $0, 56
+; SLOW_64-NEXT: movq $0, 48
+; SLOW_64-NEXT: movq $0, 40
+; SLOW_64-NEXT: movq $0, 32
+; SLOW_64-NEXT: movq $0, 24
+; SLOW_64-NEXT: movq $0, 16
+; SLOW_64-NEXT: movq $0, 8
+; SLOW_64-NEXT: movq $0, 0
+; SLOW_64-NEXT: retq
+;
call void @llvm.memset.p0i8.i64(i8* null, i8 0, i64 80, i32 4, i1 false)
ret void
}
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
new file mode 100644
index 000000000000..59b7efdf9bf8
--- /dev/null
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -0,0 +1,783 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
+;
+; Just one 32-bit run to make sure we do reasonable things.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
+
+define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_2f64_f64_23:
+; SSE: # BB#0:
+; SSE-NEXT: movups 16(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_2f64_f64_23:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 16(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_2f64_f64_23:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 16(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %res0 = insertelement <2 x double> undef, double %val0, i32 0
+ %res1 = insertelement <2 x double> %res0, double %val1, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_2i64_i64_12:
+; SSE: # BB#0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_2i64_i64_12:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 8(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_2i64_i64_12:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 8(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
+ %val0 = load i64, i64* %ptr0
+ %val1 = load i64, i64* %ptr1
+ %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
+ ret <2 x i64> %res1
+}
+
+define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_2345:
+; SSE: # BB#0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_2345:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 8(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_2345:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 8(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val2 = load float, float* %ptr2
+ %val3 = load float, float* %ptr3
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i32 2
+ %res3 = insertelement <4 x float> %res2, float %val3, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_3zuu:
+; SSE: # BB#0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_3zuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_3zuu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
+ %val0 = load float, float* %ptr0
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
+ ret <4 x float> %res1
+}
+
+define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_34uu:
+; SSE: # BB#0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_34uu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_34uu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ ret <4 x float> %res1
+}
+
+define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_4f32_f32_34z6:
+; SSE2: # BB#0:
+; SSE2-NEXT: movups 12(%rdi), %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_34z6:
+; SSE41: # BB#0:
+; SSE41-NEXT: movups 12(%rdi), %xmm1
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_34z6:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_34z6:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 12(%eax), %xmm1
+; X32-SSE-NEXT: xorps %xmm0, %xmm0
+; X32-SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val3 = load float, float* %ptr3
+ %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res3 = insertelement <4 x float> %res1, float %val3, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_45zz:
+; SSE: # BB#0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_45zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_45zz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ ret <4 x float> %res1
+}
+
+define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_4f32_f32_012u:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_012u:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_012u:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_012u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val2 = load float, float* %ptr2
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i32 2
+ %res3 = insertelement <4 x float> %res2, float undef, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_4f32_f32_019u:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_019u:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_019u:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_019u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val2 = load float, float* %ptr2
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i32 2
+ %res3 = insertelement <4 x float> %res2, float undef, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_23u5:
+; SSE: # BB#0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_23u5:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 8(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_23u5:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 8(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %val3 = load i32, i32* %ptr3
+ %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
+ ret <4 x i32> %res3
+}
+
+define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_3zuu:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_3zuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_3zuu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %val0 = load i32, i32* %ptr0
+ %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
+ ret <4 x i32> %res1
+}
+
+define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_34uu:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_34uu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_34uu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ ret <4 x i32> %res1
+}
+
+define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_45zz:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_45zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_45zz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ ret <4 x i32> %res1
+}
+
+define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_8i16_i16_23u567u9:
+; SSE: # BB#0:
+; SSE-NEXT: movups 4(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_8i16_i16_23u567u9:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 4(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_8i16_i16_23u567u9:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 4(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
+ %ptr4 = getelementptr inbounds i16, i16* %ptr, i64 6
+ %ptr5 = getelementptr inbounds i16, i16* %ptr, i64 7
+ %ptr7 = getelementptr inbounds i16, i16* %ptr, i64 9
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %val4 = load i16, i16* %ptr4
+ %val5 = load i16, i16* %ptr5
+ %val7 = load i16, i16* %ptr7
+ %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
+ %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
+ %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
+ ret <8 x i16> %res7
+}
+
+define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_8i16_i16_34uuuuuu:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_8i16_i16_34uuuuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_8i16_i16_34uuuuuu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
+ ret <8 x i16> %res1
+}
+
+define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_8i16_i16_45u7zzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_8i16_i16_45u7zzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_8i16_i16_45u7zzzz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
+ %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 0, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 0, i32 5
+ %res6 = insertelement <8 x i16> %res5, i16 0, i32 6
+ %res7 = insertelement <8 x i16> %res6, i16 0, i32 7
+ ret <8 x i16> %res7
+}
+
+define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups (%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %ptr4 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %ptr5 = getelementptr inbounds i8, i8* %ptr, i64 5
+ %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
+ %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
+ %ptr8 = getelementptr inbounds i8, i8* %ptr, i64 8
+ %ptr9 = getelementptr inbounds i8, i8* %ptr, i64 9
+ %ptrA = getelementptr inbounds i8, i8* %ptr, i64 10
+ %ptrB = getelementptr inbounds i8, i8* %ptr, i64 11
+ %ptrC = getelementptr inbounds i8, i8* %ptr, i64 12
+ %ptrD = getelementptr inbounds i8, i8* %ptr, i64 13
+ %ptrF = getelementptr inbounds i8, i8* %ptr, i64 15
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %val4 = load i8, i8* %ptr4
+ %val5 = load i8, i8* %ptr5
+ %val6 = load i8, i8* %ptr6
+ %val7 = load i8, i8* %ptr7
+ %val8 = load i8, i8* %ptr8
+ %val9 = load i8, i8* %ptr9
+ %valA = load i8, i8* %ptrA
+ %valB = load i8, i8* %ptrB
+ %valC = load i8, i8* %ptrC
+ %valD = load i8, i8* %ptrD
+ %valF = load i8, i8* %ptrF
+ %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
+ %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
+ %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
+ %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
+ %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
+ %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
+ %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
+ %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
+ %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
+ %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
+ %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
+ %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
+ ret <16 x i8> %resF
+}
+
+define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
+ %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
+ %res6 = insertelement <16 x i8> %res3, i8 0, i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 0, i32 7
+ %resD = insertelement <16 x i8> %res7, i8 0, i32 13
+ %resE = insertelement <16 x i8> %resD, i8 0, i32 14
+ %resF = insertelement <16 x i8> %resE, i8 0, i32 15
+ ret <16 x i8> %resF
+}
+
+define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
+ %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val2 = load i8, i8* %ptr2
+ %val3 = load i8, i8* %ptr3
+ %val6 = load i8, i8* %ptr6
+ %val7 = load i8, i8* %ptr7
+ %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
+ %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
+ %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
+ %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
+ %resD = insertelement <16 x i8> %res7, i8 0, i32 13
+ %resE = insertelement <16 x i8> %resD, i8 0, i32 14
+ %resF = insertelement <16 x i8> %resE, i8 0, i32 15
+ ret <16 x i8> %resF
+}
+
+define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
+; SSE-LABEL: merge_4i32_i32_combine:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movaps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: merge_4i32_i32_combine:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovaps %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_4i32_i32_combine:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovaps %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_4i32_i32_combine:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_combine:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: movaps %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+ %1 = getelementptr i32, i32* %src, i32 0
+ %2 = load i32, i32* %1
+ %3 = insertelement <4 x i32> undef, i32 %2, i32 0
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+ %5 = lshr <4 x i32> %4, <i32 0, i32 undef, i32 undef, i32 undef>
+ %6 = and <4 x i32> %5, <i32 -1, i32 0, i32 0, i32 0>
+ store <4 x i32> %6, <4 x i32>* %dst
+ ret void
+}
+
+;
+; Consecutive loads that include any volatile load must not be combined.
+;
+
+define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_2i64_i64_12_volatile:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_2i64_i64_12_volatile:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_2i64_i64_12_volatile:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: pinsrd $1, 12(%eax), %xmm0
+; X32-SSE-NEXT: pinsrd $2, 16(%eax), %xmm0
+; X32-SSE-NEXT: pinsrd $3, 20(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
+ %val0 = load volatile i64, i64* %ptr0
+ %val1 = load volatile i64, i64* %ptr1
+ %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
+ ret <2 x i64> %res1
+}
+
+define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_4f32_f32_2345_volatile:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_2345_volatile:
+; SSE41: # BB#0:
+; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_2345_volatile:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_2345_volatile:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
+ %val0 = load volatile float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val2 = load float, float* %ptr2
+ %val3 = load float, float* %ptr3
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i32 2
+ %res3 = insertelement <4 x float> %res2, float %val3, i32 3
+ ret <4 x float> %res3
+}
+
+;
+; Non-consecutive test.
+;
+
+define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_X0YY:
+; SSE: # BB#0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_X0YY:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: retl
+ %val0 = load float, float* %ptr0, align 4
+ %val1 = load float, float* %ptr1, align 4
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val1, i32 2
+ %res3 = insertelement <4 x float> %res2, float %val1, i32 3
+ ret <4 x float> %res3
+}
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
new file mode 100644
index 000000000000..8c2e93729004
--- /dev/null
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -0,0 +1,756 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
+;
+; Just one 32-bit run to make sure we do reasonable things.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
+
+define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_2f64_23:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 32(%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_2f64_23:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 32(%eax), %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+ %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
+ %val0 = load <2 x double>, <2 x double>* %ptr0
+ %val1 = load <2 x double>, <2 x double>* %ptr1
+ %res = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_2f64_2z:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps 32(%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_2f64_2z:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0
+; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+ %val0 = load <2 x double>, <2 x double>* %ptr0
+ %res = shufflevector <2 x double> %val0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <4 x double> @merge_4f64_f64_2345(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_2345:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 16(%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_2345:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 16(%eax), %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr2 = getelementptr inbounds double, double* %ptr, i64 4
+ %ptr3 = getelementptr inbounds double, double* %ptr, i64 5
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %val2 = load double, double* %ptr2
+ %val3 = load double, double* %ptr3
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ %res2 = insertelement <4 x double> %res1, double %val2, i32 2
+ %res3 = insertelement <4 x double> %res2, double %val3, i32 3
+ ret <4 x double> %res3
+}
+
+define <4 x double> @merge_4f64_f64_3zuu(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_3zuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_3zuu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
+ %val0 = load double, double* %ptr0
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double 0.0, i32 1
+ ret <4 x double> %res1
+}
+
+define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_34uu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 24(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_34uu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 24(%eax), %xmm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ ret <4 x double> %res1
+}
+
+define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_45zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 32(%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_45zz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 32(%eax), %xmm0
+; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 4
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 5
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %res0 = insertelement <4 x double> zeroinitializer, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ ret <4 x double> %res1
+}
+
+define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_34z6:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_34z6:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
+ %ptr3 = getelementptr inbounds double, double* %ptr, i64 6
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %val3 = load double, double* %ptr3
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ %res2 = insertelement <4 x double> %res1, double 0.0, i32 2
+ %res3 = insertelement <4 x double> %res2, double %val3, i32 3
+ ret <4 x double> %res3
+}
+
+define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4i64_2i64_3z:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps 48(%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4i64_2i64_3z:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovaps 48(%eax), %xmm0
+; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3
+ %val0 = load <2 x i64>, <2 x i64>* %ptr0
+ %res = shufflevector <2 x i64> %val0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @merge_4i64_i64_1234(i64* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4i64_i64_1234:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 8(%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4i64_i64_1234:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 8(%eax), %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
+ %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i64, i64* %ptr, i64 4
+ %val0 = load i64, i64* %ptr0
+ %val1 = load i64, i64* %ptr1
+ %val2 = load i64, i64* %ptr2
+ %val3 = load i64, i64* %ptr3
+ %res0 = insertelement <4 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %val1, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 %val2, i32 2
+ %res3 = insertelement <4 x i64> %res2, i64 %val3, i32 3
+ ret <4 x i64> %res3
+}
+
+define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4i64_i64_1zzu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4i64_i64_1zzu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %val0 = load i64, i64* %ptr0
+ %res0 = insertelement <4 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 0, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 0, i32 2
+ ret <4 x i64> %res2
+}
+
+define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4i64_i64_23zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 16(%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4i64_i64_23zz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 16(%eax), %xmm0
+; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 3
+ %val0 = load i64, i64* %ptr0
+ %val1 = load i64, i64* %ptr1
+ %res0 = insertelement <4 x i64> zeroinitializer, i64 %val0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %val1, i32 1
+ ret <4 x i64> %res1
+}
+
+define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8f32_2f32_23z5:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovupd 16(%rdi), %xmm0
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_8f32_2f32_23z5:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovupd 16(%rdi), %xmm0
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_8f32_2f32_23z5:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups 16(%rdi), %xmm0
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8f32_2f32_23z5:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
+ %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
+ %ptr3 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 5
+ %val0 = load <2 x float>, <2 x float>* %ptr0
+ %val1 = load <2 x float>, <2 x float>* %ptr1
+ %val3 = load <2 x float>, <2 x float>* %ptr3
+ %res01 = shufflevector <2 x float> %val0, <2 x float> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res23 = shufflevector <2 x float> zeroinitializer, <2 x float> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x float> %res01, <4 x float> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8f32_4f32_z2:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8f32_4f32_z2:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: vinsertf128 $1, 32(%eax), %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
+ %val1 = load <4 x float>, <4 x float>* %ptr1
+ %res = shufflevector <4 x float> zeroinitializer, <4 x float> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8f32_f32_12zzuuzz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8f32_f32_12zzuuzz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 2
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %res0 = insertelement <8 x float> undef, float %val0, i32 0
+ %res1 = insertelement <8 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <8 x float> %res1, float 0.0, i32 2
+ %res3 = insertelement <8 x float> %res2, float 0.0, i32 3
+ %res6 = insertelement <8 x float> %res3, float 0.0, i32 6
+ %res7 = insertelement <8 x float> %res6, float 0.0, i32 7
+ ret <8 x float> %res7
+}
+
+define <8 x float> @merge_8f32_f32_1u3u5zu8(float* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8f32_f32_1u3u5zu8:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8f32_f32_1u3u5zu8:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr4 = getelementptr inbounds float, float* %ptr, i64 5
+ %ptr7 = getelementptr inbounds float, float* %ptr, i64 8
+ %val0 = load float, float* %ptr0
+ %val2 = load float, float* %ptr2
+ %val4 = load float, float* %ptr4
+ %val7 = load float, float* %ptr7
+ %res0 = insertelement <8 x float> undef, float %val0, i32 0
+ %res2 = insertelement <8 x float> %res0, float %val2, i32 2
+ %res4 = insertelement <8 x float> %res2, float %val4, i32 4
+ %res5 = insertelement <8 x float> %res4, float 0.0, i32 5
+ %res7 = insertelement <8 x float> %res5, float %val7, i32 7
+ ret <8 x float> %res7
+}
+
+define <8 x i32> @merge_8i32_4i32_z3(<4 x i32>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8i32_4i32_z3:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8i32_4i32_z3:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: vinsertf128 $1, 48(%eax), %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
+ %val1 = load <4 x i32>, <4 x i32>* %ptr1
+ %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @merge_8i32_i32_56zz9uzz(i32* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8i32_i32_56zz9uzz:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_8i32_i32_56zz9uzz:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_8i32_i32_56zz9uzz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8i32_i32_56zz9uzz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 6
+ %ptr4 = getelementptr inbounds i32, i32* %ptr, i64 9
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %val4 = load i32, i32* %ptr4
+ %res0 = insertelement <8 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <8 x i32> %res0, i32 %val1, i32 1
+ %res2 = insertelement <8 x i32> %res1, i32 0, i32 2
+ %res3 = insertelement <8 x i32> %res2, i32 0, i32 3
+ %res4 = insertelement <8 x i32> %res3, i32 %val4, i32 4
+ %res6 = insertelement <8 x i32> %res4, i32 0, i32 6
+ %res7 = insertelement <8 x i32> %res6, i32 0, i32 7
+ ret <8 x i32> %res7
+}
+
+define <8 x i32> @merge_8i32_i32_1u3u5zu8(i32* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8i32_i32_1u3u5zu8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_8i32_i32_1u3u5zu8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_8i32_i32_1u3u5zu8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8i32_i32_1u3u5zu8:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
+ %ptr2 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr4 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %ptr7 = getelementptr inbounds i32, i32* %ptr, i64 8
+ %val0 = load i32, i32* %ptr0
+ %val2 = load i32, i32* %ptr2
+ %val4 = load i32, i32* %ptr4
+ %val7 = load i32, i32* %ptr7
+ %res0 = insertelement <8 x i32> undef, i32 %val0, i32 0
+ %res2 = insertelement <8 x i32> %res0, i32 %val2, i32 2
+ %res4 = insertelement <8 x i32> %res2, i32 %val4, i32 4
+ %res5 = insertelement <8 x i32> %res4, i32 0, i32 5
+ %res7 = insertelement <8 x i32> %res5, i32 %val7, i32 7
+ ret <8 x i32> %res7
+}
+
+define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 8
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 9
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <16 x i16> %res0, i16 %val1, i16 1
+ %res2 = insertelement <16 x i16> %res1, i16 0, i16 2
+ %res3 = insertelement <16 x i16> %res2, i16 0, i16 3
+ %res4 = insertelement <16 x i16> %res3, i16 0, i16 4
+ %resF = insertelement <16 x i16> %res4, i16 0, i16 15
+ ret <16 x i16> %resF
+}
+
+define <16 x i16> @merge_16i16_i16_45u7uuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <16 x i16> %res0, i16 %val1, i16 1
+ %res3 = insertelement <16 x i16> %res1, i16 %val3, i16 3
+ ret <16 x i16> %res3
+}
+
+define <16 x i16> @merge_16i16_i16_0uu3uuuuuuuuCuEF(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups (%eax), %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
+ %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
+ %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
+ %val0 = load i16, i16* %ptr0
+ %val3 = load i16, i16* %ptr3
+ %valC = load i16, i16* %ptrC
+ %valE = load i16, i16* %ptrE
+ %valF = load i16, i16* %ptrF
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
+ %resC = insertelement <16 x i16> %res3, i16 %valC, i16 12
+ %resE = insertelement <16 x i16> %resC, i16 %valE, i16 14
+ %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
+ ret <16 x i16> %resF
+}
+
+define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,0,0,0,0,0,0,0,0,65535,0,65535,65535]
+; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,0,0,0,0,0,0,0,0,65535,0,65535,65535]
+; X32-AVX-NEXT: vandps (%eax), %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
+ %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
+ %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
+ %val0 = load i16, i16* %ptr0
+ %val3 = load i16, i16* %ptr3
+ %valC = load i16, i16* %ptrC
+ %valE = load i16, i16* %ptrE
+ %valF = load i16, i16* %ptrF
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
+ %res4 = insertelement <16 x i16> %res3, i16 0, i16 4
+ %res5 = insertelement <16 x i16> %res4, i16 0, i16 5
+ %resC = insertelement <16 x i16> %res5, i16 %valC, i16 12
+ %resD = insertelement <16 x i16> %resC, i16 0, i16 13
+ %resE = insertelement <16 x i16> %resD, i16 %valE, i16 14
+ %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
+ ret <16 x i16> %resF
+}
+
+define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 5
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 7
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %res0 = insertelement <32 x i8> undef, i8 %val0, i8 0
+ %res1 = insertelement <32 x i8> %res0, i8 %val1, i8 1
+ %res3 = insertelement <32 x i8> %res1, i8 %val3, i8 3
+ ret <32 x i8> %res3
+}
+
+define <32 x i8> @merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 5
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %res0 = insertelement <32 x i8> undef, i8 %val0, i8 0
+ %res1 = insertelement <32 x i8> %res0, i8 %val1, i8 1
+ %res3 = insertelement <32 x i8> %res1, i8 %val3, i8 3
+ %resE = insertelement <32 x i8> %res3, i8 0, i8 14
+ %resF = insertelement <32 x i8> %resE, i8 0, i8 15
+ %resG = insertelement <32 x i8> %resF, i8 0, i8 16
+ %resH = insertelement <32 x i8> %resG, i8 0, i8 17
+ ret <32 x i8> %resH
+}
+
+;
+; Consecutive loads that include any volatile load must not be combined.
+;
+
+define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_4f64_f64_34uz_volatile:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_4f64_f64_34uz_volatile:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_4f64_f64_34uz_volatile:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_34uz_volatile:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
+ %val0 = load volatile double, double* %ptr0
+ %val1 = load volatile double, double* %ptr1
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ %res3 = insertelement <4 x double> %res1, double 0.0, i32 3
+ ret <4 x double> %res3
+}
+
+define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
+; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: vpinsrw $0, (%eax), %xmm0, %xmm1
+; X32-AVX-NEXT: vpinsrw $3, 6(%eax), %xmm1, %xmm1
+; X32-AVX-NEXT: vpinsrw $4, 24(%eax), %xmm0, %xmm0
+; X32-AVX-NEXT: vpinsrw $6, 28(%eax), %xmm0, %xmm0
+; X32-AVX-NEXT: vpinsrw $7, 30(%eax), %xmm0, %xmm0
+; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
+ %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
+ %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
+ %val0 = load volatile i16, i16* %ptr0
+ %val3 = load i16, i16* %ptr3
+ %valC = load i16, i16* %ptrC
+ %valE = load i16, i16* %ptrE
+ %valF = load volatile i16, i16* %ptrF
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
+ %res4 = insertelement <16 x i16> %res3, i16 0, i16 4
+ %res5 = insertelement <16 x i16> %res4, i16 0, i16 5
+ %resC = insertelement <16 x i16> %res5, i16 %valC, i16 12
+ %resD = insertelement <16 x i16> %resC, i16 0, i16 13
+ %resE = insertelement <16 x i16> %resD, i16 %valE, i16 14
+ %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
+ ret <16 x i16> %resF
+}
diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll
new file mode 100644
index 000000000000..bb9a342ae9ae
--- /dev/null
+++ b/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -0,0 +1,718 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+;
+; Just one 32-bit run to make sure we do reasonable things.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32-AVX512F
+
+define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_2f64_12u4:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 16(%rdi), %ymm0
+; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_2f64_12u4:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 16(%eax), %ymm0
+; X32-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1
+ %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+ %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 4
+ %val0 = load <2 x double>, <2 x double>* %ptr0
+ %val1 = load <2 x double>, <2 x double>* %ptr1
+ %val3 = load <2 x double>, <2 x double>* %ptr3
+ %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res23 = shufflevector <2 x double> undef, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_2f64_23z5:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 32(%rdi), %ymm0
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_2f64_23z5:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 32(%eax), %ymm0
+; X32-AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+ %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
+ %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 5
+ %val0 = load <2 x double>, <2 x double>* %ptr0
+ %val1 = load <2 x double>, <2 x double>* %ptr1
+ %val3 = load <2 x double>, <2 x double>* %ptr3
+ %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res23 = shufflevector <2 x double> zeroinitializer, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_4f64_z2:
+; ALL: # BB#0:
+; ALL-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_4f64_z2:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr1 = getelementptr inbounds <4 x double>, <4 x double>* %ptr, i64 2
+ %val1 = load <4 x double>, <4 x double>* %ptr1
+ %res = shufflevector <4 x double> zeroinitializer, <4 x double> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_23uuuuu9:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 16(%rdi), %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 16(%eax), %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %val7 = load double, double* %ptr7
+ %res0 = insertelement <8 x double> undef, double %val0, i32 0
+ %res1 = insertelement <8 x double> %res0, double %val1, i32 1
+ %res7 = insertelement <8 x double> %res1, double %val7, i32 7
+ ret <8 x double> %res7
+}
+
+define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_12zzuuzz:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 8(%rdi), %xmm0
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 8(%eax), %xmm0
+; X32-AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 2
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %res0 = insertelement <8 x double> undef, double %val0, i32 0
+ %res1 = insertelement <8 x double> %res0, double %val1, i32 1
+ %res2 = insertelement <8 x double> %res1, double 0.0, i32 2
+ %res3 = insertelement <8 x double> %res2, double 0.0, i32 3
+ %res6 = insertelement <8 x double> %res3, double 0.0, i32 6
+ %res7 = insertelement <8 x double> %res6, double 0.0, i32 7
+ ret <8 x double> %res7
+}
+
+define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 8(%rdi), %zmm0
+; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
+; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0
+; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
+; X32-AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
+ %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr4 = getelementptr inbounds double, double* %ptr, i64 5
+ %ptr7 = getelementptr inbounds double, double* %ptr, i64 8
+ %val0 = load double, double* %ptr0
+ %val2 = load double, double* %ptr2
+ %val4 = load double, double* %ptr4
+ %val7 = load double, double* %ptr7
+ %res0 = insertelement <8 x double> undef, double %val0, i32 0
+ %res2 = insertelement <8 x double> %res0, double %val2, i32 2
+ %res4 = insertelement <8 x double> %res2, double %val4, i32 4
+ %res5 = insertelement <8 x double> %res4, double 0.0, i32 5
+ %res7 = insertelement <8 x double> %res5, double %val7, i32 7
+ ret <8 x double> %res7
+}
+
+define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8i64_4i64_z3:
+; ALL: # BB#0:
+; ALL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, 96(%rdi), %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8i64_4i64_z3:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinserti64x4 $1, 96(%eax), %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr1 = getelementptr inbounds <4 x i64>, <4 x i64>* %ptr, i64 3
+ %val1 = load <4 x i64>, <4 x i64>* %ptr1
+ %res = shufflevector <4 x i64> zeroinitializer, <4 x i64> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8i64_i64_56zz9uzz:
+; ALL: # BB#0:
+; ALL-NEXT: vmovdqu 40(%rdi), %xmm0
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovdqu 40(%eax), %xmm0
+; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 5
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 6
+ %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 9
+ %val0 = load i64, i64* %ptr0
+ %val1 = load i64, i64* %ptr1
+ %val4 = load i64, i64* %ptr4
+ %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <8 x i64> %res0, i64 %val1, i32 1
+ %res2 = insertelement <8 x i64> %res1, i64 0, i32 2
+ %res3 = insertelement <8 x i64> %res2, i64 0, i32 3
+ %res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
+ %res6 = insertelement <8 x i64> %res4, i64 0, i32 6
+ %res7 = insertelement <8 x i64> %res6, i64 0, i32 7
+ ret <8 x i64> %res7
+}
+
+define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
+; ALL: # BB#0:
+; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
+; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
+; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
+; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
+; X32-AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
+ %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5
+ %ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8
+ %val0 = load i64, i64* %ptr0
+ %val2 = load i64, i64* %ptr2
+ %val4 = load i64, i64* %ptr4
+ %val7 = load i64, i64* %ptr7
+ %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
+ %res2 = insertelement <8 x i64> %res0, i64 %val2, i32 2
+ %res4 = insertelement <8 x i64> %res2, i64 %val4, i32 4
+ %res5 = insertelement <8 x i64> %res4, i64 0, i32 5
+ %res7 = insertelement <8 x i64> %res5, i64 %val7, i32 7
+ ret <8 x i64> %res7
+}
+
+define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 9
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %res0 = insertelement <16 x float> undef, float %val0, i32 0
+ %res1 = insertelement <16 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <16 x float> %res1, float 0.0, i32 2
+ %res3 = insertelement <16 x float> %res2, float 0.0, i32 3
+ %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
+ %resF = insertelement <16 x float> %res4, float 0.0, i32 15
+ ret <16 x float> %resF
+}
+
+define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups 16(%rdi), %xmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovups 16(%eax), %xmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 7
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val3 = load float, float* %ptr3
+ %res0 = insertelement <16 x float> undef, float %val0, i32 0
+ %res1 = insertelement <16 x float> %res0, float %val1, i32 1
+ %res3 = insertelement <16 x float> %res1, float %val3, i32 3
+ ret <16 x float> %res3
+}
+
+define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups (%rdi), %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptrC = getelementptr inbounds float, float* %ptr, i64 12
+ %ptrE = getelementptr inbounds float, float* %ptr, i64 14
+ %ptrF = getelementptr inbounds float, float* %ptr, i64 15
+ %val0 = load float, float* %ptr0
+ %val3 = load float, float* %ptr3
+ %valC = load float, float* %ptrC
+ %valE = load float, float* %ptrE
+ %valF = load float, float* %ptrF
+ %res0 = insertelement <16 x float> undef, float %val0, i32 0
+ %res3 = insertelement <16 x float> %res0, float %val3, i32 3
+ %resC = insertelement <16 x float> %res3, float %valC, i32 12
+ %resE = insertelement <16 x float> %resC, float %valE, i32 14
+ %resF = insertelement <16 x float> %resE, float %valF, i32 15
+ ret <16 x float> %resF
+}
+
+define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups (%rdi), %zmm0
+; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
+; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; X32-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptrC = getelementptr inbounds float, float* %ptr, i64 12
+ %ptrE = getelementptr inbounds float, float* %ptr, i64 14
+ %ptrF = getelementptr inbounds float, float* %ptr, i64 15
+ %val0 = load float, float* %ptr0
+ %val3 = load float, float* %ptr3
+ %valC = load float, float* %ptrC
+ %valE = load float, float* %ptrE
+ %valF = load float, float* %ptrF
+ %res0 = insertelement <16 x float> undef, float %val0, i32 0
+ %res3 = insertelement <16 x float> %res0, float %val3, i32 3
+ %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
+ %res5 = insertelement <16 x float> %res4, float 0.0, i32 5
+ %resC = insertelement <16 x float> %res5, float %valC, i32 12
+ %resD = insertelement <16 x float> %resC, float 0.0, i32 13
+ %resE = insertelement <16 x float> %resD, float %valE, i32 14
+ %resF = insertelement <16 x float> %resE, float %valF, i32 15
+ ret <16 x float> %resF
+}
+
+define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
+ %res2 = insertelement <16 x i32> %res1, i32 0, i32 2
+ %res3 = insertelement <16 x i32> %res2, i32 0, i32 3
+ %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
+ %resF = insertelement <16 x i32> %res4, i32 0, i32 15
+ ret <16 x i32> %resF
+}
+
+define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups 8(%rdi), %xmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %val3 = load i32, i32* %ptr3
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
+ %res3 = insertelement <16 x i32> %res1, i32 %val3, i32 3
+ ret <16 x i32> %res3
+}
+
+define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
+; ALL: # BB#0:
+; ALL-NEXT: vmovdqu32 (%rdi), %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
+ %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
+ %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
+ %val0 = load i32, i32* %ptr0
+ %val3 = load i32, i32* %ptr3
+ %valC = load i32, i32* %ptrC
+ %valE = load i32, i32* %ptrE
+ %valF = load i32, i32* %ptrF
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
+ %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
+ %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
+ %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
+ ret <16 x i32> %resF
+}
+
+define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
+; ALL: # BB#0:
+; ALL-NEXT: vmovdqu32 (%rdi), %zmm0
+; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0
+; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; X32-AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
+ %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
+ %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
+ %val0 = load i32, i32* %ptr0
+ %val3 = load i32, i32* %ptr3
+ %valC = load i32, i32* %ptrC
+ %valE = load i32, i32* %ptrE
+ %valF = load i32, i32* %ptrF
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
+ %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
+ %res5 = insertelement <16 x i32> %res4, i32 0, i32 5
+ %resC = insertelement <16 x i32> %res5, i32 %valC, i32 12
+ %resD = insertelement <16 x i32> %resC, i32 0, i32 13
+ %resE = insertelement <16 x i32> %resD, i32 %valE, i32 14
+ %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
+ ret <16 x i32> %resF
+}
+
+define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
+ %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
+ %res30 = insertelement <32 x i16> %res3, i16 0, i16 30
+ %res31 = insertelement <32 x i16> %res30, i16 0, i16 31
+ ret <32 x i16> %res31
+}
+
+define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
+ %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
+ ret <32 x i16> %res3
+}
+
+define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
+ %res3 = insertelement <32 x i16> %res1, i16 0, i16 3
+ %resE = insertelement <32 x i16> %res3, i16 0, i16 14
+ %resF = insertelement <32 x i16> %resE, i16 0, i16 15
+ %resG = insertelement <32 x i16> %resF, i16 0, i16 16
+ %resH = insertelement <32 x i16> %resG, i16 0, i16 17
+ ret <32 x i16> %resH
+}
+
+define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
+; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 8
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %val7 = load i8, i8* %ptr7
+ %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
+ %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
+ %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
+ %res7 = insertelement <64 x i8> %res3, i8 %val7, i8 7
+ %res14 = insertelement <64 x i8> %res7, i8 0, i8 14
+ %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
+ %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
+ %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
+ %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
+ ret <64 x i8> %res63
+}
+
+define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
+; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
+ %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
+ %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
+ %res14 = insertelement <64 x i8> %res3, i8 0, i8 14
+ %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
+ %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
+ %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
+ %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
+ ret <64 x i8> %res63
+}
+
+;
+; consecutive loads including any/all volatiles must not be combined
+;
+
+define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; ALL-NEXT: vbroadcastsd 72(%rdi), %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-AVX512F-NEXT: vbroadcastsd 72(%eax), %ymm1
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
+ %val0 = load volatile double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %val7 = load double, double* %ptr7
+ %res0 = insertelement <8 x double> undef, double %val0, i32 0
+ %res1 = insertelement <8 x double> %res0, double %val1, i32 1
+ %res7 = insertelement <8 x double> %res1, double %val7, i32 7
+ ret <8 x double> %res7
+}
+
+define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: vpinsrd $3, 12(%rdi), %xmm0, %xmm0
+; ALL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vpinsrd $2, 56(%rdi), %xmm1, %xmm1
+; ALL-NEXT: vpinsrd $3, 60(%rdi), %xmm1, %xmm1
+; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX512F-NEXT: vpinsrd $3, 12(%eax), %xmm0, %xmm0
+; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-AVX512F-NEXT: vpinsrd $2, 56(%eax), %xmm1, %xmm1
+; X32-AVX512F-NEXT: vpinsrd $3, 60(%eax), %xmm1, %xmm1
+; X32-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
+ %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
+ %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
+ %val0 = load volatile i32, i32* %ptr0
+ %val3 = load volatile i32, i32* %ptr3
+ %valC = load volatile i32, i32* %ptrC
+ %valE = load volatile i32, i32* %ptrE
+ %valF = load volatile i32, i32* %ptrF
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
+ %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
+ %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
+ %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
+ ret <16 x i32> %resF
+}
diff --git a/test/CodeGen/X86/merge-sp-update-lea.ll b/test/CodeGen/X86/merge-sp-update-lea.ll
new file mode 100644
index 000000000000..70209a2aec92
--- /dev/null
+++ b/test/CodeGen/X86/merge-sp-update-lea.ll
@@ -0,0 +1,32 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.5"
+
+; Check that the merging of SP updates, when LEAs are involved, happens
+; correctly.
+; CHECK-LABEL: useLEA:
+; CHECK: calll _realloc
+; Make sure that the offset we get here is 8 + 16.
+; We used to have 8 + 1 because we were not reading the right immediate from
+; the LEA instruction.
+; CHECK-NEXT: leal 24(%esp), %esp
+define noalias i8* @useLEA(i8* nocapture %p, i32 %nbytes) #0 {
+entry:
+ %cmp = icmp slt i32 %nbytes, 0
+ br i1 %cmp, label %cond.end.3, label %cond.false
+
+cond.false: ; preds = %entry
+ %tobool = icmp ne i32 %nbytes, 0
+ %cond = select i1 %tobool, i32 %nbytes, i32 1
+ %call = tail call i8* @realloc(i8* %p, i32 %cond)
+ br label %cond.end.3
+
+cond.end.3: ; preds = %entry, %cond.false
+ %cond4 = phi i8* [ %call, %cond.false ], [ null, %entry ]
+ ret i8* %cond4
+}
+
+; Function Attrs: nounwind optsize
+declare noalias i8* @realloc(i8* nocapture, i32)
+
+attributes #0 = { nounwind optsize ssp "disable-tail-calls"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-features"="+lea-sp" }
diff --git a/test/CodeGen/X86/merge-store-partially-alias-loads.ll b/test/CodeGen/X86/merge-store-partially-alias-loads.ll
index 8e148aa76d38..735e64a076d0 100644
--- a/test/CodeGen/X86/merge-store-partially-alias-loads.ll
+++ b/test/CodeGen/X86/merge-store-partially-alias-loads.ll
@@ -6,10 +6,10 @@
; they must not be placed on the same chain after merging.
; X86-LABEL: {{^}}merge_store_partial_overlap_load:
-; X86-DAG: movw ([[BASEREG:%[a-z]+]]), [[LO2:%[a-z]+]]
+; X86-DAG: movzwl ([[BASEREG:%[a-z]+]]), %e[[LO2:[a-z]+]]
; X86-DAG: movb 2([[BASEREG]]), [[HI1:%[a-z]+]]
-; X86-NEXT: movw [[LO2]], 1([[BASEREG]])
+; X86-NEXT: movw %[[LO2]], 1([[BASEREG]])
; X86-NEXT: movb [[HI1]], 3([[BASEREG]])
; X86-NEXT: retq
diff --git a/test/CodeGen/X86/mfence.ll b/test/CodeGen/X86/mfence.ll
index 6056adddcb4b..b67a5c355044 100644
--- a/test/CodeGen/X86/mfence.ll
+++ b/test/CodeGen/X86/mfence.ll
@@ -1,8 +1,37 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep sfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep lfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep mfence
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X64
+
+; It doesn't matter if an x86-64 target has specified "no-sse2"; we can still use mfence.
define void @test() {
+; X32-LABEL: test:
+; X32: # BB#0:
+; X32-NEXT: mfence
+; X32-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: mfence
+; X64-NEXT: retq
fence seq_cst
ret void
}
+
+define i32 @fence(i32* %ptr) {
+; X32-LABEL: fence:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: mfence
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: fence:
+; X64: # BB#0:
+; X64-NEXT: mfence
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: retq
+ %atomic = atomicrmw add i32* %ptr, i32 0 seq_cst
+ ret i32 %atomic
+}
+
diff --git a/test/CodeGen/X86/mingw-alloca.ll b/test/CodeGen/X86/mingw-alloca.ll
index cbad4fbfa2ea..44097e983689 100644
--- a/test/CodeGen/X86/mingw-alloca.ll
+++ b/test/CodeGen/X86/mingw-alloca.ll
@@ -22,12 +22,12 @@ entry:
; COFF: andl $-16, %esp
; COFF: pushl %eax
; COFF: calll __alloca
-; COFF: movl 8028(%esp), %eax
+; COFF: movl 8012(%esp), %eax
; ELF: foo2:
; ELF: andl $-16, %esp
; ELF: pushl %eax
; ELF: calll _alloca
-; ELF: movl 8028(%esp), %eax
+; ELF: movl 8012(%esp), %eax
%A2 = alloca [2000 x i32], align 16 ; <[2000 x i32]*> [#uses=1]
%A2.sub = getelementptr [2000 x i32], [2000 x i32]* %A2, i32 0, i32 0 ; <i32*> [#uses=1]
call void @bar2( i32* %A2.sub, i32 %N )
diff --git a/test/CodeGen/X86/misched-aa-colored.ll b/test/CodeGen/X86/misched-aa-colored.ll
index ef7b98ac9c69..9f8f3a946e66 100644
--- a/test/CodeGen/X86/misched-aa-colored.ll
+++ b/test/CodeGen/X86/misched-aa-colored.ll
@@ -155,6 +155,7 @@ entry:
%ref.tmp.i = alloca %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199", align 8
%Op.i = alloca %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", align 8
%0 = bitcast %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i to i8*
+ call void @llvm.lifetime.start(i64 24, i8* %0) #1
%retval.sroa.0.0.idx.i36 = getelementptr inbounds %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199", %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i, i64 0, i32 1, i32 0, i32 0
%retval.sroa.0.0.copyload.i37 = load i32, i32* %retval.sroa.0.0.idx.i36, align 8
call void @llvm.lifetime.end(i64 24, i8* %0) #1
diff --git a/test/CodeGen/X86/misched-code-difference-with-debug.ll b/test/CodeGen/X86/misched-code-difference-with-debug.ll
index 0a1ea830a41d..db218f4bd097 100644
--- a/test/CodeGen/X86/misched-code-difference-with-debug.ll
+++ b/test/CodeGen/X86/misched-code-difference-with-debug.ll
@@ -49,10 +49,10 @@ entry:
%0 = load i8, i8* @argc, align 1
tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !29), !dbg !DILocation(scope: !13)
%conv = sext i8 %0 to i32
- tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29), !dbg !DILocation(scope: !13)
+ tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !DIExpression(DW_OP_deref)), !dbg !DILocation(scope: !13)
%call = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
%1 = load i8, i8* @argc, align 1
- call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29), !dbg !DILocation(scope: !13)
+ call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !DIExpression(DW_OP_deref)), !dbg !DILocation(scope: !13)
%call2 = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
ret void
}
@@ -62,25 +62,24 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, enums: !2, retainedTypes: !3, subprograms: !12, globals: !20, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, enums: !2, retainedTypes: !3, globals: !20, imports: !2, emissionKind: FullDebug)
!1 = !DIFile(filename: "test.cpp", directory: "")
!2 = !{}
!3 = !{!4}
!4 = !DICompositeType(tag: DW_TAG_class_type, name: "C", line: 2, size: 8, align: 8, file: !1, elements: !5, identifier: "_ZTS1C")
!5 = !{!6}
-!6 = !DISubprogram(name: "test", file: !1, scope: !"_ZTS1C", type: !7, isDefinition: false)
+!6 = !DISubprogram(name: "test", file: !1, scope: !4, type: !7, isDefinition: false)
!7 = !DISubroutineType(types: !8)
!8 = !{!9, !10, !11, !11, !11, null}
!9 = !DIBasicType(encoding: DW_ATE_signed, size: 32, align: 32, name: "int")
-!10 = !DIDerivedType(baseType: !"_ZTS1C", tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial)
+!10 = !DIDerivedType(baseType: !4, tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial)
!11 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!12 = !{!13}
-!13 = distinct !DISubprogram(name: "test_with_debug", linkageName: "test_with_debug", line: 6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !14, type: !15, variables: !17)
+!13 = distinct !DISubprogram(name: "test_with_debug", linkageName: "test_with_debug", line: 6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 6, file: !1, scope: !14, type: !15, variables: !17)
!14 = !DIFile(filename: "test.cpp", directory: "")
!15 = !DISubroutineType(types: !16)
!16 = !{null}
!17 = !{!18, !19}
-!18 = !DILocalVariable(name: "c", line: 7, scope: !13, file: !14, type: !"_ZTS1C")
+!18 = !DILocalVariable(name: "c", line: 7, scope: !13, file: !14, type: !4)
!19 = !DILocalVariable(name: "lc", line: 8, scope: !13, file: !14, type: !11)
!20 = !{!21}
!21 = !DIGlobalVariable(name: "argc", line: 1, isLocal: false, isDefinition: true, scope: null, file: !14, type: !11, variable: i8* @argc)
diff --git a/test/CodeGen/X86/misched-ilp.ll b/test/CodeGen/X86/misched-ilp.ll
index 4ca296ca92e5..2babae25ea49 100644
--- a/test/CodeGen/X86/misched-ilp.ll
+++ b/test/CodeGen/X86/misched-ilp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=nocona -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=nocona -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s
;
; Basic verification of the ScheduleDAGILP metric.
;
diff --git a/test/CodeGen/X86/mmx-bitcast-fold.ll b/test/CodeGen/X86/mmx-bitcast-fold.ll
new file mode 100644
index 000000000000..fc7ce73a441e
--- /dev/null
+++ b/test/CodeGen/X86/mmx-bitcast-fold.ll
@@ -0,0 +1,12 @@
+; RUN: opt -mtriple=x86_64-- -early-cse < %s -S | FileCheck %s
+
+; CHECK: @foo(x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
+
+define void @bar() {
+entry:
+ %0 = bitcast double 0.0 to x86_mmx
+ %1 = call x86_mmx @foo(x86_mmx %0)
+ ret void
+}
+
+declare x86_mmx @foo(x86_mmx)
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index 07d497b9f0a9..8e964bf16898 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -59,9 +59,10 @@ entry:
%0 = load i64, i64 addrspace(256)* %p
%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0
%1 = bitcast <2 x i64> %tmp2 to <8 x i16>
- %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone
- %3 = bitcast <4 x i32> %2 to <2 x i64>
- ret <2 x i64> %3
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i16> %2 to <4 x i32>
+ %4 = bitcast <4 x i32> %3 to <2 x i64>
+ ret <2 x i64> %4
}
; The two loads here both look identical to selection DAG, except for their
@@ -90,5 +91,3 @@ entry:
%tmp4 = add i32 %tmp1, %tmp3
ret i32 %tmp4
}
-
-declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/movmsk.ll b/test/CodeGen/X86/movmsk.ll
index a7ebebca4b72..1caa22a15947 100644
--- a/test/CodeGen/X86/movmsk.ll
+++ b/test/CodeGen/X86/movmsk.ll
@@ -1,12 +1,17 @@
-; RUN: llc -mcpu=core2 < %s | FileCheck %s
-; ModuleID = '<stdin>'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-macosx10.6.6"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.6.6 -mattr=+sse4.1 | FileCheck %s
%0 = type { double }
%union.anon = type { float }
define i32 @double_signbit(double %d1) nounwind uwtable readnone ssp {
+; CHECK-LABEL: double_signbit:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movmskpd %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca double, align 8
%__u.i = alloca %0, align 8
@@ -16,15 +21,20 @@ entry:
%__f.i = getelementptr inbounds %0, %0* %__u.i, i64 0, i32 0
store double %d1, double* %__f.i, align 8
%tmp = bitcast double %d1 to i64
-; CHECK-NOT: shr
-; CHECK: movmskpd
-; CHECK-NEXT: and
%tmp1 = lshr i64 %tmp, 63
%shr.i = trunc i64 %tmp1 to i32
ret i32 %shr.i
}
define i32 @double_add_signbit(double %d1, double %d2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: double_add_signbit:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: addsd %xmm1, %xmm0
+; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movmskpd %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca double, align 8
%__u.i = alloca %0, align 8
@@ -35,15 +45,19 @@ entry:
%__f.i = getelementptr inbounds %0, %0* %__u.i, i64 0, i32 0
store double %add, double* %__f.i, align 8
%tmp = bitcast double %add to i64
-; CHECK-NOT: shr
-; CHECK: movmskpd
-; CHECK-NEXT: and
%tmp1 = lshr i64 %tmp, 63
%shr.i = trunc i64 %tmp1 to i32
ret i32 %shr.i
}
define i32 @float_signbit(float %f1) nounwind uwtable readnone ssp {
+; CHECK-LABEL: float_signbit:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movmskps %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca float, align 4
%__u.i = alloca %union.anon, align 4
@@ -53,14 +67,19 @@ entry:
%__f.i = getelementptr inbounds %union.anon, %union.anon* %__u.i, i64 0, i32 0
store float %f1, float* %__f.i, align 4
%2 = bitcast float %f1 to i32
-; CHECK-NOT: shr
-; CHECK: movmskps
-; CHECK-NEXT: and
%shr.i = lshr i32 %2, 31
ret i32 %shr.i
}
define i32 @float_add_signbit(float %f1, float %f2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: float_add_signbit:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: addss %xmm1, %xmm0
+; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movmskps %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca float, align 4
%__u.i = alloca %union.anon, align 4
@@ -71,21 +90,21 @@ entry:
%__f.i = getelementptr inbounds %union.anon, %union.anon* %__u.i, i64 0, i32 0
store float %add, float* %__f.i, align 4
%2 = bitcast float %add to i32
-; CHECK-NOT: shr
-; CHECK: movmskps
-; CHECK-NEXT: and
%shr.i = lshr i32 %2, 31
ret i32 %shr.i
}
; PR11570
-define void @float_call_signbit(double %n) {
-entry:
; FIXME: This should also use movmskps; we don't form the FGETSIGN node
; in this case, though.
+define void @float_call_signbit(double %n) {
; CHECK-LABEL: float_call_signbit:
-; CHECK: movd %xmm0, %rdi
-; FIXME
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movd %xmm0, %rdi
+; CHECK-NEXT: shrq $63, %rdi
+; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<kill>
+; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL
+entry:
%t0 = bitcast double %n to i64
%tobool.i.i.i.i = icmp slt i64 %t0, 0
tail call void @float_call_signbit_callee(i1 zeroext %tobool.i.i.i.i)
@@ -98,10 +117,12 @@ declare void @float_call_signbit_callee(i1 zeroext)
; movmskp{s|d} only set low 4/2 bits, high bits are known zero
define i32 @t1(<4 x float> %x, i32* nocapture %indexTable) nounwind uwtable readonly ssp {
-entry:
; CHECK-LABEL: t1:
-; CHECK: movmskps
-; CHECK-NOT: movslq
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movmskps %xmm0, %eax
+; CHECK-NEXT: movl (%rdi,%rax,4), %eax
+; CHECK-NEXT: retq
+entry:
%0 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %x) nounwind
%idxprom = sext i32 %0 to i64
%arrayidx = getelementptr inbounds i32, i32* %indexTable, i64 %idxprom
@@ -110,10 +131,12 @@ entry:
}
define i32 @t2(<4 x float> %x, i32* nocapture %indexTable) nounwind uwtable readonly ssp {
-entry:
; CHECK-LABEL: t2:
-; CHECK: movmskpd
-; CHECK-NOT: movslq
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movmskpd %xmm0, %eax
+; CHECK-NEXT: movl (%rdi,%rax,4), %eax
+; CHECK-NEXT: retq
+entry:
%0 = bitcast <4 x float> %x to <2 x double>
%1 = tail call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %0) nounwind
%idxprom = sext i32 %1 to i64
diff --git a/test/CodeGen/X86/movpc32-check.ll b/test/CodeGen/X86/movpc32-check.ll
index 606af3c898f4..f50613e9c718 100644
--- a/test/CodeGen/X86/movpc32-check.ll
+++ b/test/CodeGen/X86/movpc32-check.ll
@@ -19,11 +19,10 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (http://llvm.org/git/clang.git 3490ab8630d5643f71f1f04e46984f05b27b8d67) (http://llvm.org/git/llvm.git d2643e2ff955ed234944fe3c6b4ffc1250085843)", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (http://llvm.org/git/clang.git 3490ab8630d5643f71f1f04e46984f05b27b8d67) (http://llvm.org/git/llvm.git d2643e2ff955ed234944fe3c6b4ffc1250085843)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "test.c", directory: "movpc-test")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: false, variables: !2)
+!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: false, unit: !0, variables: !2)
!5 = !DISubroutineType(types: !6)
!6 = !{null}
!7 = !{i32 2, !"Dwarf Version", i32 4}
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index de4c87cf30ad..d715ccfa8c69 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -1,6 +1,8 @@
; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=i686-windows -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=i686-windows -stackrealign -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
+; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s -check-prefix=LINUX
%class.Class = type { i32 }
%struct.s = type { i64 }
@@ -11,26 +13,14 @@ declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32
declare void @oneparam(i32 %a)
declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)
+declare void @inalloca(<{ %struct.s }>* inalloca)
-; Here, we should have a reserved frame, so we don't expect pushes
-; NORMAL-LABEL: test1:
-; NORMAL: subl $16, %esp
-; NORMAL-NEXT: movl $4, 12(%esp)
-; NORMAL-NEXT: movl $3, 8(%esp)
-; NORMAL-NEXT: movl $2, 4(%esp)
-; NORMAL-NEXT: movl $1, (%esp)
-; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
-define void @test1() {
-entry:
- call void @good(i32 1, i32 2, i32 3, i32 4)
- ret void
-}
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
-; We're optimizing for code size, so we should get pushes for x86,
-; even though there is a reserved call frame.
-; Make sure we don't touch x86-64
-; NORMAL-LABEL: test1b:
+; We should get pushes for x86, even though there is a reserved call frame.
+; Make sure we don't touch x86-64, and that turning it off works.
+; NORMAL-LABEL: test1:
; NORMAL-NOT: subl {{.*}} %esp
; NORMAL: pushl $4
; NORMAL-NEXT: pushl $3
@@ -38,28 +28,21 @@ entry:
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
; NORMAL-NEXT: addl $16, %esp
-; X64-LABEL: test1b:
+; X64-LABEL: test1:
; X64: movl $1, %ecx
; X64-NEXT: movl $2, %edx
; X64-NEXT: movl $3, %r8d
; X64-NEXT: movl $4, %r9d
; X64-NEXT: callq good
-define void @test1b() optsize {
-entry:
- call void @good(i32 1, i32 2, i32 3, i32 4)
- ret void
-}
-
-; Same as above, but for minsize
-; NORMAL-LABEL: test1c:
-; NORMAL-NOT: subl {{.*}} %esp
-; NORMAL: pushl $4
-; NORMAL-NEXT: pushl $3
-; NORMAL-NEXT: pushl $2
-; NORMAL-NEXT: pushl $1
-; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
-define void @test1c() minsize {
+; NOPUSH-LABEL: test1:
+; NOPUSH: subl $16, %esp
+; NOPUSH-NEXT: movl $4, 12(%esp)
+; NOPUSH-NEXT: movl $3, 8(%esp)
+; NOPUSH-NEXT: movl $2, 4(%esp)
+; NOPUSH-NEXT: movl $1, (%esp)
+; NOPUSH-NEXT: call
+; NOPUSH-NEXT: addl $16, %esp
+define void @test1() {
entry:
call void @good(i32 1, i32 2, i32 3, i32 4)
ret void
@@ -245,8 +228,7 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
-; NORMAL-NEXT: subl $20, %esp
+; NORMAL-NEXT: subl $4, %esp
; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]]
; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]]
; NORMAL-NEXT: movl [[E2]], 4(%esp)
@@ -283,7 +265,7 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: calll *16(%esp)
-; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: addl $24, %esp
define void @test10() optsize {
%stack_fptr = alloca void (i32, i32, i32, i32)*
store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr
@@ -336,8 +318,7 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: calll _good
-; NORMAL-NEXT: addl $16, %esp
-; NORMAL-NEXT: subl $20, %esp
+; NORMAL-NEXT: subl $4, %esp
; NORMAL: movl $8, 16(%esp)
; NORMAL-NEXT: movl $7, 12(%esp)
; NORMAL-NEXT: movl $6, 8(%esp)
@@ -380,3 +361,54 @@ entry:
call void @good(i32 %val1, i32 %val2, i32 %val3, i32 %add)
ret i32* %ptr3
}
+
+; Make sure to fold adjacent stack adjustments.
+; LINUX-LABEL: pr27140:
+; LINUX: subl $12, %esp
+; LINUX: .cfi_def_cfa_offset 16
+; LINUX-NOT: sub
+; LINUX: pushl $4
+; LINUX: .cfi_adjust_cfa_offset 4
+; LINUX: pushl $3
+; LINUX: .cfi_adjust_cfa_offset 4
+; LINUX: pushl $2
+; LINUX: .cfi_adjust_cfa_offset 4
+; LINUX: pushl $1
+; LINUX: .cfi_adjust_cfa_offset 4
+; LINUX: calll good
+; LINUX: addl $28, %esp
+; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX-NOT: add
+; LINUX: retl
+define void @pr27140() optsize {
+entry:
+ tail call void @good(i32 1, i32 2, i32 3, i32 4)
+ ret void
+}
+
+; Check that a stack restore (leal -4(%ebp), %esp) doesn't get merged with a
+; stack adjustment (addl $12, %esp). Just because it's a lea doesn't mean it's
+; simply decreasing the stack pointer.
+; NORMAL-LABEL: test14:
+; NORMAL: calll _B_func
+; NORMAL: leal -4(%ebp), %esp
+; NORMAL-NOT: %esp
+; NORMAL: retl
+%struct.A = type { i32, i32 }
+%struct.B = type { i8 }
+declare x86_thiscallcc %struct.B* @B_ctor(%struct.B* returned, %struct.A* byval)
+declare void @B_func(%struct.B* sret, %struct.B*, i32)
+define void @test14(%struct.A* %a) {
+entry:
+ %ref.tmp = alloca %struct.B, align 1
+ %agg.tmp = alloca i64, align 4
+ %tmpcast = bitcast i64* %agg.tmp to %struct.A*
+ %tmp = alloca %struct.B, align 1
+ %0 = bitcast %struct.A* %a to i64*
+ %1 = load i64, i64* %0, align 4
+ store i64 %1, i64* %agg.tmp, align 4
+ %call = call x86_thiscallcc %struct.B* @B_ctor(%struct.B* %ref.tmp, %struct.A* byval %tmpcast)
+ %2 = getelementptr inbounds %struct.B, %struct.B* %tmp, i32 0, i32 0
+ call void @B_func(%struct.B* sret %tmp, %struct.B* %ref.tmp, i32 1)
+ ret void
+}
diff --git a/test/CodeGen/X86/movtopush64.ll b/test/CodeGen/X86/movtopush64.ll
new file mode 100644
index 000000000000..1f4aa18c3227
--- /dev/null
+++ b/test/CodeGen/X86/movtopush64.ll
@@ -0,0 +1,193 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=NORMAL -check-prefix=NORMALFP
+; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=NOPUSH
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=NOPUSH -check-prefix=NORMALFP
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
+
+declare void @seven_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g)
+declare void @ten_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i64 %h, i32 %i, i64 %j)
+declare void @ten_params_ptr(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i8* %h, i32 %i, i64 %j)
+declare void @cannot_push(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i)
+
+; We should get pushes for the last 4 parameters. Test that the
+; in-register parameters are all in the right places, and check
+; that the stack manipulations are correct and correctly
+; described by the DWARF directives. Test that the switch
+; to disable the optimization works and that the optimization
+; doesn't kick in on Windows64 where it is not allowed.
+; NORMAL-LABEL: test1
+; NORMAL: pushq
+; NORMAL-DAG: movl $1, %edi
+; NORMAL-DAG: movl $2, %esi
+; NORMAL-DAG: movl $3, %edx
+; NORMAL-DAG: movl $4, %ecx
+; NORMAL-DAG: movl $5, %r8d
+; NORMAL-DAG: movl $6, %r9d
+; NORMAL: pushq $10
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $9
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $8
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $7
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: callq ten_params
+; NORMAL: addq $32, %rsp
+; NORMAL: .cfi_adjust_cfa_offset -32
+; NORMAL: popq
+; NORMAL: retq
+; NOPUSH-LABEL: test1
+; NOPUSH-NOT: pushq
+; NOPUSH: retq
+define void @test1() {
+entry:
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ ret void
+}
+
+; The presence of a frame pointer should not prevent pushes. But we
+; don't need the CFI directives in that case.
+; Also check that we generate the right pushes for >8bit immediates.
+; NORMALFP-LABEL: test2
+; NORMALFP: pushq $10000
+; NORMALFP-NEXT: pushq $9000
+; NORMALFP-NEXT: pushq $8000
+; NORMALFP-NEXT: pushq $7000
+; NORMALFP-NEXT: callq {{_?}}ten_params
+define void @test2(i32 %k) {
+entry:
+ %a = alloca i32, i32 %k
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7000, i64 8000, i32 9000, i64 10000)
+ ret void
+}
+
+; Parameters 7 & 8 should push a 64-bit register.
+; TODO: Note that the regular expressions disallow r8 and r9. That's fine for
+; now, because the pushes will always follow the moves into r8 and r9.
+; Eventually, though, we want to be able to schedule the pushes better.
+; In this example, it will save two copies, because we have to move the
+; incoming parameters out of %rdi and %rsi to make room for the outgoing
+; parameters.
+; NORMAL-LABEL: test3
+; NORMAL: pushq $10000
+; NORMAL: pushq $9000
+; NORMAL: pushq %r{{..}}
+; NORMAL: pushq %r{{..}}
+; NORMAL: callq ten_params
+define void @test3(i32 %a, i64 %b) {
+entry:
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 %a, i64 %b, i32 9000, i64 10000)
+ ret void
+}
+
+; Check that we avoid the optimization for just one push.
+; NORMAL-LABEL: test4
+; NORMAL: movl $7, (%rsp)
+; NORMAL: callq seven_params
+define void @test4() {
+entry:
+ call void @seven_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7)
+ ret void
+}
+
+; Check that pushing link-time constant addresses works correctly
+; NORMAL-LABEL: test5
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq $ext
+; NORMAL: pushq $7
+; NORMAL: callq ten_params_ptr
+@ext = external constant i8
+define void @test5() {
+entry:
+ call void @ten_params_ptr(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i8* @ext, i32 9, i64 10)
+ ret void
+}
+
+; Check that we fuse 64-bit loads but not 32-bit loads into PUSH mem.
+; NORMAL-LABEL: test6
+; NORMAL: movq %rsi, [[REG64:%.+]]
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq ([[REG64]])
+; NORMAL: pushq {{%r..}}
+; NORMAL: callq ten_params
+define void @test6(i32* %p32, i64* %p64) {
+entry:
+ %v32 = load i32, i32* %p32
+ %v64 = load i64, i64* %p64
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 %v32, i64 %v64, i32 9, i64 10)
+ ret void
+}
+
+; Fold stack-relative loads into the push with correct offsets.
+; Do the same for an indirect call whose address is loaded from the stack.
+; On entry, %p7 is at 8(%rsp) and %p8 is at 16(%rsp). Prior to the call
+; sequence, 72 bytes are allocated to the stack, 48 for register saves and
+; 24 for local storage and alignment, so %p7 is at 80(%rsp) and %p8 is at
+; 88(%rsp). The call address can be stored anywhere in the local space but
+; happens to be stored at 8(%rsp). Each push bumps these offsets up by
+; 8 bytes.
+; NORMAL-LABEL: test7
+; NORMAL: movq %r{{.*}}, 8(%rsp) {{.*Spill$}}
+; NORMAL: pushq 88(%rsp)
+; NORMAL: pushq $9
+; NORMAL: pushq 96(%rsp)
+; NORMAL: pushq $7
+; NORMAL: callq *40(%rsp)
+define void @test7(i64 %p1, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6, i64 %p7, i64 %p8) {
+entry:
+ %stack_fptr = alloca void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)*
+ store void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)* @ten_params, void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)** %stack_fptr
+ %ten_params_ptr = load volatile void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)*, void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)** %stack_fptr
+ call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ call void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64) %ten_params_ptr(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 %p7, i32 9, i64 %p8)
+ ret void
+}
+
+; We can't fold the load from the global into the push because of
+; interference from the store
+; NORMAL-LABEL: test8
+; NORMAL: movq the_global(%rip), [[REG:%r.+]]
+; NORMAL: movq $42, the_global
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq [[REG]]
+; NORMAL: pushq $7
+; NORMAL: callq ten_params
+@the_global = external global i64
+define void @test8() {
+ %myload = load i64, i64* @the_global
+ store i64 42, i64* @the_global
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 %myload, i32 9, i64 10)
+ ret void
+}
+
+
+; Converting one function call to use pushes negatively affects
+; other calls that pass arguments on the stack without pushes.
+; If the cost outweighs the benefit, avoid using pushes.
+; NORMAL-LABEL: test9
+; NORMAL: callq cannot_push
+; NORMAL-NOT: push
+; NORMAL: callq ten_params
+define void @test9(float %p1) {
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ ret void
+}
+
+; But if the benefit outweighs the cost, use pushes.
+; NORMAL-LABEL: test10
+; NORMAL: callq cannot_push
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq $8
+; NORMAL: pushq $7
+; NORMAL: callq ten_params
+define void @test10(float %p1) {
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ ret void
+}
diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll
new file mode 100644
index 000000000000..8b8b10aa1790
--- /dev/null
+++ b/test/CodeGen/X86/mul-i256.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test(i256* %a, i256* %b, i256* %out) #0 {
+entry:
+ %av = load i256, i256* %a
+ %bv = load i256, i256* %b
+ %r = mul i256 %av, %bv
+ store i256 %r, i256* %out
+ ret void
+}
+
+; CHECK-LABEL: @test
+; There is a lot of inter-register motion, and so matching the instruction
+; sequence will be fragile. There should be 6 underlying multiplications.
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK-NOT: imulq
+; CHECK: retq
+
+attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
+
diff --git a/test/CodeGen/X86/mul128.ll b/test/CodeGen/X86/mul128.ll
index 6825b99f2425..2b3a13509b3c 100644
--- a/test/CodeGen/X86/mul128.ll
+++ b/test/CodeGen/X86/mul128.ll
@@ -1,6 +1,17 @@
-; RUN: llc < %s -march=x86-64 | grep mul | count 3
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
define i128 @foo(i128 %t, i128 %u) {
+; X64-LABEL: foo:
+; X64: # BB#0:
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: imulq %rdi, %rcx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %r8, %rsi
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: retq
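+; The sequence above is the usual 128-bit multiply decomposition: with
+; t = tHi*2^64 + tLo and u = uHi*2^64 + uLo,
+;   t*u mod 2^128 = tLo*uLo + ((tLo*uHi + tHi*uLo) mod 2^64) * 2^64,
+; i.e. one widening mulq for tLo*uLo plus two low-half imulq added into the
+; upper 64 bits of the result.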
%k = mul i128 %t, %u
ret i128 %k
}
diff --git a/test/CodeGen/X86/mul64.ll b/test/CodeGen/X86/mul64.ll
index 5a25c5d0e9de..f5ca52a93b51 100644
--- a/test/CodeGen/X86/mul64.ll
+++ b/test/CodeGen/X86/mul64.ll
@@ -1,6 +1,27 @@
-; RUN: llc < %s -march=x86 | grep mul | count 3
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
-define i64 @foo(i64 %t, i64 %u) {
+define i64 @foo(i64 %t, i64 %u) nounwind {
+; X32-LABEL: foo:
+; X32: # BB#0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # BB#0:
+; X64-NEXT: imulq %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%k = mul i64 %t, %u
ret i64 %k
}
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
index 247d78776b80..b3f73aaf890b 100644
--- a/test/CodeGen/X86/musttail-varargs.ll
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
-; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32
-; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
-; RUN: llc < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86
; Test that we actually spill and reload all arguments in the variadic argument
; pack. Doing a normal call will clobber all argument registers, and we will
@@ -136,6 +137,8 @@ define void @g_thunk(i8* %fptr_i8, ...) {
; WINDOWS: jmpq *%rcx # TAILCALL
; X86-LABEL: _g_thunk:
+; X86-NOT: push %ebp
+; X86-NOT: andl {{.*}}, %esp
; X86: jmpl *%eax # TAILCALL
; Do a simple multi-exit multi-bb test.
diff --git a/test/CodeGen/X86/mwaitx.ll b/test/CodeGen/X86/mwaitx.ll
new file mode 100644
index 000000000000..5bf64311282f
--- /dev/null
+++ b/test/CodeGen/X86/mwaitx.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+mwaitx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+mwaitx | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=bdver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=bdver4 | FileCheck %s -check-prefix=WIN64
+
+; CHECK-LABEL: foo:
+; CHECK: leaq (%rdi), %rax
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: monitorx
+; WIN64-LABEL: foo:
+; WIN64: leaq (%rcx), %rax
+; WIN64-NEXT: movl %edx, %ecx
+; WIN64-NEXT: movl %r8d, %edx
+; WIN64-NEXT: monitorx
+define void @foo(i8* %P, i32 %E, i32 %H) nounwind {
+entry:
+ tail call void @llvm.x86.monitorx(i8* %P, i32 %E, i32 %H)
+ ret void
+}
+
+declare void @llvm.x86.monitorx(i8*, i32, i32) nounwind
+
+; CHECK-LABEL: bar:
+; CHECK: movl %edi, %ecx
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl %edx, %ebx
+; CHECK-NEXT: mwaitx
+; WIN64-LABEL: bar:
+; WIN64: movl %edx, %eax
+; WIN64: movl %r8d, %ebx
+; WIN64-NEXT: mwaitx
+define void @bar(i32 %E, i32 %H, i32 %C) nounwind {
+entry:
+ tail call void @llvm.x86.mwaitx(i32 %E, i32 %H, i32 %C)
+ ret void
+}
+
+declare void @llvm.x86.mwaitx(i32, i32, i32) nounwind
diff --git a/test/CodeGen/X86/negate-add-zero.ll b/test/CodeGen/X86/negate-add-zero.ll
index 06341dc7ba53..5911312053dd 100644
--- a/test/CodeGen/X86/negate-add-zero.ll
+++ b/test/CodeGen/X86/negate-add-zero.ll
@@ -1133,4 +1133,4 @@ declare %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZlsIdLi5EL
declare %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZlsIdLi5ELi6EERSoS0_RK15FixedMatrixBaseIT_XT0_EXT1_EE(%"struct.std::basic_ostream<char,std::char_traits<char> >"*, %"struct.FixedMatrixBase<double,5,6>"*)
-declare void @llvm.memset.i64(i8* nocapture, i8, i64, i32) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/negative-offset.ll b/test/CodeGen/X86/negative-offset.ll
new file mode 100644
index 000000000000..dc1b255d0202
--- /dev/null
+++ b/test/CodeGen/X86/negative-offset.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test that a constant consisting of a global symbol with a negative offset
+; is properly folded and isel'd.
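+; In two's complement, -1 - x == ~x, so the constant offset -1 - ptrtoint(@G)
+; is just ~G and the address is %a + ~G, matching the movl $G / notq / addq
+; sequence checked below.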
+
+; CHECK-LABEL: negative_offset:
+; CHECK: movl $G, %eax
+; CHECK: notq %rax
+; CHECK: addq %rdi, %rax
+; CHECK: retq
+@G = external global [8 x i32]
+define i8* @negative_offset(i8* %a) {
+ %t = getelementptr i8, i8* %a, i64 sub (i64 -1, i64 ptrtoint ([8 x i32]* @G to i64))
+ ret i8* %t
+}
diff --git a/test/CodeGen/X86/new-remat.ll b/test/CodeGen/X86/new-remat.ll
new file mode 100644
index 000000000000..726ad2d0a127
--- /dev/null
+++ b/test/CodeGen/X86/new-remat.ll
@@ -0,0 +1,70 @@
+; RUN: llc -verify-regalloc < %s | FileCheck %s
+; Check all spills are rematerialized.
+; CHECK-NOT: Spill
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global double 0.000000e+00, align 8
+@a = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define i32 @uniform_testdata(i32 %p1) {
+entry:
+ %cmp3 = icmp sgt i32 %p1, 0
+ br i1 %cmp3, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %tmp = add i32 %p1, -1
+ %xtraiter = and i32 %p1, 7
+ %lcmp.mod = icmp eq i32 %xtraiter, 0
+ br i1 %lcmp.mod, label %for.body.preheader.split, label %for.body.prol.preheader
+
+for.body.prol.preheader: ; preds = %for.body.preheader
+ br label %for.body.prol
+
+for.body.prol: ; preds = %for.body.prol, %for.body.prol.preheader
+ %i.04.prol = phi i32 [ %inc.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
+ %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.prol.preheader ]
+ %tmp1 = load double, double* @b, align 8
+ %call.prol = tail call double @pow(double %tmp1, double 2.500000e-01)
+ %inc.prol = add nuw nsw i32 %i.04.prol, 1
+ %prol.iter.sub = add i32 %prol.iter, -1
+ %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
+ br i1 %prol.iter.cmp, label %for.body.preheader.split.loopexit, label %for.body.prol
+
+for.body.preheader.split.loopexit: ; preds = %for.body.prol
+ %inc.prol.lcssa = phi i32 [ %inc.prol, %for.body.prol ]
+ br label %for.body.preheader.split
+
+for.body.preheader.split: ; preds = %for.body.preheader.split.loopexit, %for.body.preheader
+ %i.04.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.prol.lcssa, %for.body.preheader.split.loopexit ]
+ %tmp2 = icmp ult i32 %tmp, 7
+ br i1 %tmp2, label %for.end.loopexit, label %for.body.preheader.split.split
+
+for.body.preheader.split.split: ; preds = %for.body.preheader.split
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader.split.split
+ %i.04 = phi i32 [ %i.04.unr, %for.body.preheader.split.split ], [ %inc.7, %for.body ]
+ %tmp3 = load double, double* @b, align 8
+ %call = tail call double @pow(double %tmp3, double 2.500000e-01)
+ %tmp4 = load double, double* @b, align 8
+ %call.1 = tail call double @pow(double %tmp4, double 2.500000e-01)
+ %inc.7 = add nsw i32 %i.04, 8
+ %exitcond.7 = icmp eq i32 %inc.7, %p1
+ br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body
+
+for.end.loopexit.unr-lcssa: ; preds = %for.body
+ br label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.end.loopexit.unr-lcssa, %for.body.preheader.split
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %tmp5 = load i32, i32* @a, align 4
+ ret i32 %tmp5
+}
+
+; Function Attrs: nounwind
+declare double @pow(double, double)
diff --git a/test/CodeGen/X86/no-prolog-kill.ll b/test/CodeGen/X86/no-prolog-kill.ll
new file mode 100644
index 000000000000..f625f315bb7c
--- /dev/null
+++ b/test/CodeGen/X86/no-prolog-kill.ll
@@ -0,0 +1,21 @@
+; RUN: llc -verify-machineinstrs -o - %s | FileCheck %s
+target triple = "x86_64--"
+
+; This function gets an AL live-in and at the same time saves and restores RAX.
+; We must not add a kill flag to the "PUSHQ %rax" or the machine verifier will
+; complain.
+; CHECK-LABEL: test:
+; CHECK: pushq %rax
+; CHECK: testb %al, %al
+; CHECK: je .LBB
+define void @test(i64 %a, i8* %b, ...) {
+entry:
+ %bar = alloca i8
+ call void @llvm.va_start(i8* %bar)
+ call void @llvm.eh.unwind.init()
+ call void @llvm.eh.return.i64(i64 %a, i8* %b)
+ unreachable
+}
+
+declare void @llvm.eh.return.i64(i64, i8*)
+declare void @llvm.eh.unwind.init()
+declare void @llvm.va_start(i8*)
diff --git a/test/CodeGen/X86/no-sse2-avg.ll b/test/CodeGen/X86/no-sse2-avg.ll
new file mode 100644
index 000000000000..0ed0a7f74cb3
--- /dev/null
+++ b/test/CodeGen/X86/no-sse2-avg.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s
+
+define <16 x i8> @PR27973() {
+; CHECK-LABEL: PR27973:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb $0, 15(%rdi)
+; CHECK-NEXT: movb $0, 14(%rdi)
+; CHECK-NEXT: movb $0, 13(%rdi)
+; CHECK-NEXT: movb $0, 12(%rdi)
+; CHECK-NEXT: movb $0, 11(%rdi)
+; CHECK-NEXT: movb $0, 10(%rdi)
+; CHECK-NEXT: movb $0, 9(%rdi)
+; CHECK-NEXT: movb $0, 8(%rdi)
+; CHECK-NEXT: movb $0, 7(%rdi)
+; CHECK-NEXT: movb $0, 6(%rdi)
+; CHECK-NEXT: movb $0, 5(%rdi)
+; CHECK-NEXT: movb $0, 4(%rdi)
+; CHECK-NEXT: movb $0, 3(%rdi)
+; CHECK-NEXT: movb $0, 2(%rdi)
+; CHECK-NEXT: movb $0, 1(%rdi)
+; CHECK-NEXT: movb $0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+;
+ %t0 = zext <16 x i8> zeroinitializer to <16 x i32>
+ %t1 = add nuw nsw <16 x i32> %t0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %t2 = lshr <16 x i32> %t1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %t3 = trunc <16 x i32> %t2 to <16 x i8>
+ ret <16 x i8> %t3
+}
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index c9767f88488c..e221f8e9520b 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -1,54 +1,235 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=SSE --check-prefix=SSE4A
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=VLX
; Make sure that we generate non-temporal stores for the test cases below.
; We use xorps for zeroing, so domain information isn't available anymore.
+; Scalar versions (zeroing means we can do this even for fp types, since a zero
+; fp value is all-zero bits and can be stored through an integer register).
+
+define void @test_zero_f32(float* %dst) {
+; SSE-LABEL: test_zero_f32:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntil %eax, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_f32:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntil %eax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_f32:
+; VLX: # BB#0:
+; VLX-NEXT: xorl %eax, %eax
+; VLX-NEXT: movntil %eax, (%rdi)
+; VLX-NEXT: retq
+ store float zeroinitializer, float* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_zero_i32(i32* %dst) {
+; SSE-LABEL: test_zero_i32:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntil %eax, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_i32:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntil %eax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_i32:
+; VLX: # BB#0:
+; VLX-NEXT: xorl %eax, %eax
+; VLX-NEXT: movntil %eax, (%rdi)
+; VLX-NEXT: retq
+ store i32 zeroinitializer, i32* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_zero_f64(double* %dst) {
+; SSE-LABEL: test_zero_f64:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_f64:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_f64:
+; VLX: # BB#0:
+; VLX-NEXT: xorl %eax, %eax
+; VLX-NEXT: movntiq %rax, (%rdi)
+; VLX-NEXT: retq
+ store double zeroinitializer, double* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_zero_i64(i64* %dst) {
+; SSE-LABEL: test_zero_i64:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_i64:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_i64:
+; VLX: # BB#0:
+; VLX-NEXT: xorl %eax, %eax
+; VLX-NEXT: movntiq %rax, (%rdi)
+; VLX-NEXT: retq
+ store i64 zeroinitializer, i64* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+; And now XMM versions.
+
define void @test_zero_v4f32(<4 x float>* %dst) {
-; CHECK-LABEL: test_zero_v4f32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v4i32(<4 x i32>* %dst) {
-; CHECK-LABEL: test_zero_v4i32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
+ store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v2f64(<2 x double>* %dst) {
-; CHECK-LABEL: test_zero_v2f64:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v2i64(<2 x i64>* %dst) {
-; CHECK-LABEL: test_zero_v2i64:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v8i16(<8 x i16>* %dst) {
-; CHECK-LABEL: test_zero_v8i16:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v16i8(<16 x i8>* %dst) {
-; CHECK-LABEL: test_zero_v16i8:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
ret void
}
@@ -56,43 +237,145 @@ define void @test_zero_v16i8(<16 x i8>* %dst) {
; And now YMM versions.
define void @test_zero_v8f32(<8 x float>* %dst) {
-; CHECK-LABEL: test_zero_v8f32:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_zero_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v8i32(<8 x i32>* %dst) {
-; CHECK-LABEL: test_zero_v8i32:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_zero_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v4f64(<4 x double>* %dst) {
-; CHECK-LABEL: test_zero_v4f64:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_zero_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v4i64(<4 x i64>* %dst) {
-; CHECK-LABEL: test_zero_v4i64:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_zero_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v16i16(<16 x i16>* %dst) {
-; CHECK-LABEL: test_zero_v16i16:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_zero_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v32i8(<32 x i8>* %dst) {
-; CHECK-LABEL: test_zero_v32i8:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_zero_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
}
@@ -100,50 +383,358 @@ define void @test_zero_v32i8(<32 x i8>* %dst) {
; Check that we also handle arguments. Here the type survives longer.
+; Scalar versions.
+
+define void @test_arg_f32(float %arg, float* %dst) {
+; SSE2-LABEL: test_arg_f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_arg_f32:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movntss %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_arg_f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movss %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_arg_f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_f32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovss %xmm0, (%rdi)
+; VLX-NEXT: retq
+ store float %arg, float* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_arg_i32(i32 %arg, i32* %dst) {
+; SSE-LABEL: test_arg_i32:
+; SSE: # BB#0:
+; SSE-NEXT: movntil %edi, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movntil %edi, (%rsi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_i32:
+; VLX: # BB#0:
+; VLX-NEXT: movntil %edi, (%rsi)
+; VLX-NEXT: retq
+ store i32 %arg, i32* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_arg_f64(double %arg, double* %dst) {
+; SSE2-LABEL: test_arg_f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_arg_f64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_arg_f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_arg_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovsd %xmm0, (%rdi)
+; VLX-NEXT: retq
+ store double %arg, double* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_arg_i64(i64 %arg, i64* %dst) {
+; SSE-LABEL: test_arg_i64:
+; SSE: # BB#0:
+; SSE-NEXT: movntiq %rdi, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_i64:
+; AVX: # BB#0:
+; AVX-NEXT: movntiq %rdi, (%rsi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_i64:
+; VLX: # BB#0:
+; VLX-NEXT: movntiq %rdi, (%rsi)
+; VLX-NEXT: retq
+ store i64 %arg, i64* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+; Extract versions
+
+define void @test_extract_f32(<4 x float> %arg, float* %dst) {
+; SSE2-LABEL: test_extract_f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: movss %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_f32:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE4A-NEXT: movntss %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: extractps $1, %xmm0, %eax
+; SSE41-NEXT: movntil %eax, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_extract_f32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractps $1, %xmm0, %eax
+; AVX-NEXT: movntil %eax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_extract_f32:
+; VLX: # BB#0:
+; VLX-NEXT: vextractps $1, %xmm0, %eax
+; VLX-NEXT: movntil %eax, (%rdi)
+; VLX-NEXT: retq
+ %1 = extractelement <4 x float> %arg, i32 1
+ store float %1, float* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
+; SSE2-LABEL: test_extract_i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movntil %eax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_i32:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE4A-NEXT: movd %xmm0, %eax
+; SSE4A-NEXT: movntil %eax, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $1, %xmm0, %eax
+; SSE41-NEXT: movntil %eax, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_extract_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: movntil %eax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_extract_i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpextrd $1, %xmm0, %eax
+; VLX-NEXT: movntil %eax, (%rdi)
+; VLX-NEXT: retq
+ %1 = extractelement <4 x i32> %arg, i32 1
+ store i32 %1, i32* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_extract_f64(<2 x double> %arg, double* %dst) {
+; SSE2-LABEL: test_extract_f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movhpd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_f64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movhpd %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_extract_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_extract_f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovhpd %xmm0, (%rdi)
+; VLX-NEXT: retq
+ %1 = extractelement <2 x double> %arg, i32 1
+ store double %1, double* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
+; SSE2-LABEL: test_extract_i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_i64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE4A-NEXT: movd %xmm0, %rax
+; SSE4A-NEXT: movntiq %rax, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_extract_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_extract_i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpextrq $1, %xmm0, %rax
+; VLX-NEXT: movntiq %rax, (%rdi)
+; VLX-NEXT: retq
+ %1 = extractelement <2 x i64> %arg, i32 1
+ store i64 %1, i64* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+; And now XMM versions.
+
define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
-; CHECK-LABEL: test_arg_v4f32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
+; VLX-NEXT: retq
store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
-; CHECK-LABEL: test_arg_v4i32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
-; CHECK-LABEL: test_arg_v2f64:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntpd %xmm0, (%rdi)
+; VLX-NEXT: retq
store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
-; CHECK-LABEL: test_arg_v2i64:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
-; CHECK-LABEL: test_arg_v8i16:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
-; CHECK-LABEL: test_arg_v16i8:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
ret void
}
@@ -151,43 +742,127 @@ define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
; And now YMM versions.
define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
-; CHECK-LABEL: test_arg_v8f32:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_arg_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: retq
store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
-; CHECK-LABEL: test_arg_v8i32:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_arg_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
-; CHECK-LABEL: test_arg_v4f64:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_arg_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntpd %ymm0, (%rdi)
+; VLX-NEXT: retq
store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
-; CHECK-LABEL: test_arg_v4i64:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_arg_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
-; CHECK-LABEL: test_arg_v16i16:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_arg_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
-; CHECK-LABEL: test_arg_v32i8:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_arg_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
}
@@ -197,54 +872,138 @@ define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
; We use an add to make the type survive all the way to the MOVNT.
define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
-; CHECK-LABEL: test_op_v4f32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_op_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm1, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <4 x float> %a, %b
store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
-; CHECK-LABEL: test_op_v4i32:
-; SSE: movntdq
-; AVX: vmovntdq
+; SSE-LABEL: test_op_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = add <4 x i32> %a, %b
store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
-; CHECK-LABEL: test_op_v2f64:
-; SSE: movntpd
-; AVX: vmovntpd
+; SSE-LABEL: test_op_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntpd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntpd %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <2 x double> %a, %b
store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
-; CHECK-LABEL: test_op_v2i64:
-; SSE: movntdq
-; AVX: vmovntdq
+; SSE-LABEL: test_op_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = add <2 x i64> %a, %b
store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
-; CHECK-LABEL: test_op_v8i16:
-; SSE: movntdq
-; AVX: vmovntdq
+; SSE-LABEL: test_op_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = add <8 x i16> %a, %b
store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
-; CHECK-LABEL: test_op_v16i8:
-; SSE: movntdq
-; AVX: vmovntdq
+; SSE-LABEL: test_op_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = add <16 x i8> %a, %b
store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
ret void
@@ -253,48 +1012,200 @@ define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
; And now YMM versions.
define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
-; CHECK-LABEL: test_op_v8f32:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_op_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm2, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm1
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
-; CHECK-LABEL: test_op_v8i32:
-; AVX2: vmovntdq %ymm
+; SSE-LABEL: test_op_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = add <8 x i32> %a, %b
store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
-; CHECK-LABEL: test_op_v4f64:
-; AVX: vmovntpd %ymm
+; SSE-LABEL: test_op_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: movntpd %xmm1, 16(%rdi)
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntpd %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <4 x double> %a, %b
store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
-; CHECK-LABEL: test_op_v4i64:
-; AVX2: vmovntdq %ymm
+; SSE-LABEL: test_op_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = add <4 x i64> %a, %b
store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
-; CHECK-LABEL: test_op_v16i16:
-; AVX2: vmovntdq %ymm
+; SSE-LABEL: test_op_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: paddw %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = add <16 x i16> %a, %b
store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
-; CHECK-LABEL: test_op_v32i8:
-; AVX2: vmovntdq %ymm
+; SSE-LABEL: test_op_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: paddb %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = add <32 x i8> %a, %b
store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
@@ -305,11 +1216,26 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; could even scalarize to movnti when we have 1-alignment: nontemporal is
; probably always worth even some 20 instruction scalarization.
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
-; CHECK-LABEL: test_unaligned_v8f32:
-; SSE: movntps %xmm
-; SSE: movntps %xmm
-; AVX-NOT: movnt
-; AVX: vmovups %ymm
+; SSE-LABEL: test_unaligned_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm2, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm1
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_unaligned_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovups %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
ret void
diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll
new file mode 100644
index 000000000000..83301e60a1c4
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal-loads.ll
@@ -0,0 +1,1638 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
+
+define <4 x float> @test_v4f32(<4 x float>* %src) {
+; SSE2-LABEL: test_v4f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v4f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <4 x float>, <4 x float>* %src, align 16, !nontemporal !1
+ ret <4 x float> %1
+}
+
+define <4 x i32> @test_v4i32(<4 x i32>* %src) {
+; SSE2-LABEL: test_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_v4i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa32 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %src, align 16, !nontemporal !1
+ ret <4 x i32> %1
+}
+
+define <2 x double> @test_v2f64(<2 x double>* %src) {
+; SSE2-LABEL: test_v2f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %src, align 16, !nontemporal !1
+ ret <2 x double> %1
+}
+
+define <2 x i64> @test_v2i64(<2 x i64>* %src) {
+; SSE2-LABEL: test_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %src, align 16, !nontemporal !1
+ ret <2 x i64> %1
+}
+
+define <8 x i16> @test_v8i16(<8 x i16>* %src) {
+; SSE2-LABEL: test_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %src, align 16, !nontemporal !1
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @test_v16i8(<16 x i8>* %src) {
+; SSE2-LABEL: test_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %src, align 16, !nontemporal !1
+ ret <16 x i8> %1
+}
+
+; And now YMM versions.
+
+define <8 x float> @test_v8f32(<8 x float>* %src) {
+; SSE2-LABEL: test_v8f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x float>, <8 x float>* %src, align 32, !nontemporal !1
+ ret <8 x float> %1
+}
+
+define <8 x i32> @test_v8i32(<8 x i32>* %src) {
+; SSE2-LABEL: test_v8i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_v8i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v8i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa32 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !1
+ ret <8 x i32> %1
+}
+
+define <4 x double> @test_v4f64(<4 x double>* %src) {
+; SSE2-LABEL: test_v4f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v4f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <4 x double>, <4 x double>* %src, align 32, !nontemporal !1
+ ret <4 x double> %1
+}
+
+define <4 x i64> @test_v4i64(<4 x i64>* %src) {
+; SSE2-LABEL: test_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <4 x i64>, <4 x i64>* %src, align 32, !nontemporal !1
+ ret <4 x i64> %1
+}
+
+define <16 x i16> @test_v16i16(<16 x i16>* %src) {
+; SSE2-LABEL: test_v16i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i16>, <16 x i16>* %src, align 32, !nontemporal !1
+ ret <16 x i16> %1
+}
+
+define <32 x i8> @test_v32i8(<32 x i8>* %src) {
+; SSE2-LABEL: test_v32i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <32 x i8>, <32 x i8>* %src, align 32, !nontemporal !1
+ ret <32 x i8> %1
+}
+
+; And now ZMM versions.
+
+define <16 x float> @test_v16f32(<16 x float>* %src) {
+; SSE2-LABEL: test_v16f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x float>, <16 x float>* %src, align 64, !nontemporal !1
+ ret <16 x float> %1
+}
+
+define <16 x i32> @test_v16i32(<16 x i32>* %src) {
+; SSE2-LABEL: test_v16i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i32>, <16 x i32>* %src, align 64, !nontemporal !1
+ ret <16 x i32> %1
+}
+
+define <8 x double> @test_v8f64(<8 x double>* %src) {
+; SSE2-LABEL: test_v8f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x double>, <8 x double>* %src, align 64, !nontemporal !1
+ ret <8 x double> %1
+}
+
+define <8 x i64> @test_v8i64(<8 x i64>* %src) {
+; SSE2-LABEL: test_v8i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i64>, <8 x i64>* %src, align 64, !nontemporal !1
+ ret <8 x i64> %1
+}
+
+define <32 x i16> @test_v32i16(<32 x i16>* %src) {
+; SSE2-LABEL: test_v32i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v32i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v32i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1
+ ret <32 x i16> %1
+}
+
+define <64 x i8> @test_v64i8(<64 x i8>* %src) {
+; SSE2-LABEL: test_v64i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v64i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1
+ ret <64 x i8> %1
+}
+
+
+; Check cases where the load would be folded.
+
+define <4 x float> @test_arg_v4f32(<4 x float> %arg, <4 x float>* %src) {
+; SSE-LABEL: test_arg_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v4f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddps (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <4 x float>, <4 x float>* %src, align 16, !nontemporal !1
+ %2 = fadd <4 x float> %arg, %1
+ ret <4 x float> %2
+}
+
+define <4 x i32> @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %src) {
+; SSE-LABEL: test_arg_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %src, align 16, !nontemporal !1
+ %2 = add <4 x i32> %arg, %1
+ ret <4 x i32> %2
+}
+
+define <2 x double> @test_arg_v2f64(<2 x double> %arg, <2 x double>* %src) {
+; SSE-LABEL: test_arg_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v2f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddpd (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %src, align 16, !nontemporal !1
+ %2 = fadd <2 x double> %arg, %1
+ ret <2 x double> %2
+}
+
+define <2 x i64> @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %src) {
+; SSE-LABEL: test_arg_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddq (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %src, align 16, !nontemporal !1
+ %2 = add <2 x i64> %arg, %1
+ ret <2 x i64> %2
+}
+
+define <8 x i16> @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %src) {
+; SSE-LABEL: test_arg_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddw (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddw (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %src, align 16, !nontemporal !1
+ %2 = add <8 x i16> %arg, %1
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %src) {
+; SSE-LABEL: test_arg_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddb (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddb (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %src, align 16, !nontemporal !1
+ %2 = add <16 x i8> %arg, %1
+ ret <16 x i8> %2
+}
+
+; And now YMM versions.
+
+define <8 x float> @test_arg_v8f32(<8 x float> %arg, <8 x float>* %src) {
+; SSE-LABEL: test_arg_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps (%rdi), %xmm0
+; SSE-NEXT: addps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddps (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x float>, <8 x float>* %src, align 32, !nontemporal !1
+ %2 = fadd <8 x float> %arg, %1
+ ret <8 x float> %2
+}
+
+define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) {
+; SSE-LABEL: test_arg_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd (%rdi), %xmm0
+; SSE-NEXT: paddd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !1
+ %2 = add <8 x i32> %arg, %1
+ ret <8 x i32> %2
+}
+
+define <4 x double> @test_arg_v4f64(<4 x double> %arg, <4 x double>* %src) {
+; SSE-LABEL: test_arg_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd (%rdi), %xmm0
+; SSE-NEXT: addpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v4f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddpd (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <4 x double>, <4 x double>* %src, align 32, !nontemporal !1
+ %2 = fadd <4 x double> %arg, %1
+ ret <4 x double> %2
+}
+
+define <4 x i64> @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %src) {
+; SSE-LABEL: test_arg_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq (%rdi), %xmm0
+; SSE-NEXT: paddq 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <4 x i64>, <4 x i64>* %src, align 32, !nontemporal !1
+ %2 = add <4 x i64> %arg, %1
+ ret <4 x i64> %2
+}
+
+define <16 x i16> @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %src) {
+; SSE-LABEL: test_arg_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw (%rdi), %xmm0
+; SSE-NEXT: paddw 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i16>, <16 x i16>* %src, align 32, !nontemporal !1
+ %2 = add <16 x i16> %arg, %1
+ ret <16 x i16> %2
+}
+
+define <32 x i8> @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %src) {
+; SSE-LABEL: test_arg_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb (%rdi), %xmm0
+; SSE-NEXT: paddb 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <32 x i8>, <32 x i8>* %src, align 32, !nontemporal !1
+ %2 = add <32 x i8> %arg, %1
+ ret <32 x i8> %2
+}
+
+; And now ZMM versions.
+
+define <16 x float> @test_arg_v16f32(<16 x float> %arg, <16 x float>* %src) {
+; SSE-LABEL: test_arg_v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps (%rdi), %xmm0
+; SSE-NEXT: addps 16(%rdi), %xmm1
+; SSE-NEXT: addps 32(%rdi), %xmm2
+; SSE-NEXT: addps 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: vaddps 32(%rdi), %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddps (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x float>, <16 x float>* %src, align 64, !nontemporal !1
+ %2 = fadd <16 x float> %arg, %1
+ ret <16 x float> %2
+}
+
+define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
+; SSE-LABEL: test_arg_v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd (%rdi), %xmm0
+; SSE-NEXT: paddd 16(%rdi), %xmm1
+; SSE-NEXT: paddd 32(%rdi), %xmm2
+; SSE-NEXT: paddd 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm2
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddd 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i32>, <16 x i32>* %src, align 64, !nontemporal !1
+ %2 = add <16 x i32> %arg, %1
+ ret <16 x i32> %2
+}
+
+define <8 x double> @test_arg_v8f64(<8 x double> %arg, <8 x double>* %src) {
+; SSE-LABEL: test_arg_v8f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd (%rdi), %xmm0
+; SSE-NEXT: addpd 16(%rdi), %xmm1
+; SSE-NEXT: addpd 32(%rdi), %xmm2
+; SSE-NEXT: addpd 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: vaddpd 32(%rdi), %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddpd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x double>, <8 x double>* %src, align 64, !nontemporal !1
+ %2 = fadd <8 x double> %arg, %1
+ ret <8 x double> %2
+}
+
+define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
+; SSE-LABEL: test_arg_v8i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq (%rdi), %xmm0
+; SSE-NEXT: paddq 16(%rdi), %xmm1
+; SSE-NEXT: paddq 32(%rdi), %xmm2
+; SSE-NEXT: paddq 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm2
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddq 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i64>, <8 x i64>* %src, align 64, !nontemporal !1
+ %2 = add <8 x i64> %arg, %1
+ ret <8 x i64> %2
+}
+
+define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
+; SSE-LABEL: test_arg_v32i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw (%rdi), %xmm0
+; SSE-NEXT: paddw 16(%rdi), %xmm1
+; SSE-NEXT: paddw 32(%rdi), %xmm2
+; SSE-NEXT: paddw 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm2
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddw 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_arg_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_arg_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpaddw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_arg_v32i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddw 32(%rdi), %ymm1, %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1
+ %2 = add <32 x i16> %arg, %1
+ ret <32 x i16> %2
+}
+
+define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
+; SSE-LABEL: test_arg_v64i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb (%rdi), %xmm0
+; SSE-NEXT: paddb 16(%rdi), %xmm1
+; SSE-NEXT: paddb 32(%rdi), %xmm2
+; SSE-NEXT: paddb 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm2
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_arg_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_arg_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_arg_v64i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb 32(%rdi), %ymm1, %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1
+ %2 = add <64 x i8> %arg, %1
+ ret <64 x i8> %2
+}
+
+
+; Unaligned non-temporal loads (not supported; these lower to regular unaligned loads)
+
+define <4 x float> @test_unaligned_v4f32(<4 x float>* %src) {
+; SSE-LABEL: test_unaligned_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v4f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovups (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1
+ ret <4 x float> %1
+}
+
+define <4 x i32> @test_unaligned_v4i32(<4 x i32>* %src) {
+; SSE-LABEL: test_unaligned_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v4i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu32 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1
+ ret <4 x i32> %1
+}
+
+define <2 x double> @test_unaligned_v2f64(<2 x double>* %src) {
+; SSE-LABEL: test_unaligned_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v2f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v2f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v2f64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovupd (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1
+ ret <2 x double> %1
+}
+
+define <2 x i64> @test_unaligned_v2i64(<2 x i64>* %src) {
+; SSE-LABEL: test_unaligned_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v2i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v2i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1
+ ret <2 x i64> %1
+}
+
+define <8 x i16> @test_unaligned_v8i16(<8 x i16>* %src) {
+; SSE-LABEL: test_unaligned_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v8i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v8i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @test_unaligned_v16i8(<16 x i8>* %src) {
+; SSE-LABEL: test_unaligned_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %src, align 1, !nontemporal !1
+ ret <16 x i8> %1
+}
+
+; And now YMM versions.
+
+define <8 x float> @test_unaligned_v8f32(<8 x float>* %src) {
+; SSE-LABEL: test_unaligned_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1
+ ret <8 x float> %1
+}
+
+define <8 x i32> @test_unaligned_v8i32(<8 x i32>* %src) {
+; SSE-LABEL: test_unaligned_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v8i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v8i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu32 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1
+ ret <8 x i32> %1
+}
+
+define <4 x double> @test_unaligned_v4f64(<4 x double>* %src) {
+; SSE-LABEL: test_unaligned_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v4f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v4f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v4f64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovupd (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1
+ ret <4 x double> %1
+}
+
+define <4 x i64> @test_unaligned_v4i64(<4 x i64>* %src) {
+; SSE-LABEL: test_unaligned_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v4i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1
+ ret <4 x i64> %1
+}
+
+define <16 x i16> @test_unaligned_v16i16(<16 x i16>* %src) {
+; SSE-LABEL: test_unaligned_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v16i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1
+ ret <16 x i16> %1
+}
+
+define <32 x i8> @test_unaligned_v32i8(<32 x i8>* %src) {
+; SSE-LABEL: test_unaligned_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v32i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1
+ ret <32 x i8> %1
+}
+
+; And now ZMM versions.
+
+define <16 x float> @test_unaligned_v16f32(<16 x float>* %src) {
+; SSE-LABEL: test_unaligned_v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v16f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovups (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1
+ ret <16 x float> %1
+}
+
+define <16 x i32> @test_unaligned_v16i32(<16 x i32>* %src) {
+; SSE-LABEL: test_unaligned_v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1
+ ret <16 x i32> %1
+}
+
+define <8 x double> @test_unaligned_v8f64(<8 x double>* %src) {
+; SSE-LABEL: test_unaligned_v8f64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovupd (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
+ ret <8 x double> %1
+}
+
+define <8 x i64> @test_unaligned_v8i64(<8 x i64>* %src) {
+; SSE-LABEL: test_unaligned_v8i64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1
+ ret <8 x i64> %1
+}
+
+define <32 x i16> @test_unaligned_v32i16(<32 x i16>* %src) {
+; SSE-LABEL: test_unaligned_v32i16:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v32i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v32i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqu64 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1
+ ret <32 x i16> %1
+}
+
+define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) {
+; SSE-LABEL: test_unaligned_v64i8:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v64i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v64i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqu64 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1
+ ret <64 x i8> %1
+}
+
+!1 = !{i32 1}
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
index 9a2f23596f79..33d5caba597c 100644
--- a/test/CodeGen/X86/nontemporal.ll
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -1,24 +1,135 @@
-; RUN: llc < %s -mtriple x86_64-unknown-unknown | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
-define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, i64 %F) {
-; CHECK: movntps
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 x i32> %F, <8 x i16> %G, <16 x i8> %H, i64 %I) nounwind {
+; X32-SSE-LABEL: f:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pushl %ebp
+; X32-SSE-NEXT: movl %esp, %ebp
+; X32-SSE-NEXT: pushl %esi
+; X32-SSE-NEXT: andl $-16, %esp
+; X32-SSE-NEXT: subl $16, %esp
+; X32-SSE-NEXT: movl 72(%ebp), %eax
+; X32-SSE-NEXT: movl 76(%ebp), %ecx
+; X32-SSE-NEXT: movl 12(%ebp), %edx
+; X32-SSE-NEXT: movdqa 56(%ebp), %xmm3
+; X32-SSE-NEXT: movdqa 40(%ebp), %xmm4
+; X32-SSE-NEXT: movdqa 24(%ebp), %xmm5
+; X32-SSE-NEXT: movl 8(%ebp), %esi
+; X32-SSE-NEXT: addps .LCPI0_0, %xmm0
+; X32-SSE-NEXT: movntps %xmm0, (%esi)
+; X32-SSE-NEXT: paddq .LCPI0_1, %xmm2
+; X32-SSE-NEXT: movntdq %xmm2, (%esi)
+; X32-SSE-NEXT: addpd .LCPI0_2, %xmm1
+; X32-SSE-NEXT: movntpd %xmm1, (%esi)
+; X32-SSE-NEXT: paddd .LCPI0_3, %xmm5
+; X32-SSE-NEXT: movntdq %xmm5, (%esi)
+; X32-SSE-NEXT: paddw .LCPI0_4, %xmm4
+; X32-SSE-NEXT: movntdq %xmm4, (%esi)
+; X32-SSE-NEXT: paddb .LCPI0_5, %xmm3
+; X32-SSE-NEXT: movntdq %xmm3, (%esi)
+; X32-SSE-NEXT: movntil %edx, (%esi)
+; X32-SSE-NEXT: movntil %ecx, 4(%esi)
+; X32-SSE-NEXT: movntil %eax, (%esi)
+; X32-SSE-NEXT: leal -4(%ebp), %esp
+; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: popl %ebp
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: f:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: pushl %ebp
+; X32-AVX-NEXT: movl %esp, %ebp
+; X32-AVX-NEXT: pushl %esi
+; X32-AVX-NEXT: andl $-16, %esp
+; X32-AVX-NEXT: subl $16, %esp
+; X32-AVX-NEXT: movl 72(%ebp), %eax
+; X32-AVX-NEXT: movl 76(%ebp), %ecx
+; X32-AVX-NEXT: movl 12(%ebp), %edx
+; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm3
+; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm4
+; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm5
+; X32-AVX-NEXT: movl 8(%ebp), %esi
+; X32-AVX-NEXT: vaddps .LCPI0_0, %xmm0, %xmm0
+; X32-AVX-NEXT: vmovntps %xmm0, (%esi)
+; X32-AVX-NEXT: vpaddq .LCPI0_1, %xmm2, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: vaddpd .LCPI0_2, %xmm1, %xmm0
+; X32-AVX-NEXT: vmovntpd %xmm0, (%esi)
+; X32-AVX-NEXT: vpaddd .LCPI0_3, %xmm5, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: vpaddw .LCPI0_4, %xmm4, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: vpaddb .LCPI0_5, %xmm3, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: movntil %edx, (%esi)
+; X32-AVX-NEXT: movntil %ecx, 4(%esi)
+; X32-AVX-NEXT: movntil %eax, (%esi)
+; X32-AVX-NEXT: leal -4(%ebp), %esp
+; X32-AVX-NEXT: popl %esi
+; X32-AVX-NEXT: popl %ebp
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: f:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: addps {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: movntps %xmm0, (%rdi)
+; X64-SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; X64-SSE-NEXT: movntdq %xmm2, (%rdi)
+; X64-SSE-NEXT: addpd {{.*}}(%rip), %xmm1
+; X64-SSE-NEXT: movntpd %xmm1, (%rdi)
+; X64-SSE-NEXT: paddd {{.*}}(%rip), %xmm3
+; X64-SSE-NEXT: movntdq %xmm3, (%rdi)
+; X64-SSE-NEXT: paddw {{.*}}(%rip), %xmm4
+; X64-SSE-NEXT: movntdq %xmm4, (%rdi)
+; X64-SSE-NEXT: paddb {{.*}}(%rip), %xmm5
+; X64-SSE-NEXT: movntdq %xmm5, (%rdi)
+; X64-SSE-NEXT: movntil %esi, (%rdi)
+; X64-SSE-NEXT: movntiq %rdx, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: f:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovntps %xmm0, (%rdi)
+; X64-AVX-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm0
+; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm0
+; X64-AVX-NEXT: vmovntpd %xmm0, (%rdi)
+; X64-AVX-NEXT: vpaddd {{.*}}(%rip), %xmm3, %xmm0
+; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: vpaddw {{.*}}(%rip), %xmm4, %xmm0
+; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: vpaddb {{.*}}(%rip), %xmm5, %xmm0
+; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: movntil %esi, (%rdi)
+; X64-AVX-NEXT: movntiq %rdx, (%rdi)
+; X64-AVX-NEXT: retq
%cast = bitcast i8* %B to <4 x float>*
- %A2 = fadd <4 x float> %A, <float 0x0, float 0x0, float 0x0, float 0x4200000000000000>
+ %A2 = fadd <4 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0>
store <4 x float> %A2, <4 x float>* %cast, align 16, !nontemporal !0
-; CHECK: movntdq
%cast1 = bitcast i8* %B to <2 x i64>*
%E2 = add <2 x i64> %E, <i64 1, i64 2>
store <2 x i64> %E2, <2 x i64>* %cast1, align 16, !nontemporal !0
-; CHECK: movntpd
%cast2 = bitcast i8* %B to <2 x double>*
- %C2 = fadd <2 x double> %C, <double 0x0, double 0x4200000000000000>
+ %C2 = fadd <2 x double> %C, <double 1.0, double 2.0>
store <2 x double> %C2, <2 x double>* %cast2, align 16, !nontemporal !0
-; CHECK: movntil
- %cast3 = bitcast i8* %B to i32*
- store i32 %D, i32* %cast3, align 1, !nontemporal !0
-; CHECK: movntiq
- %cast4 = bitcast i8* %B to i64*
- store i64 %F, i64* %cast4, align 1, !nontemporal !0
+ %cast3 = bitcast i8* %B to <4 x i32>*
+ %F2 = add <4 x i32> %F, <i32 1, i32 2, i32 3, i32 4>
+ store <4 x i32> %F2, <4 x i32>* %cast3, align 16, !nontemporal !0
+ %cast4 = bitcast i8* %B to <8 x i16>*
+ %G2 = add <8 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+ store <8 x i16> %G2, <8 x i16>* %cast4, align 16, !nontemporal !0
+ %cast5 = bitcast i8* %B to <16 x i8>*
+ %H2 = add <16 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
+ store <16 x i8> %H2, <16 x i8>* %cast5, align 16, !nontemporal !0
+ %cast6 = bitcast i8* %B to i32*
+ store i32 %D, i32* %cast6, align 1, !nontemporal !0
+ %cast7 = bitcast i8* %B to i64*
+ store i64 %I, i64* %cast7, align 1, !nontemporal !0
ret void
}
diff --git a/test/CodeGen/X86/noreturn-call.ll b/test/CodeGen/X86/noreturn-call.ll
new file mode 100644
index 000000000000..89781816de82
--- /dev/null
+++ b/test/CodeGen/X86/noreturn-call.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+define void @test1(i32 %c) {
+; CHECK-LABEL: test1:
+entry:
+ %0 = alloca i8, i32 %c
+ %tobool = icmp eq i32 %c, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ call void @g(i8* %0)
+ ret void
+
+if.then:
+ call void @crash(i8* %0)
+ unreachable
+; CHECK: calll _crash
+; There is no need to adjust the stack after the call, since
+; the function is noreturn and that code will therefore never run.
+; CHECK-NOT: add
+; CHECK-NOT: pop
+}
+
+define void @test2(i32 %c) {
+; CHECK-LABEL: test2:
+entry:
+ %0 = alloca i8, i32 %c
+ %tobool = icmp eq i32 %c, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ call void @g(i8* %0)
+ ret void
+
+if.then:
+ call void @crash2(i8* %0)
+ unreachable
+; CHECK: calll _crash2
+; Even though _crash2 is not marked noreturn, it is noreturn in practice
+; because of the "unreachable" right after the call. This happens e.g. when
+; falling off the end of a non-void function after a call.
+; CHECK-NOT: add
+; CHECK-NOT: pop
+}
+
+declare void @crash(i8*) noreturn
+declare void @crash2(i8*)
+declare void @g(i8*)
diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index e80f3fcbe58d..b331b92868f1 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll
@@ -14,11 +14,10 @@ define void @f1() {
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !13}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: " ", isOptimized: true, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !9, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: " ", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !9, imports: !2)
!1 = !DIFile(filename: "file.c", directory: "")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !1, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2, file: !1, scope: !1, type: !6, variables: !2)
!6 = !DISubroutineType(types: !7)
!7 = !{!8}
!8 = !DIBasicType(tag: DW_TAG_base_type, size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/X86/opt-ext-uses.ll b/test/CodeGen/X86/opt-ext-uses.ll
index 39e6fd0e6a59..b654a81c11cd 100644
--- a/test/CodeGen/X86/opt-ext-uses.ll
+++ b/test/CodeGen/X86/opt-ext-uses.ll
@@ -2,8 +2,8 @@
; This test should get one and only one register to register mov.
; CHECK-LABEL: t:
-; CHECK: movw
-; CHECK-NOT: movw
+; CHECK: movl
+; CHECK-NOT: mov
; CHECK: ret
define signext i16 @t() {
diff --git a/test/CodeGen/X86/or-lea.ll b/test/CodeGen/X86/or-lea.ll
index f45a639ffa2c..e65056a91c43 100644
--- a/test/CodeGen/X86/or-lea.ll
+++ b/test/CodeGen/X86/or-lea.ll
@@ -9,6 +9,8 @@
define i32 @or_shift1_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift1_and1:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,2), %eax
; CHECK-NEXT: retq
@@ -22,6 +24,8 @@ define i32 @or_shift1_and1(i32 %x, i32 %y) {
define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift1_and1_swapped:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,2), %eax
; CHECK-NEXT: retq
@@ -35,6 +39,8 @@ define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {
define i32 @or_shift2_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift2_and1:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,4), %eax
; CHECK-NEXT: retq
@@ -48,6 +54,8 @@ define i32 @or_shift2_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and1:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
; CHECK-NEXT: retq
@@ -61,6 +69,8 @@ define i32 @or_shift3_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and7(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and7:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $7, %esi
; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
; CHECK-NEXT: retq
@@ -76,6 +86,8 @@ define i32 @or_shift3_and7(i32 %x, i32 %y) {
define i32 @or_shift4_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift4_and1:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: shll $4, %edi
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi), %eax
@@ -92,6 +104,7 @@ define i32 @or_shift4_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and8(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and8:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: leal (,%rdi,8), %eax
; CHECK-NEXT: andl $8, %esi
; CHECK-NEXT: orl %esi, %eax
diff --git a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll
index e30cb4824aa7..d7f0251c1387 100644
--- a/test/CodeGen/X86/osx-private-labels.ll
+++ b/test/CodeGen/X86/osx-private-labels.ll
@@ -11,7 +11,7 @@
@private2 = private unnamed_addr constant [5 x i16] [i16 116, i16 101,
i16 115, i16 116, i16 0]
; CHECK: .section __TEXT,__ustring
-; CHECK-NEXT: .align 1
+; CHECK-NEXT: .p2align 1
; CHECK-NEXT: l_private2:
; There is no dedicated 4 byte strings on MachO.
@@ -19,60 +19,60 @@
%struct.NSConstantString = type { i32*, i32, i8*, i32 }
@private3 = private constant %struct.NSConstantString { i32* null, i32 1992, i8* null, i32 0 }, section "__DATA,__cfstring"
; CHECK: .section __DATA,__cfstring
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: L_private3:
; There is no dedicated 1 or 2 byte constant section on MachO.
@private4 = private unnamed_addr constant i32 42
; CHECK: .section __TEXT,__literal4,4byte_literals
-; CHECK-NEXT: .align 2
+; CHECK-NEXT: .p2align 2
; CHECK-NEXT: L_private4:
@private5 = private unnamed_addr constant i64 42
; CHECK: .section __TEXT,__literal8,8byte_literals
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private5:
@private6 = private unnamed_addr constant i128 42
; CHECK: .section __TEXT,__literal16,16byte_literals
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private6:
%struct._objc_class = type { i8* }
@private7 = private global %struct._objc_class* null, section "__OBJC,__cls_refs,literal_pointers,no_dead_strip"
; CHECK: .section __OBJC,__cls_refs,literal_pointers,no_dead_strip
-; CHECK: .align 3
+; CHECK: .p2align 3
; CHECK: L_private7:
@private8 = private global i32* null, section "__DATA,__nl_symbol_ptr,non_lazy_symbol_pointers"
; CHECK: .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private8:
@private9 = private global i32* null, section "__DATA,__la_symbol_ptr,lazy_symbol_pointers"
; CHECK: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private9:
@private10 = private global i32* null, section "__DATA,__mod_init_func,mod_init_funcs"
; CHECK: .section __DATA,__mod_init_func,mod_init_funcs
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private10:
@private11 = private global i32* null, section "__DATA,__mod_term_func,mod_term_funcs"
; CHECK: .section __DATA,__mod_term_func,mod_term_funcs
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private11:
@private12 = private global i32* null, section "__DATA,__foobar,interposing"
; CHECK: .section __DATA,__foobar,interposing
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private12:
@private13 = private global i32 42, section "__DATA, __objc_classlist, regular, no_dead_strip"
; CHECK: .section __DATA,__objc_classlist,regular,no_dead_strip
-; CHECK-NEXT: .align 2
+; CHECK-NEXT: .p2align 2
; CHECK-NEXT: L_private13:
@private14 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_classname,cstring_literals"
diff --git a/test/CodeGen/X86/patchable-prologue.ll b/test/CodeGen/X86/patchable-prologue.ll
new file mode 100644
index 000000000000..c8daff33181c
--- /dev/null
+++ b/test/CodeGen/X86/patchable-prologue.ll
@@ -0,0 +1,67 @@
+; RUN: llc -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump -triple x86_64-apple-macosx -disassemble - | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN
+
+declare void @callee(i64*)
+
+define void @f0() "patchable-function"="prologue-short-redirect" {
+; CHECK-LABEL: _f0:
+; CHECK-NEXT: 66 90 nop
+
+; CHECK-ALIGN: .p2align 4, 0x90
+; CHECK-ALIGN: _f0:
+
+ ret void
+}
+
+define void @f1() "patchable-function"="prologue-short-redirect" "no-frame-pointer-elim"="true" {
+; CHECK-LABEL: _f1
+; CHECK-NEXT: ff f5 pushq %rbp
+
+; CHECK-ALIGN: .p2align 4, 0x90
+; CHECK-ALIGN: _f1:
+ ret void
+}
+
+define void @f2() "patchable-function"="prologue-short-redirect" {
+; CHECK-LABEL: _f2
+; CHECK-NEXT: 48 81 ec a8 00 00 00 subq $168, %rsp
+
+; CHECK-ALIGN: .p2align 4, 0x90
+; CHECK-ALIGN: _f2:
+ %ptr = alloca i64, i32 20
+ call void @callee(i64* %ptr)
+ ret void
+}
+
+define void @f3() "patchable-function"="prologue-short-redirect" optsize {
+; CHECK-LABEL: _f3
+; CHECK-NEXT: 66 90 nop
+
+; CHECK-ALIGN: .p2align 4, 0x90
+; CHECK-ALIGN: _f3:
+ ret void
+}
+
+; This testcase happens to produce a KILL instruction at the beginning of the
+; first basic block. In this case the 2nd instruction should be turned into a
+; patchable one.
+; CHECK-LABEL: f4:
+; CHECK-NEXT: 8b 0c 37 movl (%rdi,%rsi), %ecx
+define i32 @f4(i8* %arg1, i64 %arg2, i32 %arg3) "patchable-function"="prologue-short-redirect" {
+bb:
+ %tmp10 = getelementptr i8, i8* %arg1, i64 %arg2
+ %tmp11 = bitcast i8* %tmp10 to i32*
+ %tmp12 = load i32, i32* %tmp11, align 4
+ fence acquire
+ %tmp13 = add i32 %tmp12, %arg3
+ %tmp14 = cmpxchg i32* %tmp11, i32 %tmp12, i32 %tmp13 seq_cst monotonic
+ %tmp15 = extractvalue { i32, i1 } %tmp14, 1
+ br i1 %tmp15, label %bb21, label %bb16
+
+bb16:
+ br label %bb21
+
+bb21:
+ %tmp22 = phi i32 [ %tmp12, %bb ], [ %arg3, %bb16 ]
+ ret i32 %tmp22
+}
diff --git a/test/CodeGen/X86/patchpoint-verifiable.mir b/test/CodeGen/X86/patchpoint-verifiable.mir
index 300ecaf002f2..c108473a1824 100644
--- a/test/CodeGen/X86/patchpoint-verifiable.mir
+++ b/test/CodeGen/X86/patchpoint-verifiable.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-apple-darwin -stop-after branch-folder -start-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=x86_64-apple-darwin -stop-after branch-folder -start-after branch-folder -o - %s | FileCheck %s
# This test verifies that the machine verifier won't report an error when
# verifying the PATCHPOINT instruction.
diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index 441fb02a89e6..4bdfee6f81eb 100644
--- a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s
+; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
; TODO: Reenable verify-machineinstrs once the if (!AXDead) // FIXME in
; X86InstrInfo::copyPhysReg() is resolved.
@@ -142,14 +142,21 @@ f:
; CHECK: cmpxchg
; CHECK: seto %al
; CHECK-NEXT: lahf
-; Save result of the first cmpxchg into D.
-; CHECK-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[D:[re]d[xi]]]
+; Save result of the first cmpxchg into a temporary.
+; For 32-bit ISA, EDX, EAX are used by the results.
+; EAX, EBX, ECX, and EDX are used to set the arguments.
+; That leaves us EDI and ESI.
+; CHECK32-NEXT: movl %[[AX:eax]], %[[TMP:e[ds]i]]
+; For 64-bit ISA, RAX is used for both the result and argument.
+; This leaves us plenty of choices for the temporary. For now,
+; this is rdx, but any register could do.
+; CHECK64-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[TMP:rdx]]
; CHECK: cmpxchg
; CHECK-NEXT: sete %al
; Save result of the second cmpxchg onto the stack.
; CHECK-NEXT: push{{[lq]}} %[[AX]]
-; Restore result of the first cmpxchg from D, put it back in EFLAGS.
+; Restore result of the first cmpxchg from the temporary, put it back in EFLAGS.
-; CHECK-NEXT: mov{{[lq]}} %[[D]], %[[AX]]
+; CHECK-NEXT: mov{{[lq]}} %[[TMP]], %[[AX]]
; CHECK-NEXT: addb $127, %al
; CHECK-NEXT: sahf
; Restore result of the second cmpxchg from the stack.
diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll
index 17e7e1dfdcf7..44ad05ec6ed7 100644
--- a/test/CodeGen/X86/phaddsub.ll
+++ b/test/CodeGen/X86/phaddsub.ll
@@ -1,168 +1,225 @@
-; RUN: llc < %s -march=x86-64 -mattr=+ssse3,-avx | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -march=x86-64 -mattr=-ssse3,+avx | FileCheck %s -check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
-; SSSE3-NOT: vphaddw
-; SSSE3: phaddw
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddw1:
-; AVX: vphaddw
-define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%r = add <8 x i16> %a, %b
ret <8 x i16> %r
}
+define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
-; SSSE3-NOT: vphaddw
-; SSSE3: phaddw
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddw2:
-; AVX: vphaddw
-define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
%b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
%r = add <8 x i16> %a, %b
ret <8 x i16> %r
}
+define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd1:
-; AVX: vphaddd
-define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd2:
-; AVX: vphaddd
-define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
%b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd3:
-; AVX: vphaddd
-define <4 x i32> @phaddd3(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd4:
-; AVX: vphaddd
-define <4 x i32> @phaddd4(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd5:
-; AVX: vphaddd
-define <4 x i32> @phaddd5(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd6:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd6:
-; AVX: vphaddd
-define <4 x i32> @phaddd6(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd7:
-; AVX: vphaddd
-define <4 x i32> @phaddd7(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
-; SSSE3-NOT: vphsubw
-; SSSE3: phsubw
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubw1:
-; AVX: vphsubw
-define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%r = sub <8 x i16> %a, %b
ret <8 x i16> %r
}
+define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
-; SSSE3-NOT: vphsubd
-; SSSE3: phsubd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubd1:
-; AVX: vphsubd
-define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r = sub <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
-; SSSE3-NOT: vphsubd
-; SSSE3: phsubd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubd2:
-; AVX: vphsubd
-define <4 x i32> @phsubd2(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = sub <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
-; SSSE3-NOT: vphsubd
-; SSSE3: phsubd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubd3:
-; AVX: vphsubd
-define <4 x i32> @phsubd3(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = sub <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-LABEL: phsubd4:
-; SSSE3-NOT: vphsubd
-; SSSE3: phsubd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubd4:
-; AVX: vphsubd
-define <4 x i32> @phsubd4(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = sub <4 x i32> %a, %b
diff --git a/test/CodeGen/X86/phi-immediate-factoring.ll b/test/CodeGen/X86/phi-immediate-factoring.ll
index 6425ef0e8376..05a0bf68657b 100644
--- a/test/CodeGen/X86/phi-immediate-factoring.ll
+++ b/test/CodeGen/X86/phi-immediate-factoring.ll
@@ -1,5 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6
+; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6
+; RUN: llc < %s -disable-preheader-prot=false -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 3
; PR1296
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index a0adba0f8338..8b370d93afdb 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck %s
+; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -no-x86-call-frame-opt | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
; CHECKed instructions should be the same with or without -O0 except on Intel Atom due to instruction scheduling.
@.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-2.ll b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
index 8ee97ae07e65..a02a4ae15c32 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-2.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
@@ -13,7 +13,7 @@ forcond.preheader: ; preds = %entry
ifthen: ; preds = %entry
ret i32 0
-; CHECK: forbody
+; CHECK: forbody{{$}}
; CHECK-NOT: mov
forbody: ; preds = %forbody, %forcond.preheader
%indvar = phi i32 [ 0, %forcond.preheader ], [ %divisor.02, %forbody ] ; <i32> [#uses=3]
diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll
index 73be234db81c..f03dc3f4a285 100644
--- a/test/CodeGen/X86/pic.ll
+++ b/test/CodeGen/X86/pic.ll
@@ -192,7 +192,7 @@ bb12:
; LINUX: .LJTI7_0@GOTOFF(
; LINUX: jmpl *
-; LINUX: .align 4
+; LINUX: .p2align 2
; LINUX-NEXT: .LJTI7_0:
; LINUX: .long .LBB7_2@GOTOFF
; LINUX: .long .LBB7_8@GOTOFF
diff --git a/test/CodeGen/X86/pic_jumptable.ll b/test/CodeGen/X86/pic_jumptable.ll
index 8c1992a24ece..444f98ef83de 100644
--- a/test/CodeGen/X86/pic_jumptable.ll
+++ b/test/CodeGen/X86/pic_jumptable.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -relocation-model=pic -mark-data-regions -mtriple=i686-apple-darwin -asm-verbose=false \
; RUN: | FileCheck %s --check-prefix=CHECK-DATA
; RUN: llc < %s -relocation-model=pic -mtriple=i686-apple-darwin -asm-verbose=false \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefix=CHECK-DATA
; RUN: llc < %s -mtriple=x86_64-apple-darwin | not grep 'lJTI'
; rdar://6971437
; rdar://7738756
diff --git a/test/CodeGen/X86/pie.ll b/test/CodeGen/X86/pie.ll
new file mode 100644
index 000000000000..7b765f8ef54a
--- /dev/null
+++ b/test/CodeGen/X86/pie.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -O0 -mcpu=generic -mtriple=i686-linux-gnu -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -O0 -mcpu=generic -mtriple=i686-linux-gnu -fast-isel -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -O0 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -O0 -mcpu=generic -mtriple=x86_64-linux-gnu -fast-isel -relocation-model=pic | FileCheck %s
+
+; CHECK-LABEL: bar:
+; CHECK: call{{l|q}} foo{{$}}
+; CHECK: call{{l|q}} weak_odr_foo{{$}}
+; CHECK: call{{l|q}} weak_foo{{$}}
+; CHECK: call{{l|q}} internal_foo{{$}}
+; CHECK: call{{l|q}} ext_baz@PLT
+
+define weak void @weak_foo() {
+ ret void
+}
+
+define weak_odr void @weak_odr_foo() {
+ ret void
+}
+
+define internal void @internal_foo() {
+ ret void
+}
+
+declare i32 @ext_baz()
+
+define void @foo() {
+ ret void
+}
+
+define void @bar() {
+entry:
+ call void @foo()
+ call void @weak_odr_foo()
+ call void @weak_foo()
+ call void @internal_foo()
+ call i32 @ext_baz()
+ ret void
+}
+
+; -fpie for local global data tests should be added here
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"PIC Level", i32 1}
+!1 = !{i32 1, !"PIE Level", i32 1}
diff --git a/test/CodeGen/X86/pku.ll b/test/CodeGen/X86/pku.ll
index 8568cf43abc0..79b8c474ade0 100644
--- a/test/CodeGen/X86/pku.ll
+++ b/test/CodeGen/X86/pku.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding -verify-machineinstrs | FileCheck %s
declare i32 @llvm.x86.rdpkru()
declare void @llvm.x86.wrpkru(i32)
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 37b6fdf7cfeb..5f2c88d670ac 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,10 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
-define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
-; SSE2-LABEL: mul8c:
+define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
+; SSE2-LABEL: mul_v16i8c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT: psraw $8, %xmm1
@@ -21,7 +23,7 @@ define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: mul8c:
+; SSE41-LABEL: mul_v16i8c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm2
@@ -36,7 +38,7 @@ define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul8c:
+; AVX2-LABEL: mul_v16i8c:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
@@ -48,28 +50,46 @@ define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v16i8c:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v16i8c:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
+; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
entry:
%A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
ret <16 x i8> %A
}
-define <8 x i16> @mul16c(<8 x i16> %i) nounwind {
-; SSE-LABEL: mul16c:
+define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind {
+; SSE-LABEL: mul_v8i16c:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: mul16c:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v8i16c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
ret <8 x i16> %A
}
-define <4 x i32> @a(<4 x i32> %i) nounwind {
-; SSE2-LABEL: a:
+define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
+; SSE2-LABEL: mul_v4i32c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
@@ -80,23 +100,23 @@ define <4 x i32> @a(<4 x i32> %i) nounwind {
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; SSE41-LABEL: a:
+; SSE41-LABEL: mul_v4i32c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: a:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i32c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
ret <4 x i32> %A
}
-define <2 x i64> @b(<2 x i64> %i) nounwind {
-; SSE-LABEL: b:
+define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
+; SSE-LABEL: mul_v2i64c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -107,22 +127,22 @@ define <2 x i64> @b(<2 x i64> %i) nounwind {
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: b:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v2i64c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <2 x i64> %i, < i64 117, i64 117 >
ret <2 x i64> %A
}
-define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
-; SSE2-LABEL: mul8:
+define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
+; SSE2-LABEL: mul_v16i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -142,7 +162,7 @@ define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: mul8:
+; SSE41-LABEL: mul_v16i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
@@ -159,7 +179,7 @@ define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul8:
+; AVX2-LABEL: mul_v16i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
@@ -171,28 +191,46 @@ define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v16i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v16i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
entry:
%A = mul <16 x i8> %i, %j
ret <16 x i8> %A
}
-define <8 x i16> @mul16(<8 x i16> %i, <8 x i16> %j) nounwind {
-; SSE-LABEL: mul16:
+define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind {
+; SSE-LABEL: mul_v8i16:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: mul16:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v8i16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <8 x i16> %i, %j
ret <8 x i16> %A
}
-define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
-; SSE2-LABEL: c:
+define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind {
+; SSE2-LABEL: mul_v4i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
@@ -203,22 +241,22 @@ define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; SSE41-LABEL: c:
+; SSE41-LABEL: mul_v4i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: c:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <4 x i32> %i, %j
ret <4 x i32> %A
}
-define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind {
-; SSE-LABEL: d:
+define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
+; SSE-LABEL: mul_v2i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -226,25 +264,25 @@ define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: paddq %xmm3, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: d:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v2i64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <2 x i64> %i, %j
ret <2 x i64> %A
@@ -252,8 +290,8 @@ entry:
declare void @foo()
-define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
-; SSE2-LABEL: e:
+define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
+; SSE2-LABEL: mul_v4i32spill:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
@@ -271,7 +309,7 @@ define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;
-; SSE41-LABEL: e:
+; SSE41-LABEL: mul_v4i32spill:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: subq $40, %rsp
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
@@ -282,16 +320,16 @@ define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE41-NEXT: addq $40, %rsp
; SSE41-NEXT: retq
;
-; AVX2-LABEL: e:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: subq $40, %rsp
-; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: callq foo
-; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX2-NEXT: vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT: addq $40, %rsp
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i32spill:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: callq foo
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
entry:
; Use a call to force spills.
call void @foo()
@@ -299,8 +337,8 @@ entry:
ret <4 x i32> %A
}
-define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind {
-; SSE-LABEL: f:
+define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
+; SSE-LABEL: mul_v2i64spill:
; SSE: # BB#0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
@@ -314,33 +352,55 @@ define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
-; AVX2-LABEL: f:
+; AVX2-LABEL: mul_v2i64spill:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: subq $40, %rsp
; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: callq foo
; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm0
+; AVX2-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
; AVX2-NEXT: vpsrlq $32, %xmm2, %xmm1
-; AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlq $32, %xmm3, %xmm1
-; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa %xmm2, %xmm3
+; AVX2-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq $32, %xmm4, %xmm2
+; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: addq $40, %rsp
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v2i64spill:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: callq foo
+; AVX512-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX512-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
+; AVX512-NEXT: vpsrlq $32, %xmm2, %xmm1
+; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsrlq $32, %xmm4, %xmm2
+; AVX512-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: retq
entry:
; Use a call to force spills.
call void @foo()
@@ -348,8 +408,160 @@ entry:
ret <2 x i64> %A
}
-define <4 x i64> @b1(<4 x i64> %i) nounwind {
-; SSE-LABEL: b1:
+define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
+; SSE2-LABEL: mul_v32i8c:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v32i8c:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm4
+; SSE41-NEXT: pmullw %xmm4, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pmullw %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm2
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
+; SSE41-NEXT: pmullw %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pmullw %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: mul_v32i8c:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v32i8c:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v32i8c:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+entry:
+ %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
+ ret <32 x i8> %A
+}
+
+define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind {
+; SSE-LABEL: mul_v16i16c:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mul_v16i16c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
+entry:
+ %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
+ ret <16 x i16> %A
+}
+
+define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind {
+; SSE2-LABEL: mul_v8i32c:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v8i32c:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
+; SSE41-NEXT: pmulld %xmm2, %xmm0
+; SSE41-NEXT: pmulld %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mul_v8i32c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+entry:
+ %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 >
+ ret <8 x i32> %A
+}
+
+define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind {
+; SSE-LABEL: mul_v4i64c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm3
@@ -366,22 +578,188 @@ define <4 x i64> @b1(<4 x i64> %i) nounwind {
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: retq
;
-; AVX2-LABEL: b1:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i64c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT: retq
entry:
%A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
ret <4 x i64> %A
}
-define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind {
-; SSE-LABEL: b2:
+define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
+; SSE2-LABEL: mul_v32i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v32i8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm5
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm4
+; SSE41-NEXT: pmullw %xmm5, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pmullw %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm0
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
+; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: mul_v32i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v32i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v32i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+entry:
+ %A = mul <32 x i8> %i, %j
+ ret <32 x i8> %A
+}
+
+define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind {
+; SSE-LABEL: mul_v16i16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mul_v16i16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+entry:
+ %A = mul <16 x i16> %i, %j
+ ret <16 x i16> %A
+}
+
+define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind {
+; SSE2-LABEL: mul_v8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmulld %xmm2, %xmm0
+; SSE41-NEXT: pmulld %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mul_v8i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+entry:
+ %A = mul <8 x i32> %i, %j
+ ret <8 x i32> %A
+}
+
+define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
+; SSE-LABEL: mul_v4i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
@@ -389,10 +767,10 @@ define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: paddq %xmm5, %xmm4
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
@@ -400,27 +778,401 @@ define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm4, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX2-LABEL: b2:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT: retq
entry:
%A = mul <4 x i64> %i, %j
ret <4 x i64> %A
}
+define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
+; SSE2-LABEL: mul_v64i8c:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: packuswb %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: pmullw %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: packuswb %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v64i8c:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm6
+; SSE41-NEXT: pmullw %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
+; SSE41-NEXT: pmullw %xmm6, %xmm1
+; SSE41-NEXT: pand %xmm7, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm1
+; SSE41-NEXT: pmullw %xmm6, %xmm1
+; SSE41-NEXT: pand %xmm7, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
+; SSE41-NEXT: pmullw %xmm6, %xmm4
+; SSE41-NEXT: pand %xmm7, %xmm4
+; SSE41-NEXT: packuswb %xmm4, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm4
+; SSE41-NEXT: pmullw %xmm6, %xmm4
+; SSE41-NEXT: pand %xmm7, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pmullw %xmm6, %xmm2
+; SSE41-NEXT: pand %xmm7, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm5
+; SSE41-NEXT: pmullw %xmm6, %xmm5
+; SSE41-NEXT: pand %xmm7, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pmullw %xmm6, %xmm2
+; SSE41-NEXT: pand %xmm7, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: mul_v64i8c:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v64i8c:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v64i8c:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+entry:
+ %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
+ ret <64 x i8> %A
+}
+
+define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
+; SSE2-LABEL: mul_v64i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm9
+; SSE2-NEXT: pmullw %xmm8, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm8, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: packuswb %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: pmullw %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: pmullw %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm7
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: packuswb %xmm5, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v64i8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm9
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: pmullw %xmm9, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
+; SSE41-NEXT: pmullw %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm9, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pmovsxbw %xmm5, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm8, %xmm1
+; SSE41-NEXT: pmullw %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm9, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm5, %xmm5
+; SSE41-NEXT: pmullw %xmm4, %xmm5
+; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: packuswb %xmm5, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm6, %xmm5
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm4
+; SSE41-NEXT: pmullw %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm9, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm5, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pmullw %xmm5, %xmm2
+; SSE41-NEXT: pand %xmm9, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm7, %xmm2
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm5
+; SSE41-NEXT: pmullw %xmm2, %xmm5
+; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pand %xmm9, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: mul_v64i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5
+; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5
+; AVX2-NEXT: vpmullw %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v64i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v64i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+entry:
+ %A = mul <64 x i8> %i, %j
+ ret <64 x i8> %A
+}
+
diff --git a/test/CodeGen/X86/pop-stack-cleanup.ll b/test/CodeGen/X86/pop-stack-cleanup.ll
index bcf7594065f3..f81d911ea31b 100644
--- a/test/CodeGen/X86/pop-stack-cleanup.ll
+++ b/test/CodeGen/X86/pop-stack-cleanup.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=i686-windows | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=LINUX64
declare void @param1(i32 %a)
@@ -7,6 +7,7 @@ declare i64 @param2_ret64(i32 %a, i32 %b)
declare void @param2(i32 %a, i32 %b)
declare void @param3(i32 %a, i32 %b, i32 %c)
declare void @param8(i64, i64, i64, i64, i64, i64, i64, i64)
+declare i32 @param8_ret(i64, i64, i64, i64, i64, i64, i64, i64)
define void @test() minsize nounwind {
@@ -74,3 +75,13 @@ define void @test_linux64(i32 %size) minsize nounwind {
call void @param8(i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8)
ret void
}
+
+define i32 @test_linux64_i32(i32 %size) minsize nounwind {
+; LINUX64-LABEL: test_linux64_i32:
+; LINUX64: callq param8_ret
+; LINUX64-NOT: popq %rax
+; LINUX64: retq
+ %a = alloca i64, i32 %size, align 8
+ %r = call i32 @param8_ret(i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8)
+ ret i32 %r
+}
diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll
index e9350de101f6..b5d4ebba0538 100644
--- a/test/CodeGen/X86/popcnt.ll
+++ b/test/CodeGen/X86/popcnt.ll
@@ -1,35 +1,252 @@
-; RUN: llc -march=x86-64 -mattr=+popcnt < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
define i8 @cnt8(i8 %x) nounwind readnone {
+; X32-LABEL: cnt8:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb %al
+; X32-NEXT: andb $85, %al
+; X32-NEXT: subb %al, %cl
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $51, %al
+; X32-NEXT: shrb $2, %cl
+; X32-NEXT: andb $51, %cl
+; X32-NEXT: addb %al, %cl
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $4, %al
+; X32-NEXT: addb %cl, %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: retl
+;
+; X64-LABEL: cnt8:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrb %al
+; X64-NEXT: andb $85, %al
+; X64-NEXT: subb %al, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andb $51, %al
+; X64-NEXT: shrb $2, %dil
+; X64-NEXT: andb $51, %dil
+; X64-NEXT: addb %al, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrb $4, %al
+; X64-NEXT: addb %dil, %al
+; X64-NEXT: andb $15, %al
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt8:
+; X32-POPCNT: # BB#0:
+; X32-POPCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-POPCNT-NEXT: popcntw %ax, %ax
+; X32-POPCNT-NEXT: # kill: %AL<def> %AL<kill> %AX<kill>
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt8:
+; X64-POPCNT: # BB#0:
+; X64-POPCNT-NEXT: movzbl %dil, %eax
+; X64-POPCNT-NEXT: popcntw %ax, %ax
+; X64-POPCNT-NEXT: # kill: %AL<def> %AL<kill> %AX<kill>
+; X64-POPCNT-NEXT: retq
%cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
ret i8 %cnt
-; CHECK-LABEL: cnt8:
-; CHECK: popcntw
-; CHECK: ret
}
define i16 @cnt16(i16 %x) nounwind readnone {
+; X32-LABEL: cnt16:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl %ecx
+; X32-NEXT: andl $21845, %ecx # imm = 0x5555
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $13107, %ecx # imm = 0x3333
+; X32-NEXT: shrl $2, %eax
+; X32-NEXT: andl $13107, %eax # imm = 0x3333
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $32752, %ecx # imm = 0x7FF0
+; X32-NEXT: shrl $4, %ecx
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: andl $3855, %ecx # imm = 0xF0F
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shll $8, %eax
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %ah, %eax
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: cnt16:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: andl $21845, %eax # imm = 0x5555
+; X64-NEXT: subl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $13107, %eax # imm = 0x3333
+; X64-NEXT: shrl $2, %edi
+; X64-NEXT: andl $13107, %edi # imm = 0x3333
+; X64-NEXT: addl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $32752, %eax # imm = 0x7FF0
+; X64-NEXT: shrl $4, %eax
+; X64-NEXT: addl %edi, %eax
+; X64-NEXT: andl $3855, %eax # imm = 0xF0F
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shll $8, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: movzbl %ch, %eax # NOREX
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt16:
+; X32-POPCNT: # BB#0:
+; X32-POPCNT-NEXT: popcntw {{[0-9]+}}(%esp), %ax
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt16:
+; X64-POPCNT: # BB#0:
+; X64-POPCNT-NEXT: popcntw %di, %ax
+; X64-POPCNT-NEXT: retq
%cnt = tail call i16 @llvm.ctpop.i16(i16 %x)
ret i16 %cnt
-; CHECK-LABEL: cnt16:
-; CHECK: popcntw
-; CHECK: ret
}
define i32 @cnt32(i32 %x) nounwind readnone {
+; X32-LABEL: cnt32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl %ecx
+; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X32-NEXT: shrl $2, %eax
+; X32-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $4, %ecx
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101
+; X32-NEXT: shrl $24, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: cnt32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: subl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: shrl $2, %edi
+; X64-NEXT: andl $858993459, %edi # imm = 0x33333333
+; X64-NEXT: addl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl $4, %eax
+; X64-NEXT: addl %edi, %eax
+; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
+; X64-NEXT: shrl $24, %eax
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt32:
+; X32-POPCNT: # BB#0:
+; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt32:
+; X64-POPCNT: # BB#0:
+; X64-POPCNT-NEXT: popcntl %edi, %eax
+; X64-POPCNT-NEXT: retq
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
ret i32 %cnt
-; CHECK-LABEL: cnt32:
-; CHECK: popcntl
-; CHECK: ret
}
define i64 @cnt64(i64 %x) nounwind readnone {
+; X32-LABEL: cnt64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl %edx
+; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555
+; X32-NEXT: subl %edx, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andl $858993459, %edx # imm = 0x33333333
+; X32-NEXT: shrl $2, %ecx
+; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $4, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; X32-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrl %edx
+; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555
+; X32-NEXT: subl %edx, %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: andl $858993459, %edx # imm = 0x33333333
+; X32-NEXT: shrl $2, %eax
+; X32-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrl $4, %edx
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; X32-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
+; X32-NEXT: shrl $24, %eax
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: cnt64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: shrq $2, %rdi
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $4, %rax
+; X64-NEXT: leaq (%rax,%rdi), %rax
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: shrq $56, %rax
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt64:
+; X32-POPCNT: # BB#0:
+; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
+; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
+; X32-POPCNT-NEXT: addl %ecx, %eax
+; X32-POPCNT-NEXT: xorl %edx, %edx
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt64:
+; X64-POPCNT: # BB#0:
+; X64-POPCNT-NEXT: popcntq %rdi, %rax
+; X64-POPCNT-NEXT: retq
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
ret i64 %cnt
-; CHECK-LABEL: cnt64:
-; CHECK: popcntq
-; CHECK: ret
}
declare i8 @llvm.ctpop.i8(i8) nounwind readnone
diff --git a/test/CodeGen/X86/post-ra-sched.ll b/test/CodeGen/X86/post-ra-sched.ll
new file mode 100644
index 000000000000..c31072a8a5eb
--- /dev/null
+++ b/test/CodeGen/X86/post-ra-sched.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium-m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=prescott | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=nocona | FileCheck %s
+;
+; Verify that scheduling puts some distance between a load that feeds the
+; address of another load and that second load. This currently happens in
+; the post-RA scheduler, which should be enabled by default for the CPUs
+; specified above.
+
+@ptrs = external global [0 x i32*], align 4
+@idxa = common global i32 0, align 4
+@idxb = common global i32 0, align 4
+@res = common global i32 0, align 4
+
+define void @addindirect() {
+; CHECK-LABEL: addindirect:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl idxb, %ecx
+; CHECK-NEXT: movl idxa, %eax
+; CHECK-NEXT: movl ptrs(,%ecx,4), %ecx
+; CHECK-NEXT: movl ptrs(,%eax,4), %eax
+; CHECK-NEXT: movl (%ecx), %ecx
+; CHECK-NEXT: addl (%eax), %ecx
+; CHECK-NEXT: movl %ecx, res
+; CHECK-NEXT: retl
+entry:
+ %0 = load i32, i32* @idxa, align 4
+ %arrayidx = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %0
+ %1 = load i32*, i32** %arrayidx, align 4
+ %2 = load i32, i32* %1, align 4
+ %3 = load i32, i32* @idxb, align 4
+ %arrayidx1 = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %3
+ %4 = load i32*, i32** %arrayidx1, align 4
+ %5 = load i32, i32* %4, align 4
+ %add = add i32 %5, %2
+ store i32 %add, i32* @res, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/postra-licm.ll b/test/CodeGen/X86/postra-licm.ll
index 5c93160125e7..329184a88ff0 100644
--- a/test/CodeGen/X86/postra-licm.ll
+++ b/test/CodeGen/X86/postra-licm.ll
@@ -70,7 +70,7 @@ bb26.preheader: ; preds = %imix_test.exit
bb23: ; preds = %imix_test.exit
unreachable
; Verify that there are no loads inside the loop.
-; X86-32: .align 4
+; X86-32: .p2align 4
; X86-32: %bb28
; X86-32-NOT: (%esp),
; X86-32-NOT: (%ebp),
@@ -152,7 +152,7 @@ entry:
bb.nph: ; preds = %entry
; X86-64: movq _map_4_to_16@GOTPCREL(%rip)
-; X86-64: .align 4
+; X86-64: .p2align 4
%tmp5 = zext i32 undef to i64 ; <i64> [#uses=1]
%tmp6 = add i64 %tmp5, 1 ; <i64> [#uses=1]
%tmp11 = shl i64 undef, 1 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/powi.ll b/test/CodeGen/X86/powi.ll
index 88b5f4eb14b0..fb7f570d6251 100644
--- a/test/CodeGen/X86/powi.ll
+++ b/test/CodeGen/X86/powi.ll
@@ -29,8 +29,9 @@ define double @pow_wrapper_optsize(double %a) optsize {
define double @pow_wrapper_minsize(double %a) minsize {
; CHECK-LABEL: pow_wrapper_minsize:
; CHECK: # BB#0:
-; CHECK-NEXT: movl $15, %edi
-; CHECK-NEXT: jmp
+; CHECK-NEXT: pushq $15
+; CHECK: popq %rdi
+; CHECK: jmp
%ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
ret double %ret
}
diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll
index 9fc754aa1128..d62aaf90587d 100644
--- a/test/CodeGen/X86/pr15267.ll
+++ b/test/CodeGen/X86/pr15267.ll
@@ -7,18 +7,14 @@ define <4 x i3> @test1(<4 x i3>* %in) nounwind {
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $3, %ecx
-; CHECK-NEXT: andl $7, %ecx
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: andl $7, %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
+; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $6, %ecx
-; CHECK-NEXT: andl $7, %ecx
; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; CHECK-NEXT: shrl $9, %eax
-; CHECK-NEXT: andl $7, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%ret = load <4 x i3>, <4 x i3>* %in, align 1
ret <4 x i3> %ret
@@ -30,18 +26,14 @@ define <4 x i1> @test2(<4 x i1>* %in) nounwind {
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl %ecx
-; CHECK-NEXT: andl $1, %ecx
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: andl $1, %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
+; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $2, %ecx
-; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; CHECK-NEXT: shrl $3, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%ret = load <4 x i1>, <4 x i1>* %in, align 1
ret <4 x i1> %ret
diff --git a/test/CodeGen/X86/pr16360.ll b/test/CodeGen/X86/pr16360.ll
index 1f73a4d43600..0d2878dc6af0 100644
--- a/test/CodeGen/X86/pr16360.ll
+++ b/test/CodeGen/X86/pr16360.ll
@@ -1,16 +1,17 @@
-; RUN: llc < %s -mcpu=pentium4 -mtriple=i686-pc-linux | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s
define i64 @foo(i32 %sum) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: orl $-67108864, %eax # imm = 0xFC000000
+; CHECK-NEXT: movl $1073741823, %edx # imm = 0x3FFFFFFF
+; CHECK-NEXT: retl
entry:
%conv = sext i32 %sum to i64
%shr = lshr i64 %conv, 2
%or = or i64 4611686018360279040, %shr
ret i64 %or
}
-
-; CHECK: foo
-; CHECK: shrl $2
-; CHECK: orl $-67108864
-; CHECK-NOT: movl $-1
-; CHECK: movl $1073741823
-; CHECK: ret
diff --git a/test/CodeGen/X86/pr17764.ll b/test/CodeGen/X86/pr17764.ll
index 7a3fd6d1810b..a44248ff3f59 100644
--- a/test/CodeGen/X86/pr17764.ll
+++ b/test/CodeGen/X86/pr17764.ll
@@ -1,10 +1,16 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s
define <16 x i16> @foo(<16 x i1> %mask, <16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0
+; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0
+; CHECK-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+;
%ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
ret <16 x i16> %ret
}
-; CHECK: foo
-; CHECK: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
-; CHECK: ret
diff --git a/test/CodeGen/X86/pr23664.ll b/test/CodeGen/X86/pr23664.ll
index a501c0db837e..155fc03de83b 100644
--- a/test/CodeGen/X86/pr23664.ll
+++ b/test/CodeGen/X86/pr23664.ll
@@ -9,6 +9,6 @@ define i2 @f(i32 %arg) {
; CHECK-LABEL: f:
; CHECK: addb %dil, %dil
; CHECK-NEXT: orb $1, %dil
-; CHECK-NEXT: movb %dil, %al
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
}
diff --git a/test/CodeGen/X86/pr2585.ll b/test/CodeGen/X86/pr2585.ll
new file mode 100644
index 000000000000..7796ee9a2628
--- /dev/null
+++ b/test/CodeGen/X86/pr2585.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+@0 = external constant <4 x i32> ; <<4 x i32>*>:0 [#uses=1]
+@1 = external constant <4 x i16> ; <<4 x i16>*>:1 [#uses=1]
+
+define internal void @PR2585() {
+; X32-LABEL: PR2585:
+; X32: # BB#0:
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT: movq %xmm0, __unnamed_2
+; X32-NEXT: retl
+;
+; X64-LABEL: PR2585:
+; X64: # BB#0:
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq %xmm0, {{.*}}(%rip)
+; X64-NEXT: retq
+ load <4 x i32>, <4 x i32>* @0, align 16 ; <<4 x i32>>:1 [#uses=1]
+ bitcast <4 x i32> %1 to <8 x i16> ; <<8 x i16>>:2 [#uses=1]
+ shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef > ; <<8 x i16>>:3 [#uses=1]
+ bitcast <8 x i16> %3 to <2 x i64> ; <<2 x i64>>:4 [#uses=1]
+ extractelement <2 x i64> %4, i32 0 ; <i64>:5 [#uses=1]
+ bitcast i64 %5 to <4 x i16> ; <<4 x i16>>:6 [#uses=1]
+ store <4 x i16> %6, <4 x i16>* @1, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/pr26350.ll b/test/CodeGen/X86/pr26350.ll
new file mode 100644
index 000000000000..6e87cb3e8b7a
--- /dev/null
+++ b/test/CodeGen/X86/pr26350.ll
@@ -0,0 +1,21 @@
+; RUN: llc -disable-constant-hoisting < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+@d = global i32 8, align 4
+
+define i32 @main() {
+entry:
+ %load = load i32, i32* @d, align 4
+ %conv1 = zext i32 %load to i64
+ %shl = shl i64 %conv1, 1
+ %mul = and i64 %shl, 4294967312
+ %cmp = icmp ugt i64 4294967295, %mul
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+; CHECK: main:
+; CHECK: movl d, %[[load:.*]]
+; CHECK: movl %[[load]], %[[copy:.*]]
+; CHECK: shrl $31, %[[copy]]
+; CHECK: addl %[[load]], %[[load]]
diff --git a/test/CodeGen/X86/pr2659.ll b/test/CodeGen/X86/pr2659.ll
index 8003588a2e84..debb13ee3e5d 100644
--- a/test/CodeGen/X86/pr2659.ll
+++ b/test/CodeGen/X86/pr2659.ll
@@ -21,7 +21,7 @@ forcond.preheader: ; preds = %entry
; CHECK: je
; There should be no moves required in the for loop body.
-; CHECK: %forbody
+; CHECK: %forbody{{$}}
; CHECK-NOT: mov
; CHECK: jbe
diff --git a/test/CodeGen/X86/pr26652.ll b/test/CodeGen/X86/pr26652.ll
new file mode 100644
index 000000000000..c47128a51e9a
--- /dev/null
+++ b/test/CodeGen/X86/pr26652.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86
+; PR26652
+
+define <2 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %0 = or <4 x i32> %a, %b
+ %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ ret <2 x i32> %1
+}
diff --git a/test/CodeGen/X86/pr26757.ll b/test/CodeGen/X86/pr26757.ll
new file mode 100644
index 000000000000..96cbb783ca01
--- /dev/null
+++ b/test/CodeGen/X86/pr26757.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc"
+
+declare void @throw()
+
+define void @test1() personality i32 (...)* @__CxxFrameHandler3 {
+ %e = alloca i8, align 4
+ invoke void @throw()
+ to label %.noexc unwind label %catch.dispatch
+
+.noexc:
+ unreachable
+
+catch.object.Exception:
+ %cp = catchpad within %cs [i8* null, i32 0, i8* %e]
+ catchret from %cp to label %catchhandler
+
+catch.dispatch:
+ %cs = catchswitch within none [label %catch.object.Exception] unwind to caller
+
+catchhandler:
+ call void @use(i8* %e)
+ ret void
+}
+
+; CHECK-LABEL: $handlerMap$0$test1:
+; CHECK: .long 0
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long -20
+
+declare void @use(i8*)
+
+declare i32 @__CxxFrameHandler3(...)
diff --git a/test/CodeGen/X86/pr26835.ll b/test/CodeGen/X86/pr26835.ll
new file mode 100644
index 000000000000..4fc73b885757
--- /dev/null
+++ b/test/CodeGen/X86/pr26835.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux"
+
+; CHECK-LABEL: foo
+; CHECK: div
+define i24 @foo(i24 %a, i24 %b) {
+ %r = urem i24 %a, %b
+ ret i24 %r
+}
diff --git a/test/CodeGen/X86/pr26870.ll b/test/CodeGen/X86/pr26870.ll
new file mode 100644
index 000000000000..2731ed2d0125
--- /dev/null
+++ b/test/CodeGen/X86/pr26870.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=i686-pc-windows-msvc18.0.0 -mcpu=pentium4
+
+define x86_thiscallcc i32* @fn4(i32* %this, i8* dereferenceable(1) %p1) {
+entry:
+ %DL = getelementptr inbounds i32, i32* %this, i32 0
+ %call.i = tail call x86_thiscallcc i64 @fn1(i32* %DL)
+ %getTypeAllocSize___trans_tmp_2.i = getelementptr inbounds i32, i32* %this, i32 0
+ %0 = load i32, i32* %getTypeAllocSize___trans_tmp_2.i, align 4
+ %call.i8 = tail call x86_thiscallcc i64 @fn1(i32* %DL)
+ %1 = insertelement <2 x i64> undef, i64 %call.i, i32 0
+ %2 = insertelement <2 x i64> %1, i64 %call.i8, i32 1
+ %3 = add nsw <2 x i64> %2, <i64 7, i64 7>
+ %4 = sdiv <2 x i64> %3, <i64 8, i64 8>
+ %5 = add nsw <2 x i64> %4, <i64 1, i64 1>
+ %6 = load i32, i32* %getTypeAllocSize___trans_tmp_2.i, align 4
+ %7 = insertelement <2 x i32> undef, i32 %0, i32 0
+ %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+ %9 = zext <2 x i32> %8 to <2 x i64>
+ %10 = srem <2 x i64> %5, %9
+ %11 = sub <2 x i64> %5, %10
+ %12 = trunc <2 x i64> %11 to <2 x i32>
+ %13 = extractelement <2 x i32> %12, i32 0
+ %14 = extractelement <2 x i32> %12, i32 1
+ %cmp = icmp eq i32 %13, %14
+ br i1 %cmp, label %if.then, label %cleanup
+
+if.then:
+ %call4 = tail call x86_thiscallcc i32* @fn3(i8* nonnull %p1)
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi i32* [ %call4, %if.then ], [ undef, %entry ]
+ ret i32* %retval.0
+}
+
+declare x86_thiscallcc i32* @fn3(i8*)
+declare x86_thiscallcc i64 @fn1(i32*)
diff --git a/test/CodeGen/X86/pr27071.ll b/test/CodeGen/X86/pr27071.ll
new file mode 100644
index 000000000000..13608d510770
--- /dev/null
+++ b/test/CodeGen/X86/pr27071.ll
@@ -0,0 +1,29 @@
+; RUN: llc -relocation-model pic < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd"
+
+@x1 = external thread_local global i32, align 4
+
+define void @x3() #0 {
+entry:
+ %0 = load i32, i32* @x1, align 4
+ %cond = icmp eq i32 %0, 92
+ br i1 %cond, label %sw.bb, label %sw.epilog
+
+sw.bb: ; preds = %entry
+ call void @x2(i8* null)
+ unreachable
+
+sw.epilog: ; preds = %entry
+ ret void
+}
+
+declare void @x2(i8*)
+
+attributes #0 = { optsize }
+
+; CHECK-LABEL: x3:
+; CHECK: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.L0$pb), %[[REG:.*]]
+; CHECK-NEXT: leal x1@TLSGD(,%[[REG]]), %eax
+; CHECK-NEXT: calll ___tls_get_addr@PLT
+; CHECK-NEXT: cmpl $92, (%eax)
diff --git a/test/CodeGen/X86/pr27501.ll b/test/CodeGen/X86/pr27501.ll
new file mode 100644
index 000000000000..bde41214471d
--- /dev/null
+++ b/test/CodeGen/X86/pr27501.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define void @test1(i64* %result.repack) personality i32 (...)* @__CxxFrameHandler3 {
+bb:
+ invoke void @may_throw(i32 1)
+ to label %postinvoke unwind label %cleanuppad
+; CHECK: movl $1, %ecx
+; CHECK: callq may_throw
+
+postinvoke: ; preds = %bb
+ store i64 19, i64* %result.repack, align 8
+
+; CHECK: movq $19, (%rsi)
+; CHECK: movl $2, %ecx
+; CHECK-NEXT: movq %rsi, -8(%rbp)
+; CHECK-NEXT: callq may_throw
+ invoke void @may_throw(i32 2)
+ to label %assertFailed unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %cleanuppad9, %postinvoke
+ %tmp3 = catchswitch within none [label %catch.object.Throwable] unwind label %cleanuppad
+
+catch.object.Throwable: ; preds = %catch.dispatch
+ %tmp2 = catchpad within %tmp3 [i8* null, i32 64, i8* null]
+ catchret from %tmp2 to label %catchhandler
+
+catchhandler: ; preds = %catch.object.Throwable
+ invoke void @may_throw(i32 3)
+ to label %try.success.or.caught unwind label %cleanuppad
+
+try.success.or.caught: ; preds = %catchhandler
+ invoke void @may_throw(i32 4)
+ to label %postinvoke27 unwind label %cleanuppad24
+; CHECK: movl $4, %ecx
+; CHECK-NEXT: callq may_throw
+
+postinvoke27: ; preds = %try.success.or.caught
+ store i64 42, i64* %result.repack, align 8
+; CHECK: movq -8(%rbp), %[[reload:r..]]
+; CHECK-NEXT: movq $42, (%[[reload]])
+ ret void
+
+cleanuppad24: ; preds = %try.success.or.caught
+ %tmp5 = cleanuppad within none []
+ cleanupret from %tmp5 unwind to caller
+
+cleanuppad: ; preds = %catchhandler, %catch.dispatch, %bb
+ %tmp1 = cleanuppad within none []
+ cleanupret from %tmp1 unwind to caller
+
+assertFailed: ; preds = %postinvoke
+ invoke void @may_throw(i32 5)
+ to label %postinvoke13 unwind label %cleanuppad9
+
+postinvoke13: ; preds = %assertFailed
+ unreachable
+
+cleanuppad9: ; preds = %assertFailed
+ %tmp4 = cleanuppad within none []
+ cleanupret from %tmp4 unwind label %catch.dispatch
+}
+
+declare void @may_throw(i32)
+
+declare i32 @__CxxFrameHandler3(...)
diff --git a/test/CodeGen/X86/pr27591.ll b/test/CodeGen/X86/pr27591.ll
new file mode 100644
index 000000000000..11f5de4956a4
--- /dev/null
+++ b/test/CodeGen/X86/pr27591.ll
@@ -0,0 +1,51 @@
+; RUN: llc -o - -O0 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1(i32 %x) #0 {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: kmovw %ecx, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movzbl %al, %edi
+; CHECK-NEXT: callq callee1
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+entry:
+ %tobool = icmp ne i32 %x, 0
+ call void @callee1(i1 zeroext %tobool)
+ ret void
+}
+
+define void @test2(i32 %x) #0 {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: kmovw %ecx, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: movb %cl, %al
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: movl $-1, %edx
+; CHECK-NEXT: cmovnel %edx, %edi
+; CHECK-NEXT: callq callee2
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+entry:
+ %tobool = icmp ne i32 %x, 0
+ call void @callee2(i1 signext %tobool)
+ ret void
+}
+
+declare void @callee1(i1 zeroext)
+declare void @callee2(i1 signext)
+
+attributes #0 = { nounwind "target-cpu"="skylake-avx512" }
diff --git a/test/CodeGen/X86/pr27681.mir b/test/CodeGen/X86/pr27681.mir
new file mode 100644
index 000000000000..9473a21d7327
--- /dev/null
+++ b/test/CodeGen/X86/pr27681.mir
@@ -0,0 +1,87 @@
+# RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=slm -run-pass post-RA-sched -o - %s | FileCheck %s
+#
+# Verify that the critical antidependence breaker does not consider
+# a high byte register as available as a replacement register
+# in a certain context.
+--- |
+
+ define void @main() { ret void }
+
+...
+---
+# CHECK-LABEL: main
+name: main
+allVRegsAllocated: true
+tracksRegLiveness: true
+frameInfo:
+ stackSize: 52
+fixedStack:
+ - { id: 0, type: spill-slot, offset: -20, size: 4, alignment: 4, callee-saved-register: '%esi' }
+ - { id: 1, type: spill-slot, offset: -16, size: 4, alignment: 4, callee-saved-register: '%edi' }
+ - { id: 2, type: spill-slot, offset: -12, size: 4, alignment: 4, callee-saved-register: '%ebx' }
+ - { id: 3, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '%ebp' }
+stack:
+ - { id: 0, type: spill-slot, offset: -53, size: 1, alignment: 1 }
+ - { id: 1, type: spill-slot, offset: -48, size: 4, alignment: 4 }
+ - { id: 2, type: spill-slot, offset: -32, size: 4, alignment: 4 }
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: %ebp, %ebx, %edi, %esi
+
+ frame-setup PUSH32r killed %ebp, implicit-def %esp, implicit %esp
+ frame-setup PUSH32r killed %ebx, implicit-def %esp, implicit %esp
+ frame-setup PUSH32r killed %edi, implicit-def %esp, implicit %esp
+ frame-setup PUSH32r killed %esi, implicit-def %esp, implicit %esp
+ %esp = frame-setup SUB32ri8 %esp, 36, implicit-def dead %eflags
+ %eax = MOV32ri 1
+ %ebp = MOV32ri 2
+ %ebx = MOV32ri 3
+ %ecx = MOV32ri 4
+ %edi = MOV32ri 5
+ %edx = MOV32ri 6
+
+ bb.1:
+ successors: %bb.3, %bb.2
+ liveins: %eax, %ebp, %ebx, %ecx, %edi, %edx
+
+ %ebp = SHR32rCL killed %ebp, implicit-def dead %eflags, implicit %cl
+ %ebp = XOR32rr killed %ebp, killed %ebx, implicit-def dead %eflags
+ TEST32rr %edx, %edx, implicit-def %eflags
+ %cl = SETNEr implicit %eflags
+ ; This %bl def is antidependent on the above use of %ebx
+ %bl = MOV8rm %esp, 1, _, 3, _ ; :: (load 1 from %stack.0)
+ %cl = OR8rr killed %cl, %bl, implicit-def dead %eflags
+ %esi = MOVZX32rr8 killed %cl
+ %esi = ADD32rr killed %esi, killed %edi, implicit-def dead %eflags
+ %ecx = MOV32rm %esp, 1, _, 24, _ ; :: (load 4 from %stack.2)
+ %edx = SAR32rCL killed %edx, implicit-def dead %eflags, implicit %cl
+ TEST32rr killed %edx, %edx, implicit-def %eflags
+ %cl = SETNEr implicit %eflags
+ ; Verify that removal of the %bl antidependence does not use %ch
+ ; as a replacement register.
+ ; CHECK: %cl = AND8rr %cl, killed %b
+ %cl = AND8rr killed %cl, killed %bl, implicit-def dead %eflags
+ CMP32ri8 %ebp, -1, implicit-def %eflags
+ %edx = MOV32ri 0
+ JE_1 %bb.3, implicit %eflags
+
+ bb.2:
+ successors: %bb.3
+ liveins: %cl, %eax, %ebp, %esi
+
+ OR32mr %esp, 1, _, 8, _, killed %eax, implicit-def %eflags ; :: (store 4 into %stack.1)
+ %dl = SETNEr implicit %eflags, implicit-def %edx
+
+ bb.3:
+ liveins: %cl, %ebp, %edx, %esi
+
+ %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+ %esp = ADD32ri8 %esp, 36, implicit-def dead %eflags
+ %esi = POP32r implicit-def %esp, implicit %esp
+ %edi = POP32r implicit-def %esp, implicit %esp
+ %ebx = POP32r implicit-def %esp, implicit %esp
+ %ebp = POP32r implicit-def %esp, implicit %esp
+ RET 0, %eax
+
+...
diff --git a/test/CodeGen/X86/pr28173.ll b/test/CodeGen/X86/pr28173.ll
new file mode 100644
index 000000000000..31ea4ffb5616
--- /dev/null
+++ b/test/CodeGen/X86/pr28173.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mattr=+avx512f < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Note that the kmovs should really *not* appear in the output; they are an
+; artifact of the current poor lowering. This is tracked by PR28175.
+
+define i64 @foo64(i1 zeroext %i, i32 %j) #0 {
+; CHECK-LABEL: foo64:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill
+; CHECK-NEXT: orq $-2, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ br label %bb
+
+bb:
+ %z = zext i1 %i to i64
+ %v = or i64 %z, -2
+ br label %end
+
+end:
+ ret i64 %v
+}
+
+define i16 @foo16(i1 zeroext %i, i32 %j) #0 {
+; CHECK-LABEL: foo16:
+; CHECK: # BB#0:
+; CHECK-NEXT: orl $65534, %edi # imm = 0xFFFE
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ br label %bb
+
+bb:
+ %z = zext i1 %i to i16
+ %v = or i16 %z, -2
+ br label %end
+
+end:
+ ret i16 %v
+}
diff --git a/test/CodeGen/X86/pr28444.ll b/test/CodeGen/X86/pr28444.ll
new file mode 100644
index 000000000000..452f01c166b7
--- /dev/null
+++ b/test/CodeGen/X86/pr28444.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 < %s | FileCheck %s
+; https://llvm.org/bugs/show_bug.cgi?id=28444
+
+; extract_vector_elt is allowed to have a different result type than
+; the vector scalar type.
+; This uses both
+; i8 = extract_vector_elt v1i1, Constant:i64<0>
+; i1 = extract_vector_elt v1i1, Constant:i64<0>
+
+
+; CHECK-LABEL: {{^}}extractelt_mismatch_vector_element_type:
+; CHECK: movb $1, %al
+; CHECK: movb %al
+; CHECK: movb %al
+define void @extractelt_mismatch_vector_element_type(i32 %arg) {
+bb:
+ %tmp = icmp ult i32 %arg, 0
+ %tmp2 = insertelement <1 x i1> undef, i1 true, i32 0
+ %tmp3 = select i1 %tmp, <1 x i1> undef, <1 x i1> %tmp2
+ %tmp6 = extractelement <1 x i1> %tmp3, i32 0
+ br label %bb1
+
+bb1:
+ store volatile <1 x i1> %tmp3, <1 x i1>* undef
+ store volatile i1 %tmp6, i1* undef
+ ret void
+}
diff --git a/test/CodeGen/X86/pr28472.ll b/test/CodeGen/X86/pr28472.ll
new file mode 100644
index 000000000000..9d2609022b3d
--- /dev/null
+++ b/test/CodeGen/X86/pr28472.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}same_dynamic_index_fp_vector_type:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+define float @same_dynamic_index_fp_vector_type(float %val, i32 %idx) {
+bb:
+ %tmp0 = insertelement <4 x float> undef, float %val, i32 %idx
+ %tmp1 = extractelement <4 x float> %tmp0, i32 %idx
+ ret float %tmp1
+}
diff --git a/test/CodeGen/X86/pr28489.ll b/test/CodeGen/X86/pr28489.ll
new file mode 100644
index 000000000000..898b0870b65d
--- /dev/null
+++ b/test/CodeGen/X86/pr28489.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=i686-pc-linux -O0 | FileCheck %s
+declare void @g(i32, i1)
+
+;CHECK-LABEL: f:
+;CHECK: cmpxchg8b
+;CHECK: sete %cl
+;CHECK: movzbl %cl
+define void @f(i64* %arg, i64 %arg1) {
+entry:
+ %tmp5 = cmpxchg i64* %arg, i64 %arg1, i64 %arg1 seq_cst seq_cst
+ %tmp7 = extractvalue { i64, i1 } %tmp5, 1
+ %tmp9 = zext i1 %tmp7 to i32
+ call void @g(i32 %tmp9, i1 %tmp7)
+ ret void
+}
diff --git a/test/CodeGen/X86/pr28515.ll b/test/CodeGen/X86/pr28515.ll
new file mode 100644
index 000000000000..1fad26506668
--- /dev/null
+++ b/test/CodeGen/X86/pr28515.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
+
+@0 = private constant [8 x i32] zeroinitializer
+
+; CHECK-LABEL: foo:
+; CHECK: movl %esi, (%rdi)
+; CHECK-NEXT: retq
+define void @foo(i32* %p, i32 %v, <8 x i1> %mask) {
+ store i32 %v, i32* %p
+ %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @0, i64 0, i64 0) to <8 x i32>*), i32 4, <8 x i1> %mask, <8 x i32> undef)
+ ret void
+}
+
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) #0
+
+attributes #0 = { argmemonly nounwind readonly }
diff --git a/test/CodeGen/X86/pr28560.ll b/test/CodeGen/X86/pr28560.ll
new file mode 100644
index 000000000000..d0061f670cf1
--- /dev/null
+++ b/test/CodeGen/X86/pr28560.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=i686-pc-linux -print-after=postrapseudos < %s 2>&1 | FileCheck %s
+
+; CHECK: MOV8rr %{{[A-D]}}L, %E[[R:[A-D]]]X<imp-use,kill>, %E[[R]]X<imp-def>
+define i32 @foo(i32 %i, i32 %k, i8* %p) {
+ %f = icmp ne i32 %i, %k
+ %s = zext i1 %f to i8
+ %ret = zext i1 %f to i32
+ br label %next
+next:
+ %d = add i8 %s, 5
+ store i8 %d, i8* %p
+ ret i32 %ret
+}
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
index 4dee5f8d7d2a..259d55b030e5 100644
--- a/test/CodeGen/X86/pr5145.ll
+++ b/test/CodeGen/X86/pr5145.ll
@@ -5,26 +5,26 @@ define void @atomic_maxmin_i8() {
; CHECK: atomic_maxmin_i8
%1 = atomicrmw max i8* @sc8, i8 5 acquire
; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: movsbl
-; CHECK: cmpl
+; CHECK: cmpb
+; CHECK: jg
; CHECK: lock cmpxchgb
; CHECK: jne [[LABEL1]]
%2 = atomicrmw min i8* @sc8, i8 6 acquire
; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: movsbl
-; CHECK: cmpl
+; CHECK: cmpb
+; CHECK: jl
; CHECK: lock cmpxchgb
; CHECK: jne [[LABEL3]]
%3 = atomicrmw umax i8* @sc8, i8 7 acquire
; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: movzbl
-; CHECK: cmpl
+; CHECK: cmpb
+; CHECK: ja
; CHECK: lock cmpxchgb
; CHECK: jne [[LABEL5]]
%4 = atomicrmw umin i8* @sc8, i8 8 acquire
; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: movzbl
-; CHECK: cmpl
+; CHECK: cmpb
+; CHECK: jb
; CHECK: lock cmpxchgb
; CHECK: jne [[LABEL7]]
ret void
diff --git a/test/CodeGen/X86/promote-i16.ll b/test/CodeGen/X86/promote-i16.ll
index 963bc1c2927a..7eb367480d76 100644
--- a/test/CodeGen/X86/promote-i16.ll
+++ b/test/CodeGen/X86/promote-i16.ll
@@ -1,11 +1,12 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
define signext i16 @foo(i16 signext %x) nounwind {
entry:
; CHECK-LABEL: foo:
-; CHECK-NOT: movzwl
-; CHECK: movswl 4(%esp), %eax
-; CHECK: xorl $21998, %eax
+; CHECK: movzwl 4(%esp), %eax
+; CHECK-NEXT: xorl $21998, %eax
+; CHECK-NEXT: # kill
+; CHECK-NEXT: retl
%0 = xor i16 %x, 21998
ret i16 %0
}
@@ -13,9 +14,10 @@ entry:
define signext i16 @bar(i16 signext %x) nounwind {
entry:
; CHECK-LABEL: bar:
-; CHECK-NOT: movzwl
-; CHECK: movswl 4(%esp), %eax
-; CHECK: xorl $-10770, %eax
+; CHECK: movzwl 4(%esp), %eax
+; CHECK-NEXT: xorl $54766, %eax
+; CHECK-NEXT: # kill
+; CHECK-NEXT: retl
%0 = xor i16 %x, 54766
ret i16 %0
}
diff --git a/test/CodeGen/X86/ps4-noreturn.ll b/test/CodeGen/X86/ps4-noreturn.ll
new file mode 100644
index 000000000000..4c14f2189325
--- /dev/null
+++ b/test/CodeGen/X86/ps4-noreturn.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-scei-ps4 | FileCheck %s
+
+declare i32 @personality(...)
+
+; Check that after the (implicitly noreturn) unwind call, there is
+; another instruction. It was easy to produce 'ud2', so we check for that.
+define void @foo1() personality i32 (...)* @personality {
+; CHECK-LABEL: foo1:
+; CHECK: .cfi_startproc
+; CHECK: callq bar
+; CHECK: retq
+; Check for 'ud2' between noreturn call and function end.
+; CHECK: callq _Unwind_Resume
+; CHECK-NEXT: ud2
+; CHECK-NEXT: .Lfunc_end0:
+ invoke void @bar()
+ to label %normal
+ unwind label %catch
+normal:
+ ret void
+catch:
+ %1 = landingpad { i8*, i32 } cleanup
+ resume { i8*, i32 } %1
+}
+
+declare void @bar() #0
+
+; Similar check after an explicit noreturn call.
+define void @foo2() {
+; CHECK-LABEL: foo2:
+; CHECK: callq bar
+; CHECK-NEXT: ud2
+; CHECK-NEXT: .Lfunc_end1:
+ tail call void @bar()
+ unreachable
+}
+
+attributes #0 = { noreturn }
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index 105a035be592..8364915fa0d0 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -1,10 +1,13 @@
-; RUN: llc < %s -march=x86-64 -mattr=+ssse3 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s
; Test that the pshufb mask comment is correct.
define <16 x i8> @test1(<16 x i8> %V) {
; CHECK-LABEL: test1:
-; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4]
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4]
+; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 2, i8 0, i8 0, i8 0, i8 0, i8 3, i8 0, i8 0, i8 0, i8 0, i8 4>)
ret <16 x i8> %1
}
@@ -13,7 +16,9 @@ define <16 x i8> @test1(<16 x i8> %V) {
define <16 x i8> @test2(<16 x i8> %V) {
; CHECK-LABEL: test2:
-; CHECK: pshufb {{.*}}# xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2]
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2]
+; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 15, i8 0, i8 0, i8 0, i8 0, i8 16, i8 0, i8 0, i8 0, i8 0, i8 17, i8 0, i8 0, i8 0, i8 0, i8 50>)
ret <16 x i8> %1
}
@@ -22,31 +27,64 @@ define <16 x i8> @test2(<16 x i8> %V) {
define <16 x i8> @test3(<16 x i8> %V) {
; CHECK-LABEL: test3:
-; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4]
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4]
+; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 127, i8 0, i8 2, i8 0, i8 0, i8 128, i8 0, i8 3, i8 0, i8 0, i8 255, i8 0, i8 4>)
ret <16 x i8> %1
}
; Test that we won't crash when the constant was reused for another instruction.
-define <16 x i8> @test4(<2 x i64>* %V) {
-; CHECK-LABEL: test4
-; CHECK: pshufb {{.*}}
- store <2 x i64> <i64 1084818905618843912, i64 506097522914230528>, <2 x i64>* %V, align 16
- %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> undef, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
- ret <16 x i8> %1
+define <16 x i8> @test4(<16 x i8> %V, <2 x i64>* %P) {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1084818905618843912,506097522914230528]
+; CHECK-NEXT: movaps %xmm1, (%rdi)
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: retq
+ %1 = insertelement <2 x i64> undef, i64 1084818905618843912, i32 0
+ %2 = insertelement <2 x i64> %1, i64 506097522914230528, i32 1
+ store <2 x i64> %2, <2 x i64>* %P, align 16
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> %3)
+ ret <16 x i8> %4
}
-define <16 x i8> @test5() {
-; CHECK-LABEL: test5
-; CHECK: pshufb {{.*}}
+define <16 x i8> @test5(<16 x i8> %V) {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: movd %rax, %xmm1
+; CHECK-NEXT: movaps %xmm1, (%rax)
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
+; CHECK-NEXT: movdqa %xmm1, (%rax)
+; CHECK-NEXT: pshufb %xmm1, %xmm0
+; CHECK-NEXT: retq
store <2 x i64> <i64 1, i64 0>, <2 x i64>* undef, align 16
%l = load <2 x i64>, <2 x i64>* undef, align 16
%shuffle = shufflevector <2 x i64> %l, <2 x i64> undef, <2 x i32> zeroinitializer
store <2 x i64> %shuffle, <2 x i64>* undef, align 16
%1 = load <16 x i8>, <16 x i8>* undef, align 16
- %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> undef, <16 x i8> %1)
+ %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> %1)
ret <16 x i8> %2
}
+; Test for a reused constant that would allow the pshufb to combine to a simpler instruction.
+
+define <16 x i8> @test6(<16 x i8> %V, <2 x i64>* %P) {
+; CHECK-LABEL: test6:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [217019414673948672,506380106026255364]
+; CHECK-NEXT: movaps %xmm1, (%rdi)
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: retq
+ %1 = insertelement <2 x i64> undef, i64 217019414673948672, i32 0
+ %2 = insertelement <2 x i64> %1, i64 506380106026255364, i32 1
+ store <2 x i64> %2, <2 x i64>* %P, align 16
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> %3)
+ ret <16 x i8> %4
+}
+
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index c6d118d6da69..a63d1c60e379 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -54,30 +54,21 @@ vector.ph:
}
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
-; SSE2-LABEL: test3:
-; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: psubusw %xmm0, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: test3:
-; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
-; SSSE3-NEXT: psubusw %xmm0, %xmm1
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
-; SSSE3-NEXT: retq
+; SSE-LABEL: test3:
+; SSE: ## BB#0: ## %vector.ph
+; SSE-NEXT: movd %esi, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: movdqu (%rdi), %xmm1
+; SSE-NEXT: psubusw %xmm0, %xmm1
+; SSE-NEXT: movdqu %xmm1, (%rdi)
+; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
@@ -159,9 +150,8 @@ define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: psubusb %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rdi)
@@ -304,46 +294,34 @@ vector.ph:
}
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
-; SSE2-LABEL: test9:
-; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: movdqu 16(%rdi), %xmm2
-; SSE2-NEXT: psubusw %xmm0, %xmm1
-; SSE2-NEXT: psubusw %xmm0, %xmm2
-; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: test9:
-; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
-; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
-; SSSE3-NEXT: psubusw %xmm0, %xmm1
-; SSSE3-NEXT: psubusw %xmm0, %xmm2
-; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
-; SSSE3-NEXT: retq
+; SSE-LABEL: test9:
+; SSE: ## BB#0: ## %vector.ph
+; SSE-NEXT: movd %esi, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: movdqu (%rdi), %xmm1
+; SSE-NEXT: movdqu 16(%rdi), %xmm2
+; SSE-NEXT: psubusw %xmm0, %xmm1
+; SSE-NEXT: psubusw %xmm0, %xmm2
+; SSE-NEXT: movdqu %xmm2, 16(%rdi)
+; SSE-NEXT: movdqu %xmm1, (%rdi)
+; SSE-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
-; AVX1-NEXT: vmovd %esi, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovd %esi, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
@@ -471,9 +449,8 @@ define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: psubusb %xmm0, %xmm1
diff --git a/test/CodeGen/X86/push-cfi-debug.ll b/test/CodeGen/X86/push-cfi-debug.ll
index cc00fab525ab..7f438e306e4d 100644
--- a/test/CodeGen/X86/push-cfi-debug.ll
+++ b/test/CodeGen/X86/push-cfi-debug.ll
@@ -23,7 +23,7 @@ declare x86_stdcallcc void @stdfoo(i32, i32) #0
; CHECK: .cfi_adjust_cfa_offset 4
; CHECK: calll stdfoo
; CHECK: .cfi_adjust_cfa_offset -8
-; CHECK: addl $8, %esp
+; CHECK: addl $20, %esp
; CHECK: .cfi_adjust_cfa_offset -8
define void @test1() #0 !dbg !4 {
entry:
@@ -38,11 +38,10 @@ attributes #0 = { nounwind optsize }
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250289)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250289)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "foo.c", directory: "foo")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test1", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, variables: !2)
+!4 = distinct !DISubprogram(name: "test1", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, variables: !2)
!5 = !DISubroutineType(types: !6)
!6 = !{null}
!7 = !{i32 2, !"Dwarf Version", i32 4}
diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll
index 6389708f42cc..f0772fc28c63 100644
--- a/test/CodeGen/X86/push-cfi.ll
+++ b/test/CodeGen/X86/push-cfi.ll
@@ -82,7 +82,7 @@ cleanup:
; LINUX-NEXT: Ltmp{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: call
-; LINUX-NEXT: addl $16, %esp
+; LINUX-NEXT: addl $28, %esp
; LINUX: .cfi_adjust_cfa_offset -16
; DARWIN-NOT: .cfi_escape
; DARWIN-NOT: pushl
diff --git a/test/CodeGen/X86/ragreedy-hoist-spill.ll b/test/CodeGen/X86/ragreedy-hoist-spill.ll
index 46b65bd24fc0..1d6b4f94731b 100644
--- a/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
; This testing case is reduced from 254.gap SyFgets function.
-; We make sure a spill is not hoisted to a hotter outer loop.
+; We make sure a spill is hoisted to a cold BB inside the hotter outer loop.
%struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] }
%struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 }
@@ -181,6 +181,10 @@ sw.bb474:
br i1 %cmp476, label %if.end517, label %do.body479.preheader
do.body479.preheader:
+ ; CHECK: do.body479.preheader
+  ; The spill is hoisted here. Although the loop at depth 1 is even hotter than the loop at depth 2, do.body479.preheader is cold.
+ ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+ ; CHECK: land.rhs485
%cmp4833314 = icmp eq i8 undef, 0
br i1 %cmp4833314, label %if.end517, label %land.rhs485
@@ -200,8 +204,8 @@ land.lhs.true490:
lor.rhs500:
; CHECK: lor.rhs500
- ; Make sure that we don't hoist the spill to outer loops.
- ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; Make sure the spill is hoisted to a cold preheader in the outer loop.
+ ; CHECK-NOT: movq %r{{.*}}, {{[0-9]+}}(%rsp)
; CHECK: callq {{.*}}maskrune
%call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
br i1 undef, label %land.lhs.true504, label %do.body479.backedge
diff --git a/test/CodeGen/X86/reduce-trunc-shl.ll b/test/CodeGen/X86/reduce-trunc-shl.ll
new file mode 100644
index 000000000000..74612df4dd36
--- /dev/null
+++ b/test/CodeGen/X86/reduce-trunc-shl.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+define void @trunc_shl_7_v4i32_v4i64(<4 x i32> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+; SSE2-LABEL: trunc_shl_7_v4i32_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: pslld $7, %xmm1
+; SSE2-NEXT: movdqa %xmm1, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: trunc_shl_7_v4i32_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpslld $7, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in
+ %shl = shl <4 x i64> %val, <i64 7, i64 7, i64 7, i64 7>
+ %trunc = trunc <4 x i64> %shl to <4 x i32>
+ store <4 x i32> %trunc, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
index 016b0d13fc4a..ba8ff1bc1819 100644
--- a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
+++ b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -37,7 +37,7 @@ declare noalias i32* @make_data()
; We use to produce a useless copy here and move %data in another temporary register.
; CHECK-NOT: movq [[ARG1]]
; End of the first basic block.
-; CHECK: .align
+; CHECK: .p2align
; Now check that %data is used in an address computation.
; CHECK: leaq ([[ARG1]]
define %struct._list* @make_list(i32* nocapture readonly %data, i32* nocapture %value, i32* nocapture %all) {
diff --git a/test/CodeGen/X86/rem.ll b/test/CodeGen/X86/rem.ll
index 733b7942a6d5..cc591e5ac00b 100644
--- a/test/CodeGen/X86/rem.ll
+++ b/test/CodeGen/X86/rem.ll
@@ -1,37 +1,84 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
-; CHECK-LABEL: test1:
-; CHECK-NOT: div
define i32 @test1(i32 %X) {
- %tmp1 = srem i32 %X, 255 ; <i32> [#uses=1]
- ret i32 %tmp1
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: imull %edx
+; CHECK-NEXT: addl %ecx, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $7, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shll $8, %eax
+; CHECK-NEXT: subl %edx, %eax
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+;
+ %tmp1 = srem i32 %X, 255
+ ret i32 %tmp1
}
-; CHECK-LABEL: test2:
-; CHECK-NOT: div
define i32 @test2(i32 %X) {
- %tmp1 = srem i32 %X, 256 ; <i32> [#uses=1]
- ret i32 %tmp1
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: sarl $31, %ecx
+; CHECK-NEXT: shrl $24, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: andl $-256, %ecx
+; CHECK-NEXT: subl %ecx, %eax
+; CHECK-NEXT: retl
+;
+ %tmp1 = srem i32 %X, 256
+ ret i32 %tmp1
}
-; CHECK-LABEL: test3:
-; CHECK-NOT: div
define i32 @test3(i32 %X) {
- %tmp1 = urem i32 %X, 255 ; <i32> [#uses=1]
- ret i32 %tmp1
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: mull %edx
+; CHECK-NEXT: shrl $7, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shll $8, %eax
+; CHECK-NEXT: subl %edx, %eax
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+;
+ %tmp1 = urem i32 %X, 255
+ ret i32 %tmp1
}
-; CHECK-LABEL: test4:
-; CHECK-NOT: div
define i32 @test4(i32 %X) {
- %tmp1 = urem i32 %X, 256 ; <i32> [#uses=1]
- ret i32 %tmp1
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
+;
+ %tmp1 = urem i32 %X, 256
+ ret i32 %tmp1
}
-; CHECK-LABEL: test5:
-; CHECK-NOT: cltd
define i32 @test5(i32 %X) nounwind readnone {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $41, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: idivl {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retl
+;
entry:
- %0 = srem i32 41, %X
- ret i32 %0
+ %0 = srem i32 41, %X
+ ret i32 %0
}
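The regenerated checks spell out the reciprocal-multiplication lowering that the old "CHECK-NOT: div" lines only asserted indirectly. For the unsigned case in test3, the magic constant is $M = \lceil 2^{39}/255 \rceil = \texttt{0x80808081}$, and for every 32-bit unsigned $x$

    $q \;=\; \lfloor xM/2^{39} \rfloor \;=\; \lfloor x/255 \rfloor, \qquad x \bmod 255 \;=\; x - 255q \;=\; x - ((q \ll 8) - q),$

which is exactly the mull / shrl $7 / shll $8 / subl sequence above (mull leaves the high 32 bits of the product in %edx, i.e. the division by 2^32; shrl $7 contributes the remaining factor of 2^7). The signed srem in test1 uses the same constant via imull, plus the add-back of the dividend and the shrl $31 / sarl $7 / addl steps that round the quotient toward zero.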
diff --git a/test/CodeGen/X86/rem_crash.ll b/test/CodeGen/X86/rem_crash.ll
index 8363b22ab65f..a5529a769a0b 100644
--- a/test/CodeGen/X86/rem_crash.ll
+++ b/test/CodeGen/X86/rem_crash.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s
+; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86-64
define i8 @test_minsize_uu8(i8 %x) minsize optsize {
entry:
diff --git a/test/CodeGen/X86/return-ext.ll b/test/CodeGen/X86/return-ext.ll
new file mode 100644
index 000000000000..ef160f43b4aa
--- /dev/null
+++ b/test/CodeGen/X86/return-ext.ll
@@ -0,0 +1,138 @@
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -fixup-byte-word-insts=0 | \
+; RUN: FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -fixup-byte-word-insts=1 | \
+; RUN: FileCheck -check-prefix=CHECK -check-prefix=BWON %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -fixup-byte-word-insts=0 | \
+; RUN: FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -fixup-byte-word-insts=1 | \
+; RUN: FileCheck -check-prefix=CHECK -check-prefix=BWON %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -fixup-byte-word-insts=0 | \
+; RUN: FileCheck -check-prefix=DARWIN -check-prefix=DARWIN-BWOFF %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -fixup-byte-word-insts=1 | \
+; RUN: FileCheck -check-prefix=DARWIN -check-prefix=DARWIN-BWON %s
+
+
+@x = common global i32 0, align 4
+
+define zeroext i1 @unsigned_i1() {
+entry:
+ %0 = load i32, i32* @x
+ %cmp = icmp eq i32 %0, 42
+ ret i1 %cmp
+
+; Unsigned i1 return values are not extended.
+; CHECK-LABEL: unsigned_i1:
+; CHECK: cmp
+; CHECK-NEXT: sete
+; CHECK-NEXT: ret
+}
+
+define zeroext i8 @unsigned_i8() {
+entry:
+ %0 = load i32, i32* @x
+ %cmp = icmp eq i32 %0, 42
+ %retval = zext i1 %cmp to i8
+ ret i8 %retval
+
+; Unsigned i8 return values are not extended.
+; CHECK-LABEL: unsigned_i8:
+; CHECK: cmp
+; CHECK-NEXT: sete
+; CHECK-NEXT: ret
+
+; Except on Darwin, for legacy reasons.
+; DARWIN-LABEL: unsigned_i8:
+; DARWIN: xorl
+; DARWIN-NEXT: cmp
+; DARWIN-NEXT: sete
+; DARWIN-NEXT: ret
+}
+
+define signext i8 @signed_i8() {
+entry:
+ %0 = load i32, i32* @x
+ %cmp = icmp eq i32 %0, 42
+ %retval = zext i1 %cmp to i8
+ ret i8 %retval
+
+; Signed i8 return values are not extended.
+; CHECK-LABEL: signed_i8:
+; CHECK: cmp
+; CHECK-NEXT: sete
+; CHECK-NEXT: ret
+
+; Except on Darwin, for legacy reasons.
+; DARWIN-LABEL: signed_i8:
+; DARWIN: xorl
+; DARWIN-NEXT: cmp
+; DARWIN-NEXT: sete
+; DARWIN-NEXT: ret
+}
+
+@a = common global i16 0
+@b = common global i16 0
+define zeroext i16 @unsigned_i16() {
+entry:
+ %0 = load i16, i16* @a
+ %1 = load i16, i16* @b
+ %add = add i16 %1, %0
+ ret i16 %add
+
+; i16 return values are not extended.
+; CHECK-LABEL: unsigned_i16:
+; BWOFF: movw
+; BWON: movzwl
+; CHECK-NEXT: addw
+; CHECK-NEXT: ret
+
+; Except on Darwin, for legacy reasons.
+; DARWIN-LABEL: unsigned_i16:
+; DARWIN-BWOFF: movw
+; DARWIN-BWON: movzwl
+; DARWIN-NEXT: addw
+; DARWIN-NEXT: movzwl
+; DARWIN-NEXT: ret
+}
+
+
+define i32 @use_i1() {
+entry:
+ %0 = call i1 @unsigned_i1();
+ %1 = zext i1 %0 to i32
+ ret i32 %1
+
+; The high 24 bits of %eax from a function returning i1 are undefined.
+; CHECK-LABEL: use_i1:
+; CHECK: call
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: ret
+}
+
+define i32 @use_i8() {
+entry:
+ %0 = call i8 @unsigned_i8();
+ %1 = zext i8 %0 to i32
+ ret i32 %1
+
+; The high 24 bits of %eax from a function returning i8 are undefined.
+; CHECK-LABEL: use_i8:
+; CHECK: call
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: ret
+}
+
+define i32 @use_i16() {
+entry:
+ %0 = call i16 @unsigned_i16();
+ %1 = zext i16 %0 to i32
+ ret i32 %1
+
+; The high 16 bits of %eax from a function returning i16 are undefined.
+; CHECK-LABEL: use_i16:
+; CHECK: call
+; CHECK-NEXT: movzwl
+; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: ret
+}
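The use_i1/use_i8/use_i16 callers check the flip side of the rule stated in the comments above: because a Linux callee only guarantees the low 8 (or 16) bits of %eax, the caller has to widen the return value itself. A minimal C sketch of the same situation, where flag_is_42 is a hypothetical external predicate rather than anything from the test:

    /* The callee may leave the upper 24 bits of %eax undefined for a
       _Bool return, so the caller is expected to emit a movzbl before
       using the value as an int. */
    extern _Bool flag_is_42(void);

    int use_flag(void) {
        return (int)flag_is_42();   /* call; movzbl %al, %eax; ret */
    }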
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
index 76eb9514f02c..fb06cac45fff 100644
--- a/test/CodeGen/X86/rtm.ll
+++ b/test/CodeGen/X86/rtm.ll
@@ -2,7 +2,8 @@
declare i32 @llvm.x86.xbegin() nounwind
declare void @llvm.x86.xend() nounwind
-declare void @llvm.x86.xabort(i8) noreturn nounwind
+declare void @llvm.x86.xabort(i8) nounwind
+declare void @f1()
define i32 @test_xbegin() nounwind uwtable {
entry:
@@ -24,7 +25,20 @@ entry:
define void @test_xabort() nounwind uwtable {
entry:
tail call void @llvm.x86.xabort(i8 2)
- unreachable
+ ret void
; CHECK: test_xabort
; CHECK: xabort $2
}
+
+define void @f2(i32 %x) nounwind uwtable {
+entry:
+ %x.addr = alloca i32, align 4
+ store i32 %x, i32* %x.addr, align 4
+ call void @llvm.x86.xabort(i8 1)
+ call void @f1()
+ ret void
+; CHECK-LABEL: f2
+; CHECK: xabort $1
+; CHECK: callq f1
+}
+ 
\ No newline at end of file
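The rtm.ll update reflects the architectural semantics of XABORT: outside of a transactional region the instruction is simply a nop, so llvm.x86.xabort can no longer be modelled as noreturn, and the code following it (the call to f1 in the new f2 test) must survive. A rough C equivalent of f2, purely for illustration (needs -mrtm; _xabort takes a constant immediate):

    #include <immintrin.h>

    extern void f1(void);

    void f2(int x) {
        (void)x;        /* the test stores x to a local; irrelevant here */
        _xabort(1);     /* aborts a transaction if one is active, else a nop */
        f1();           /* still reachable, must not be dropped as dead code */
    }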
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
new file mode 100644
index 000000000000..17a933e50d0d
--- /dev/null
+++ b/test/CodeGen/X86/sad.ll
@@ -0,0 +1,1001 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @sad_16i8() nounwind {
+; SSE2-LABEL: sad_16i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB0_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqu a+1024(%rax), %xmm2
+; SSE2-NEXT: movdqu b+1024(%rax), %xmm3
+; SSE2-NEXT: psadbw %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB0_1
+; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad_16i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB0_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
+; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: addq $4, %rax
+; AVX2-NEXT: jne .LBB0_1
+; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad_16i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB0_1: # %vector.body
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1
+; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: addq $4, %rax
+; AVX512F-NEXT: jne .LBB0_1
+; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: sad_16i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB0_1: # %vector.body
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1
+; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: addq $4, %rax
+; AVX512BW-NEXT: jne .LBB0_1
+; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: retq
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+ %1 = bitcast i8* %0 to <16 x i8>*
+ %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
+ %2 = zext <16 x i8> %wide.load to <16 x i32>
+ %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+ %4 = bitcast i8* %3 to <16 x i8>*
+ %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
+ %5 = zext <16 x i8> %wide.load1 to <16 x i32>
+ %6 = sub nsw <16 x i32> %2, %5
+ %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %8 = sub nsw <16 x i32> zeroinitializer, %6
+ %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
+ %10 = add nsw <16 x i32> %9, %vec.phi
+ %index.next = add i64 %index, 4
+ %11 = icmp eq i64 %index.next, 1024
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa = phi <16 x i32> [ %10, %vector.body ]
+ %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
+ %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
+ %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
+ %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
+ %12 = extractelement <16 x i32> %bin.rdx4, i32 0
+ ret i32 %12
+}
+
+define i32 @sad_32i8() nounwind {
+; SSE2-LABEL: sad_32i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm14, %xmm14
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB1_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm1
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm2
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm9, %xmm7
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm11, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm11, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: pxor %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm10, %xmm4
+; SSE2-NEXT: paddd %xmm1, %xmm15
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: paddd %xmm6, %xmm0
+; SSE2-NEXT: paddd %xmm7, %xmm14
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB1_1
+; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: paddd %xmm15, %xmm4
+; SSE2-NEXT: paddd %xmm14, %xmm1
+; SSE2-NEXT: paddd %xmm13, %xmm0
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad_32i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB1_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: addq $4, %rax
+; AVX2-NEXT: jne .LBB1_1
+; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad_32i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB1_1: # %vector.body
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: addq $4, %rax
+; AVX512F-NEXT: jne .LBB1_1
+; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: sad_32i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB1_1: # %vector.body
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: addq $4, %rax
+; AVX512BW-NEXT: jne .LBB1_1
+; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: retq
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+ %1 = bitcast i8* %0 to <32 x i8>*
+ %wide.load = load <32 x i8>, <32 x i8>* %1, align 32
+ %2 = zext <32 x i8> %wide.load to <32 x i32>
+ %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+ %4 = bitcast i8* %3 to <32 x i8>*
+ %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32
+ %5 = zext <32 x i8> %wide.load1 to <32 x i32>
+ %6 = sub nsw <32 x i32> %2, %5
+ %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %8 = sub nsw <32 x i32> zeroinitializer, %6
+ %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
+ %10 = add nsw <32 x i32> %9, %vec.phi
+ %index.next = add i64 %index, 4
+ %11 = icmp eq i64 %index.next, 1024
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa = phi <32 x i32> [ %10, %vector.body ]
+ %rdx.shuf = shufflevector <32 x i32> %.lcssa, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <32 x i32> %.lcssa, %rdx.shuf
+ %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
+ %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
+ %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
+ %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
+ %12 = extractelement <32 x i32> %bin.rdx5, i32 0
+ ret i32 %12
+}
+
+define i32 @sad_avx64i8() nounwind {
+; SSE2-LABEL: sad_avx64i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: subq $216, %rsp
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB2_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm13
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm10
+; SSE2-NEXT: movdqa a+1072(%rax), %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm12, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm7
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
+; SSE2-NEXT: movdqa b+1056(%rax), %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm7, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm7, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm11, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm2, %xmm15
+; SSE2-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm14, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm14, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm14
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm0, %xmm11
+; SSE2-NEXT: movdqa b+1072(%rax), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm0, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm5, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm2, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm7, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm7
+; SSE2-NEXT: pxor %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm9
+; SSE2-NEXT: pxor %xmm2, %xmm9
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm11
+; SSE2-NEXT: pxor %xmm2, %xmm11
+; SSE2-NEXT: movdqa %xmm14, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm14
+; SSE2-NEXT: pxor %xmm2, %xmm14
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm15, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm15
+; SSE2-NEXT: pxor %xmm2, %xmm15
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm2, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm12, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm12
+; SSE2-NEXT: pxor %xmm2, %xmm12
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm13, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm13
+; SSE2-NEXT: pxor %xmm2, %xmm13
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm13, %xmm5
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm13
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm15, %xmm3
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm14, %xmm15
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm9, %xmm4
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm5
+; SSE2-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm7
+; SSE2-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB2_1
+; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: paddd %xmm15, %xmm3
+; SSE2-NEXT: paddd %xmm5, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm13
+; SSE2-NEXT: paddd %xmm11, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm12
+; SSE2-NEXT: paddd %xmm3, %xmm10
+; SSE2-NEXT: paddd %xmm13, %xmm10
+; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: paddd %xmm5, %xmm12
+; SSE2-NEXT: paddd %xmm10, %xmm12
+; SSE2-NEXT: paddd %xmm6, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,0,1]
+; SSE2-NEXT: paddd %xmm12, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: addq $216, %rsp
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad_avx64i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5
+; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB2_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
+; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15
+; AVX2-NEXT: vpabsd %ymm8, %ymm8
+; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
+; AVX2-NEXT: vpabsd %ymm14, %ymm8
+; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
+; AVX2-NEXT: vpabsd %ymm13, %ymm8
+; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
+; AVX2-NEXT: vpabsd %ymm12, %ymm8
+; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vpabsd %ymm11, %ymm8
+; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
+; AVX2-NEXT: vpabsd %ymm10, %ymm8
+; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm15, %ymm8
+; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: addq $4, %rax
+; AVX2-NEXT: jne .LBB2_1
+; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad_avx64i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB2_1: # %vector.body
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7
+; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6
+; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5
+; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4
+; AVX512F-NEXT: vpabsd %zmm4, %zmm4
+; AVX512F-NEXT: vpabsd %zmm5, %zmm5
+; AVX512F-NEXT: vpabsd %zmm6, %zmm6
+; AVX512F-NEXT: vpabsd %zmm7, %zmm7
+; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3
+; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2
+; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1
+; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: addq $4, %rax
+; AVX512F-NEXT: jne .LBB2_1
+; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: sad_avx64i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB2_1: # %vector.body
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm2
+; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: addq $4, %rax
+; AVX512BW-NEXT: jne .LBB2_1
+; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: retq
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+ %1 = bitcast i8* %0 to <64 x i8>*
+ %wide.load = load <64 x i8>, <64 x i8>* %1, align 64
+ %2 = zext <64 x i8> %wide.load to <64 x i32>
+ %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+ %4 = bitcast i8* %3 to <64 x i8>*
+ %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64
+ %5 = zext <64 x i8> %wide.load1 to <64 x i32>
+ %6 = sub nsw <64 x i32> %2, %5
+ %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %8 = sub nsw <64 x i32> zeroinitializer, %6
+ %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8
+ %10 = add nsw <64 x i32> %9, %vec.phi
+ %index.next = add i64 %index, 4
+ %11 = icmp eq i64 %index.next, 1024
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa = phi <64 x i32> [ %10, %vector.body ]
+ %rdx.shuf = shufflevector <64 x i32> %.lcssa, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <64 x i32> %.lcssa, %rdx.shuf
+ %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2
+ %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3
+ %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4
+ %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5
+ %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6
+ %12 = extractelement <64 x i32> %bin.rdx6, i32 0
+ ret i32 %12
+}
+
+define i32 @sad_2i8() nounwind {
+; SSE2-LABEL: sad_2i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB3_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psadbw %xmm3, %xmm2
+; SSE2-NEXT: paddq %xmm2, %xmm0
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB3_1
+; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad_2i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB3_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: addq $4, %rax
+; AVX2-NEXT: jne .LBB3_1
+; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad_2i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB3_1: # %vector.body
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: addq $4, %rax
+; AVX512F-NEXT: jne .LBB3_1
+; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: sad_2i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB3_1: # %vector.body
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: addq $4, %rax
+; AVX512BW-NEXT: jne .LBB3_1
+; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: retq
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+ %1 = bitcast i8* %0 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %1, align 4
+ %2 = zext <2 x i8> %wide.load to <2 x i32>
+ %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+ %4 = bitcast i8* %3 to <2 x i8>*
+ %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4
+ %5 = zext <2 x i8> %wide.load1 to <2 x i32>
+ %6 = sub nsw <2 x i32> %2, %5
+ %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
+ %8 = sub nsw <2 x i32> zeroinitializer, %6
+ %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
+ %10 = add nsw <2 x i32> %9, %vec.phi
+ %index.next = add i64 %index, 4
+ %11 = icmp eq i64 %index.next, 1024
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa = phi <2 x i32> [ %10, %vector.body ]
+ %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
+ %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf
+ %12 = extractelement <2 x i32> %bin.rdx, i32 0
+ ret i32 %12
+}
+
diff --git a/test/CodeGen/X86/safestack_ssp.ll b/test/CodeGen/X86/safestack_ssp.ll
new file mode 100644
index 000000000000..5a1a465158cf
--- /dev/null
+++ b/test/CodeGen/X86/safestack_ssp.ll
@@ -0,0 +1,27 @@
+; Test codegen pipeline for SafeStack + StackProtector combination.
+; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
+; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+
+define void @_Z1fv() safestack sspreq {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @_Z7CapturePi(i32* nonnull %x)
+ ret void
+}
+
+declare void @_Z7CapturePi(i32*)
+
+; LINUX-X64-DAG: movq __safestack_unsafe_stack_ptr@GOTTPOFF(%rip), %[[A:.*]]
+; LINUX-X64-DAG: movq %fs:(%[[A]]), %[[B:.*]]
+; LINUX-X64-DAG: movq %fs:40, %[[COOKIE:.*]]
+; LINUX-X64-DAG: leaq -16(%[[B]]), %[[C:.*]]
+; LINUX-X64-DAG: movq %[[C]], %fs:(%[[A]])
+; LINUX-X64-DAG: movq %[[COOKIE]], -8(%[[B]])
+
+; LINUX-I386-DAG: movl __safestack_unsafe_stack_ptr@INDNTPOFF, %[[A:.*]]
+; LINUX-I386-DAG: movl %gs:(%[[A]]), %[[B:.*]]
+; LINUX-I386-DAG: movl %gs:20, %[[COOKIE:.*]]
+; LINUX-I386-DAG: leal -16(%[[B]]), %[[C:.*]]
+; LINUX-I386-DAG: movl %[[C]], %gs:(%[[A]])
+; LINUX-I386-DAG: movl %[[COOKIE]], -4(%[[B]])
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 55eaab91da50..a0cd1824629a 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -44,7 +44,7 @@ define void @test_basic() #0 {
; X32-Linux-NEXT: ja .LBB0_2
; X32-Linux: pushl $0
-; X32-Linux-NEXT: pushl $60
+; X32-Linux-NEXT: pushl $44
; X32-Linux-NEXT: calll __morestack
; X32-Linux-NEXT: ret
@@ -105,7 +105,7 @@ define void @test_basic() #0 {
; X32-MinGW-NEXT: ja LBB0_2
; X32-MinGW: pushl $0
-; X32-MinGW-NEXT: pushl $48
+; X32-MinGW-NEXT: pushl $40
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -135,7 +135,7 @@ define void @test_basic() #0 {
; X32-DFlyBSD-NEXT: ja .LBB0_2
; X32-DFlyBSD: pushl $0
-; X32-DFlyBSD-NEXT: pushl $48
+; X32-DFlyBSD-NEXT: pushl $40
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
@@ -162,7 +162,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
; X32-Linux-NEXT: ja .LBB1_2
; X32-Linux: pushl $4
-; X32-Linux-NEXT: pushl $60
+; X32-Linux-NEXT: pushl $44
; X32-Linux-NEXT: calll __morestack
; X32-Linux-NEXT: ret
@@ -209,7 +209,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
; X32-MinGW-NEXT: ja LBB1_2
; X32-MinGW: pushl $4
-; X32-MinGW-NEXT: pushl $52
+; X32-MinGW-NEXT: pushl $44
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -238,7 +238,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
; X32-DFlyBSD-NEXT: ja .LBB1_2
; X32-DFlyBSD: pushl $4
-; X32-DFlyBSD-NEXT: pushl $52
+; X32-DFlyBSD-NEXT: pushl $44
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
@@ -305,12 +305,12 @@ define void @test_large() #0 {
; X64-Darwin-NEXT: callq ___morestack
; X64-Darwin-NEXT: ret
-; X32-MinGW: leal -40008(%esp), %ecx
+; X32-MinGW: leal -40000(%esp), %ecx
; X32-MinGW-NEXT: cmpl %fs:20, %ecx
; X32-MinGW-NEXT: ja LBB2_2
; X32-MinGW: pushl $0
-; X32-MinGW-NEXT: pushl $40008
+; X32-MinGW-NEXT: pushl $40000
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -333,12 +333,12 @@ define void @test_large() #0 {
; X64-FreeBSD-NEXT: callq __morestack
; X64-FreeBSD-NEXT: ret
-; X32-DFlyBSD: leal -40008(%esp), %ecx
+; X32-DFlyBSD: leal -40000(%esp), %ecx
; X32-DFlyBSD-NEXT: cmpl %fs:16, %ecx
; X32-DFlyBSD-NEXT: ja .LBB2_2
; X32-DFlyBSD: pushl $0
-; X32-DFlyBSD-NEXT: pushl $40008
+; X32-DFlyBSD-NEXT: pushl $40000
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
@@ -364,7 +364,7 @@ define fastcc void @test_fastcc() #0 {
; X32-Linux-NEXT: ja .LBB3_2
; X32-Linux: pushl $0
-; X32-Linux-NEXT: pushl $60
+; X32-Linux-NEXT: pushl $44
; X32-Linux-NEXT: calll __morestack
; X32-Linux-NEXT: ret
@@ -415,7 +415,7 @@ define fastcc void @test_fastcc() #0 {
; X32-MinGW-NEXT: ja LBB3_2
; X32-MinGW: pushl $0
-; X32-MinGW-NEXT: pushl $48
+; X32-MinGW-NEXT: pushl $40
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -445,7 +445,7 @@ define fastcc void @test_fastcc() #0 {
; X32-DFlyBSD-NEXT: ja .LBB3_2
; X32-DFlyBSD: pushl $0
-; X32-DFlyBSD-NEXT: pushl $48
+; X32-DFlyBSD-NEXT: pushl $40
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
@@ -524,12 +524,12 @@ define fastcc void @test_fastcc_large() #0 {
; X32-MinGW-LABEL: test_fastcc_large:
-; X32-MinGW: leal -40008(%esp), %eax
+; X32-MinGW: leal -40000(%esp), %eax
; X32-MinGW-NEXT: cmpl %fs:20, %eax
; X32-MinGW-NEXT: ja LBB4_2
; X32-MinGW: pushl $0
-; X32-MinGW-NEXT: pushl $40008
+; X32-MinGW-NEXT: pushl $40000
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -557,12 +557,12 @@ define fastcc void @test_fastcc_large() #0 {
; X32-DFlyBSD-LABEL: test_fastcc_large:
-; X32-DFlyBSD: leal -40008(%esp), %eax
+; X32-DFlyBSD: leal -40000(%esp), %eax
; X32-DFlyBSD-NEXT: cmpl %fs:16, %eax
; X32-DFlyBSD-NEXT: ja .LBB4_2
; X32-DFlyBSD: pushl $0
-; X32-DFlyBSD-NEXT: pushl $40008
+; X32-DFlyBSD-NEXT: pushl $40000
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
diff --git a/test/CodeGen/X86/seh-catch-all-win32.ll b/test/CodeGen/X86/seh-catch-all-win32.ll
index e8da7ab971b1..5ecf37e5248c 100644
--- a/test/CodeGen/X86/seh-catch-all-win32.ll
+++ b/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s
; 32-bit catch-all has to use a filter function because that's how it saves the
; exception code.
@@ -75,13 +75,13 @@ entry:
; CHECK: movl -24(%ebp), %esp
; EH state -1
; CHECK: movl [[code_offs]](%ebp), %[[code:[a-z]+]]
-; CHECK-DAG: movl %[[code]], 4(%esp)
-; CHECK-DAG: movl $_str, (%esp)
+; CHECK: pushl %[[code]]
+; CHECK: pushl $_str
; CHECK: calll _printf
; CHECK: .section .xdata,"dr"
; CHECK: Lmain$parent_frame_offset = [[reg_offs]]
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK: L__ehtable$main
; CHECK-NEXT: .long -1
; CHECK-NEXT: .long _filt$main
diff --git a/test/CodeGen/X86/seh-safe-div-win32.ll b/test/CodeGen/X86/seh-safe-div-win32.ll
index 643af3a472fb..3d46e3867625 100644
--- a/test/CodeGen/X86/seh-safe-div-win32.ll
+++ b/test/CodeGen/X86/seh-safe-div-win32.ll
@@ -65,13 +65,13 @@ __try.cont:
; Landing pad code
-; CHECK: [[handler0:LBB0_[0-9]+]]: # %handler0
+; CHECK: [[handler1:LBB0_[0-9]+]]: # %handler1
; Restore SP
; CHECK: movl {{.*}}(%ebp), %esp
; CHECK: calll _puts
; CHECK: jmp [[cont_bb]]
-; CHECK: [[handler1:LBB0_[0-9]+]]: # %handler1
+; CHECK: [[handler0:LBB0_[0-9]+]]: # %handler0
; Restore SP
; CHECK: movl {{.*}}(%ebp), %esp
; CHECK: calll _puts
diff --git a/test/CodeGen/X86/seh-safe-div.ll b/test/CodeGen/X86/seh-safe-div.ll
index 60918cf07058..d46e235c59ac 100644
--- a/test/CodeGen/X86/seh-safe-div.ll
+++ b/test/CodeGen/X86/seh-safe-div.ll
@@ -67,14 +67,14 @@ __try.cont:
; Landing pad code
-; CHECK: [[handler0:\.LBB0_[0-9]+]]: # %handler0
+; CHECK: [[handler1:\.LBB0_[0-9]+]]: # %handler1
; CHECK: callq puts
-; CHECK: movl $-1, [[rloc]]
+; CHECK: movl $-2, [[rloc]]
; CHECK: jmp [[cont_bb]]
-; CHECK: [[handler1:\.LBB0_[0-9]+]]: # %handler1
+; CHECK: [[handler0:\.LBB0_[0-9]+]]: # %handler0
; CHECK: callq puts
-; CHECK: movl $-2, [[rloc]]
+; CHECK: movl $-1, [[rloc]]
; CHECK: jmp [[cont_bb]]
; CHECK: .seh_handlerdata
diff --git a/test/CodeGen/X86/seh-stack-realign.ll b/test/CodeGen/X86/seh-stack-realign.ll
index 654cad347f6b..1225faebdb83 100644
--- a/test/CodeGen/X86/seh-stack-realign.ll
+++ b/test/CodeGen/X86/seh-stack-realign.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s
; 32-bit catch-all has to use a filter function because that's how it saves the
; exception code.
@@ -57,19 +57,19 @@ entry:
; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%esi)
; CHECK: movl $L__ehtable$main,
; EH state 0
-; CHECK: movl $0, 40(%esi)
+; CHECK: movl $0, 32(%esi)
; CHECK: calll _crash
; CHECK: retl
; CHECK: LBB0_[[lpbb:[0-9]+]]: # %__except
; Restore ESP
; CHECK: movl -24(%ebp), %esp
; Restore ESI
-; CHECK: leal -44(%ebp), %esi
+; CHECK: leal -36(%ebp), %esi
; Restore EBP
-; CHECK: movl 12(%esi), %ebp
+; CHECK: movl 4(%esi), %ebp
; CHECK: movl [[code_offs]](%esi), %[[code:[a-z]+]]
-; CHECK-DAG: movl %[[code]], 4(%esp)
-; CHECK-DAG: movl $_str, (%esp)
+; CHECK: pushl %[[code]]
+; CHECK: pushl $_str
; CHECK: calll _printf
; CHECK: .section .xdata,"dr"
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 91b42bd67767..10658f3fa4ef 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -33,7 +33,7 @@ entry:
define void @pr26232(i64 %a) {
; KNL-32-LABEL: pr26232:
-; KNL-32: # BB#0: # %for_test11.preheader
+; KNL-32: # BB#0: # %for_loop599.preheader
; KNL-32-NEXT: pushl %esi
; KNL-32-NEXT: .Ltmp0:
; KNL-32-NEXT: .cfi_def_cfa_offset 8
@@ -42,7 +42,7 @@ define void @pr26232(i64 %a) {
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL-32-NEXT: movw $-1, %dx
-; KNL-32-NEXT: .align 16, 0x90
+; KNL-32-NEXT: .p2align 4, 0x90
; KNL-32-NEXT: .LBB1_1: # %for_loop599
; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1
; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000
diff --git a/test/CodeGen/X86/setcc-narrowing.ll b/test/CodeGen/X86/setcc-narrowing.ll
index bf5b45031a24..a4259ddd2318 100644
--- a/test/CodeGen/X86/setcc-narrowing.ll
+++ b/test/CodeGen/X86/setcc-narrowing.ll
@@ -6,9 +6,9 @@
define i32 @t1() nounwind ssp {
entry:
; CHECK-LABEL: t1:
-; CHECK: cmpl $0, _t1.global
+; CHECK: xorl %eax, %eax
+; CHECK-NEXT: cmpl $0, _t1.global
; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret
%0 = load i64, i64* @t1.global, align 8
%and = and i64 4294967295, %0
diff --git a/test/CodeGen/X86/setcc.ll b/test/CodeGen/X86/setcc.ll
index b4847c54ffaf..eabcda4e075f 100644
--- a/test/CodeGen/X86/setcc.ll
+++ b/test/CodeGen/X86/setcc.ll
@@ -7,8 +7,8 @@
define zeroext i16 @t1(i16 zeroext %x) nounwind readnone ssp {
entry:
; CHECK-LABEL: t1:
+; CHECK: xorl %eax, %eax
; CHECK: seta %al
-; CHECK: movzbl %al, %eax
; CHECK: shll $5, %eax
%0 = icmp ugt i16 %x, 26 ; <i1> [#uses=1]
%iftmp.1.0 = select i1 %0, i16 32, i16 0 ; <i16> [#uses=1]
@@ -54,3 +54,27 @@ entry:
%add = shl nuw nsw i32 %conv4.2, 16
ret i32 %add
}
+
+define i8 @t5(i32 %a) #0 {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: testl %edi, %edi
+; CHECK: setns %al
+ %.lobit = lshr i32 %a, 31
+ %trunc = trunc i32 %.lobit to i8
+ %.not = xor i8 %trunc, 1
+ ret i8 %.not
+}
+
+define zeroext i1 @t6(i32 %a) #0 {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: testl %edi, %edi
+; CHECK: setns %al
+ %.lobit = lshr i32 %a, 31
+ %trunc = trunc i32 %.lobit to i1
+ %.not = xor i1 %trunc, 1
+ ret i1 %.not
+}
+
+attributes #0 = { "target-cpu"="skylake-avx512" }
diff --git a/test/CodeGen/X86/sext-ret-val.ll b/test/CodeGen/X86/sext-ret-val.ll
index da1a1871e7e8..33de80f02494 100644
--- a/test/CodeGen/X86/sext-ret-val.ll
+++ b/test/CodeGen/X86/sext-ret-val.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep movzbl | count 1
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
; rdar://6699246
define signext i8 @t1(i8* %A) nounwind readnone ssp {
@@ -6,6 +6,11 @@ entry:
%0 = icmp ne i8* %A, null
%1 = zext i1 %0 to i8
ret i8 %1
+
+; CHECK-LABEL: t1:
+; CHECK: cmpl
+; CHECK-NEXT: setne
+; CHECK-NEXT: retl
}
define i8 @t2(i8* %A) nounwind readnone ssp {
@@ -13,4 +18,9 @@ entry:
%0 = icmp ne i8* %A, null
%1 = zext i1 %0 to i8
ret i8 %1
+
+; CHECK-LABEL: t2:
+; CHECK: cmpl
+; CHECK-NEXT: setne
+; CHECK-NEXT: retl
}
diff --git a/test/CodeGen/X86/sext-setcc-self.ll b/test/CodeGen/X86/sext-setcc-self.ll
index 23d66a24724d..e739d21e64e0 100644
--- a/test/CodeGen/X86/sext-setcc-self.ll
+++ b/test/CodeGen/X86/sext-setcc-self.ll
@@ -1,55 +1,68 @@
-; RUN: llc -march=x86-64 -mcpu=nehalem -asm-verbose=false < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
define <4 x i32> @test_ueq(<4 x float> %in) {
-entry:
- ; CHECK: pcmpeqd %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp ueq <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_ueq:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp ueq <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_uge(<4 x float> %in) {
-entry:
- ; CHECK: pcmpeqd %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp uge <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_uge:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp uge <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_ule(<4 x float> %in) {
-entry:
- ; CHECK: pcmpeqd %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp ule <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_ule:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp ule <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_one(<4 x float> %in) {
-entry:
- ; CHECK: xorps %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp one <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_one:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp one <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_ogt(<4 x float> %in) {
-entry:
- ; CHECK: xorps %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp ogt <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_ogt:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp ogt <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_olt(<4 x float> %in) {
-entry:
- ; CHECK: xorps %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp olt <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_olt:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp olt <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
diff --git a/test/CodeGen/X86/sext-trunc.ll b/test/CodeGen/X86/sext-trunc.ll
index 22b3791ba578..5c59bc00860e 100644
--- a/test/CodeGen/X86/sext-trunc.ll
+++ b/test/CodeGen/X86/sext-trunc.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -march=x86 > %t
-; RUN: grep movsbl %t
-; RUN: not grep movz %t
-; RUN: not grep and %t
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
-define signext i8 @foo(i16 signext %x) nounwind {
+define signext i8 @foo(i16 signext %x) nounwind {
%retval56 = trunc i16 %x to i8
ret i8 %retval56
+
+; CHECK-LABEL: foo:
+; CHECK: movb
+; CHECK-NEXT: retl
}
diff --git a/test/CodeGen/X86/shift-pcmp.ll b/test/CodeGen/X86/shift-pcmp.ll
index 365c7310559b..4945d6115dbe 100644
--- a/test/CodeGen/X86/shift-pcmp.ll
+++ b/test/CodeGen/X86/shift-pcmp.ll
@@ -1,18 +1,20 @@
-; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -o - -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -o - -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
define <8 x i16> @foo(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-LABEL: {{^_?foo:}}
-; CHECK-NOT: psll
-entry:
+; SSE-LABEL: foo:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: foo:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%icmp = icmp eq <8 x i16> %a, %b
%zext = zext <8 x i1> %icmp to <8 x i16>
%shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
@@ -21,10 +23,23 @@ entry:
; Don't fail with an assert due to an undef in the buildvector
define <8 x i16> @bar(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: bar
-entry:
+; SSE-LABEL: bar:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
+; SSE-NEXT: psllw $5, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: bar:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%icmp = icmp eq <8 x i16> %a, %b
%zext = zext <8 x i1> %icmp to <8 x i16>
%shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 undef, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
ret <8 x i16> %shl
}
+
diff --git a/test/CodeGen/X86/shrink-wrap-chkstk.ll b/test/CodeGen/X86/shrink-wrap-chkstk.ll
index aecae89aee56..099ef137d8d9 100644
--- a/test/CodeGen/X86/shrink-wrap-chkstk.ll
+++ b/test/CodeGen/X86/shrink-wrap-chkstk.ll
@@ -7,7 +7,7 @@
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i686-pc-windows-msvc18.0.0"
-%struct.S = type { [12 x i8] }
+%struct.S = type { [8192 x i8] }
define x86_thiscallcc void @call_inalloca(i1 %x) {
entry:
@@ -29,7 +29,7 @@ bb2:
; CHECK-LABEL: _call_inalloca: # @call_inalloca
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
-; CHECK: movl $12, %eax
+; CHECK: movl $8192, %eax
; CHECK: calll __chkstk
; CHECK: calll _inalloca_params
; CHECK: movl %ebp, %esp
@@ -64,9 +64,9 @@ false:
; CHECK: cmpl %edx, %eax
; CHECK: jge LBB1_2
; CHECK: pushl %eax
-; CHECK: movl $4100, %eax
+; CHECK: movl $4092, %eax
; CHECK: calll __chkstk
-; CHECK: movl 4100(%esp), %eax
+; CHECK: movl 4092(%esp), %eax
; CHECK: calll _doSomething
; CHECK: LBB1_2:
; CHECK: retl
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
new file mode 100644
index 000000000000..58b4e986f774
--- /dev/null
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -0,0 +1,865 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
+
+@c = external global i32*, align 8
+
+; %val1 = load <2 x i8>
+; %op1 = zext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+ %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+ %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <4 x i8>
+; %op1 = zext<4 x i32> %val1
+; %val2 = load <4 x i8>
+; %op2 = zext<4 x i32> %val2
+; %rst = mul <4 x i32> %op1, %op2
+;
+define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_4xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
+ %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
+ %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
+ %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
+ %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
+ %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+ store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <8 x i8>
+; %op1 = zext<8 x i32> %val1
+; %val2 = load <8 x i8>
+; %op2 = zext<8 x i32> %val2
+; %rst = mul <8 x i32> %op1, %op2
+;
+define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_8xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
+ %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
+ %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
+ %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
+ %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
+ %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+ store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <16 x i8>
+; %op1 = zext<16 x i32> %val1
+; %val2 = load <16 x i8>
+; %op2 = zext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; CHECK-NEXT: movdqa %xmm1, %xmm4
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm3, %xmm4
+; CHECK-NEXT: movdqa %xmm4, %xmm3
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm3, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
+ %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
+ %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
+ %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
+ %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
+ %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+ store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = zext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+ %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+ %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <4 x i16>
+; %op1 = zext<4 x i32> %val1
+; %val2 = load <4 x i16>
+; %op2 = zext<4 x i32> %val2
+; %rst = mul <4 x i32> %op1, %op2
+;
+define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_4xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
+ %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
+ %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
+ %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
+ %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
+ %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+ store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <8 x i16>
+; %op1 = zext<8 x i32> %val1
+; %val2 = load <8 x i16>
+; %op2 = zext<8 x i32> %val2
+; %rst = mul <8 x i32> %op1, %op2
+;
+define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_8xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
+ %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
+ %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
+ %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
+ %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
+ %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+ store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <16 x i16>
+; %op1 = zext<16 x i32> %val1
+; %val2 = load <16 x i16>
+; %op2 = zext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
+; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: pmulhuw %xmm0, %xmm4
+; CHECK-NEXT: pmullw %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-NEXT: movdqa %xmm3, %xmm4
+; CHECK-NEXT: pmulhuw %xmm1, %xmm4
+; CHECK-NEXT: pmullw %xmm1, %xmm3
+; CHECK-NEXT: movdqa %xmm3, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
+ %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
+ %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
+ %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
+ %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
+ %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+ store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i8>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = sext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm1
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+ %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+ %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i8>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8_sext_zext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+ %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+ %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = sext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+ %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+ %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16_sext_zext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: psrlq $32, %xmm3
+; CHECK-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-NEXT: psllq $32, %xmm3
+; CHECK-NEXT: psrlq $32, %xmm1
+; CHECK-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-NEXT: psllq $32, %xmm1
+; CHECK-NEXT: paddq %xmm3, %xmm1
+; CHECK-NEXT: paddq %xmm2, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+ %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+ %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <16 x i16>
+; %op1 = sext<16 x i32> %val1
+; %val2 = load <16 x i16>
+; %op2 = sext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi16_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
+; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: pmulhw %xmm0, %xmm4
+; CHECK-NEXT: pmullw %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-NEXT: movdqa %xmm3, %xmm4
+; CHECK-NEXT: pmulhw %xmm1, %xmm4
+; CHECK-NEXT: pmullw %xmm1, %xmm3
+; CHECK-NEXT: movdqa %xmm3, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
+ %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
+ %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
+ %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
+ %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
+ %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+ store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst5:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst6:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhuw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000
+; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
+; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
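+
+; In C terms, each varconst test above is roughly the following sketch (the
+; function name and types are illustrative only; @c and the argument types
+; come from the declarations earlier in this file):
+;
+;   void mul_2x_varconst(const T *a, long i) {
+;     c[i]     = (int)a[i]     * C0;
+;     c[i + 1] = (int)a[i + 1] * C1;
+;   }
+;
+; vectorized into a single <2 x i32> store. The constant ranges noted in each
+; comment decide whether the widened multiply can be shrunk back to 16-bit
+; pmullw/pmulhw/pmulhuw, or must remain a full 32-bit pmuludq sequence as in
+; the 65536 and 32768 cases above.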
diff --git a/test/CodeGen/X86/sibcall-5.ll b/test/CodeGen/X86/sibcall-5.ll
index aab028bd17c8..4901b4fa069c 100644
--- a/test/CodeGen/X86/sibcall-5.ll
+++ b/test/CodeGen/X86/sibcall-5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin8 -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-apple-darwin9 -mattr=+sse2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse3 | FileCheck %s --check-prefix=X64_BAD
@@ -8,7 +8,7 @@
define double @foo(double %a) nounwind readonly ssp {
entry:
; X32-LABEL: foo:
-; X32: jmp L_sin$stub
+; X32: jmp _sin
; X64-LABEL: foo:
; X64: jmp _sin
@@ -18,7 +18,7 @@ entry:
define float @bar(float %a) nounwind readonly ssp {
; X32-LABEL: bar:
-; X32: jmp L_sinf$stub
+; X32: jmp _sinf
; X64-LABEL: bar:
; X64: jmp _sinf
@@ -27,10 +27,6 @@ entry:
ret float %0
}
-; X32-LABEL: L_sin$stub:
-; X32-NEXT: .indirect_symbol _sin
-; X32-LABEL: L_sinf$stub:
-; X32-NEXT: .indirect_symbol _sinf
declare float @sinf(float) nounwind readonly
diff --git a/test/CodeGen/X86/sibcall-byval.ll b/test/CodeGen/X86/sibcall-byval.ll
index c335f30a93a2..8f5833adf5a3 100644
--- a/test/CodeGen/X86/sibcall-byval.ll
+++ b/test/CodeGen/X86/sibcall-byval.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=32
+; RUN: llc < %s -mtriple=i386-apple-darwin9 | FileCheck %s -check-prefix=32
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=64
%struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
@@ -6,7 +6,7 @@
define i32 @f(%struct.p* byval align 4 %q) nounwind ssp {
entry:
; 32: _f:
-; 32: jmp L_g$stub
+; 32: jmp _g
; 64: _f:
; 64: jmp _g
@@ -19,7 +19,7 @@ declare i32 @g(%struct.p* byval align 4)
define i32 @h(%struct.p* byval align 4 %q, i32 %r) nounwind ssp {
entry:
; 32: _h:
-; 32: jmp L_i$stub
+; 32: jmp _i
; 64: _h:
; 64: jmp _i
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index 9d02bcd9a6c7..f0dff3b806c5 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -1,6 +1,8 @@
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_SINCOS
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_NOOPT
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_NOOPT
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNUX32_SINCOS
; Combine sin / cos into a single call.
; rdar://13087969
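; In source terms the pattern below is simply `return sinf(x) + cosf(x);`
; (and the sin/sinl variants): with -enable-unsafe-fp-math the GNU targets
; fold the two calls into a single sincosf/sincos libcall, while the
; GNU_NOOPT run line keeps the separate calls.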
@@ -13,6 +15,15 @@ entry:
; GNU_SINCOS: movss 4(%rsp), %xmm0
; GNU_SINCOS: addss (%rsp), %xmm0
+; GNUX32_SINCOS-LABEL: test1:
+; GNUX32_SINCOS: callq sincosf
+; GNUX32_SINCOS: movss 4(%esp), %xmm0
+; GNUX32_SINCOS: addss (%esp), %xmm0
+
+; GNU_NOOPT: test1
+; GNU_NOOPT: callq sinf
+; GNU_NOOPT: callq cosf
+
; OSX_SINCOS-LABEL: test1:
; OSX_SINCOS: callq ___sincosf_stret
; OSX_SINCOS: movshdup {{.*}} xmm1 = xmm0[1,1,3,3]
@@ -34,6 +45,15 @@ entry:
; GNU_SINCOS: movsd 16(%rsp), %xmm0
; GNU_SINCOS: addsd 8(%rsp), %xmm0
+; GNUX32_SINCOS-LABEL: test2:
+; GNUX32_SINCOS: callq sincos
+; GNUX32_SINCOS: movsd 16(%esp), %xmm0
+; GNUX32_SINCOS: addsd 8(%esp), %xmm0
+
+; GNU_NOOPT: test2:
+; GNU_NOOPT: callq sin
+; GNU_NOOPT: callq cos
+
; OSX_SINCOS-LABEL: test2:
; OSX_SINCOS: callq ___sincos_stret
; OSX_SINCOS: addsd %xmm1, %xmm0
@@ -53,6 +73,16 @@ entry:
; GNU_SINCOS: callq sinl
; GNU_SINCOS: callq cosl
; GNU_SINCOS: ret
+
+; GNUX32_SINCOS-LABEL: test3:
+; GNUX32_SINCOS: callq sinl
+; GNUX32_SINCOS: callq cosl
+; GNUX32_SINCOS: ret
+
+; GNU_NOOPT: test3:
+; GNU_NOOPT: callq sinl
+; GNU_NOOPT: callq cosl
+
%call = tail call x86_fp80 @sinl(x86_fp80 %x) nounwind
%call1 = tail call x86_fp80 @cosl(x86_fp80 %x) nounwind
%add = fadd x86_fp80 %call, %call1
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
index c2f0411901a7..5436cf248bd5 100644
--- a/test/CodeGen/X86/sink-blockfreq.ll
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -1,5 +1,5 @@
-; RUN: llc -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
-; RUN: llc -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
+; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
+; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
; Test that by changing BlockFrequencyInfo we change the order in which
; machine-sink looks for successor blocks. By not using BFI, both G and B
diff --git a/test/CodeGen/X86/sink-cheap-instructions.ll b/test/CodeGen/X86/sink-cheap-instructions.ll
index 9b9a6865af93..8966ca50142e 100644
--- a/test/CodeGen/X86/sink-cheap-instructions.ll
+++ b/test/CodeGen/X86/sink-cheap-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-linux -sink-insts-to-avoid-spills | FileCheck %s -check-prefix=SINK
; Ensure that we sink copy-like instructions into loops to avoid register
diff --git a/test/CodeGen/X86/sjlj-eh.ll b/test/CodeGen/X86/sjlj-eh.ll
new file mode 100644
index 000000000000..4d2e4e821f42
--- /dev/null
+++ b/test/CodeGen/X86/sjlj-eh.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -filetype asm -o - %s | FileCheck %s
+
+declare void @_Z20function_that_throwsv()
+declare i32 @__gxx_personality_sj0(...)
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+
+define void @_Z8functionv() personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) {
+entry:
+ invoke void @_Z20function_that_throwsv()
+ to label %try.cont unwind label %lpad
+
+lpad:
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1)
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont:
+ ret void
+}
+
+; struct _Unwind_FunctionContext {
+; +00 struct _Unwind_FunctionContext *prev; -64(%ebp)
+; +04 uintptr_t __callsite; -60(%ebp)
+; +08 uintptr_t __buffer[4]; -44(%ebp)
+; +28 __personality_routine __personality; -40(%ebp)
+; +32 uintptr_t __lsda; -36(%ebp)
+; +36 void *__jbuf[]; -32(%ebp)
+; };
+
+
+; CHECK-LABEL: __Z8functionv:
+; struct _Unwind_FunctionContext UFC;
+;
+; UFC.__personality = __gxx_personality_sj0
+; CHECK: movl $___gxx_personality_sj0, -40(%ebp)
+; UFC.__lsda = $LSDA
+; CHECK: movl $[[LSDA:GCC_except_table[0-9]+]], -36(%ebp)
+; UFC.__jbuf[0] = $EBP
+; CHECK: movl %ebp, -32(%ebp)
+; UFC.__jbuf[2] = $ESP
+; CHECK: movl %esp, -24(%ebp)
+; UFC.__jbuf[1] = $EIP
+; CHECK: movl $[[RESUME:LBB[0-9]+_[0-9]+]], -28(%ebp)
+; UFC.__callsite = 1
+; CHECK: movl $1, -60(%ebp)
+; _Unwind_SjLj_Register(&UFC);
+; CHECK: leal -64(%ebp), %eax
+; CHECK: pushl %eax
+; CHECK: calll __Unwind_SjLj_Register
+; CHECK: addl $4, %esp
+; function_that_throws();
+; CHECK: calll __Z20function_that_throwsv
+; _Unwind_SjLj_Unregister(&UFC);
+; CHECK: leal -64(%ebp), %eax
+; CHECK: calll __Unwind_SjLj_Unregister
+;
+; CHECK: [[RESUME]]:
+; CHECK: leal -64(%ebp), %esi
+; assert(UFC.__callsite <= 1);
+; CHECK: movl -60(%ebp), %eax
+; CHECK: cmpl $1, %eax
+; CHECK: jbe [[CONT:LBB[0-9]+_[0-9]+]]
+; CHECK: ud2
+; CHECK: [[CONT]]:
+; *Handlers[--UFC.__callsite]
+; CHECK: subl $1, %eax
+; CHECK: jmpl *LJTI
+
diff --git a/test/CodeGen/X86/slow-unaligned-mem.ll b/test/CodeGen/X86/slow-unaligned-mem.ll
index 27cbef681b7e..41e9a95bcdd8 100644
--- a/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -14,15 +14,15 @@
; Intel chips with fast unaligned memory accesses
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=sandybridge 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=ivybridge 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=haswell 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=broadwell 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=skylake 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=sandybridge 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=ivybridge 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=haswell 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=broadwell 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=skylake-avx512 2>&1 | FileCheck %s --check-prefix=FAST
; AMD chips with slow unaligned memory accesses
diff --git a/test/CodeGen/X86/sqrt-fastmath-mir.ll b/test/CodeGen/X86/sqrt-fastmath-mir.ll
new file mode 100644
index 000000000000..750b4d96e5d0
--- /dev/null
+++ b/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -recip=sqrt:2 -stop-after=expand-isel-pseudos 2>&1 | FileCheck %s
+
+declare float @llvm.sqrt.f32(float) #0
+
+define float @foo(float %f) #0 {
+; CHECK: {{name: *foo}}
+; CHECK: body:
+; CHECK: %0 = COPY %xmm0
+; CHECK: %1 = VRSQRTSSr killed %2, %0
+; CHECK: %3 = VMULSSrr %0, %1
+; CHECK: %4 = VMOVSSrm
+; CHECK: %5 = VFMADDSSr213r %1, killed %3, %4
+; CHECK: %6 = VMOVSSrm
+; CHECK: %7 = VMULSSrr %1, %6
+; CHECK: %8 = VMULSSrr killed %7, killed %5
+; CHECK: %9 = VMULSSrr %0, %8
+; CHECK: %10 = VFMADDSSr213r %8, %9, %4
+; CHECK: %11 = VMULSSrr %9, %6
+; CHECK: %12 = VMULSSrr killed %11, killed %10
+; CHECK: %13 = FsFLD0SS
+; CHECK: %14 = VCMPSSrr %0, killed %13, 0
+; CHECK: %15 = VFsANDNPSrr killed %14, killed %12
+; CHECK: %xmm0 = COPY %15
+; CHECK: RET 0, %xmm0
+ %call = tail call float @llvm.sqrt.f32(float %f) #1
+ ret float %call
+}
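+
+; A note on the sequence checked above (a sketch, not generated output):
+; -recip=sqrt:2 requests the hardware rsqrt estimate plus two Newton-Raphson
+; refinement steps, each computing roughly
+;   e' = e * -0.5 * (x * e * e - 3.0)
+; (the two VMOVSSrm loads are presumably the -3.0 and -0.5 constants). The
+; sqrt itself is then recovered as x * e, and the trailing FsFLD0SS/VCMPSS/
+; VFsANDNPS zero the result for x == 0.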
+
+define float @rfoo(float %f) #0 {
+; CHECK: {{name: *rfoo}}
+; CHECK: body: |
+; CHECK: %0 = COPY %xmm0
+; CHECK: %1 = VRSQRTSSr killed %2, %0
+; CHECK: %3 = VMULSSrr %0, %1
+; CHECK: %4 = VMOVSSrm
+; CHECK: %5 = VFMADDSSr213r %1, killed %3, %4
+; CHECK: %6 = VMOVSSrm
+; CHECK: %7 = VMULSSrr %1, %6
+; CHECK: %8 = VMULSSrr killed %7, killed %5
+; CHECK: %9 = VMULSSrr %0, %8
+; CHECK: %10 = VFMADDSSr213r %8, killed %9, %4
+; CHECK: %11 = VMULSSrr %8, %6
+; CHECK: %12 = VMULSSrr killed %11, killed %10
+; CHECK: %xmm0 = COPY %12
+; CHECK: RET 0, %xmm0
+ %sqrt = tail call float @llvm.sqrt.f32(float %f)
+ %div = fdiv fast float 1.0, %sqrt
+ ret float %div
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 386409a674ef..1c6b13026a72 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -34,12 +34,11 @@ define float @ff(float %f) #0 {
; ESTIMATE-LABEL: ff:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm3
-; ESTIMATE-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm0, %xmm2
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm2, %xmm1
; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm2, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm2, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm2, %xmm1
; ESTIMATE-NEXT: vxorps %xmm2, %xmm2, %xmm2
; ESTIMATE-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
; ESTIMATE-NEXT: vandnps %xmm1, %xmm0, %xmm0
@@ -78,11 +77,11 @@ define float @reciprocal_square_root(float %x) #0 {
; ESTIMATE-LABEL: reciprocal_square_root:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm2
; ESTIMATE-NEXT: vmulss %xmm2, %xmm0, %xmm0
+; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: retq
%sqrt = tail call float @llvm.sqrt.f32(float %x)
%div = fdiv fast float 1.0, %sqrt
@@ -100,11 +99,11 @@ define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
; ESTIMATE-LABEL: reciprocal_square_root_v4f32:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtps %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; ESTIMATE-NEXT: vmulps %xmm1, %xmm1, %xmm2
+; ESTIMATE-NEXT: vmulps %xmm2, %xmm0, %xmm0
; ESTIMATE-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; ESTIMATE-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: retq
%sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
@@ -125,11 +124,11 @@ define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
; ESTIMATE-LABEL: reciprocal_square_root_v8f32:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtps %ymm0, %ymm1
-; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; ESTIMATE-NEXT: vmulps %ymm1, %ymm1, %ymm2
+; ESTIMATE-NEXT: vmulps %ymm2, %ymm0, %ymm0
; ESTIMATE-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; ESTIMATE-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
-; ESTIMATE-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
; ESTIMATE-NEXT: retq
%sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll
index b96ecc575021..1d5a88a1a5ec 100644
--- a/test/CodeGen/X86/sse-intel-ocl.ll
+++ b/test/CodeGen/X86/sse-intel-ocl.ll
@@ -14,7 +14,7 @@ declare <16 x float> @func_float16(<16 x float>, <16 x float>)
; WIN64: ret
; WIN32: testf16_inp
-; WIN32: movl %eax, (%esp)
+; WIN32: pushl %eax
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..2102b4211153
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
+
+define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
+; X64-LABEL: test_mm_cvtsi64_ss:
+; X64: # BB#0:
+; X64-NEXT: cvtsi2ssq %rdi, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cvt = sitofp i64 %a1 to float
+ %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
+define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
+; X64-LABEL: test_mm_cvtss_si64:
+; X64: # BB#0:
+; X64-NEXT: cvtss2si %xmm0, %rax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
+; X64-LABEL: test_mm_cvttss_si64:
+; X64: # BB#0:
+; X64-NEXT: cvttss2si %xmm0, %rax
+; X64-NEXT: retq
+ %cvt = extractelement <4 x float> %a0, i32 0
+ %res = fptosi float %cvt to i64
+ ret i64 %res
+}
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..090ddfdfa93a
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -0,0 +1,2303 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
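+; For example, test_mm_add_ps below presumably corresponds to
+;   __m128 r = _mm_add_ps(a, b);
+; which clang lowers to the plain `fadd <4 x float>` seen in the IR.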
+
+define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_add_ps:
+; X32: # BB#0:
+; X32-NEXT: addps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_ps:
+; X64: # BB#0:
+; X64-NEXT: addps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fadd <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_add_ss:
+; X32: # BB#0:
+; X32-NEXT: addss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_ss:
+; X64: # BB#0:
+; X64-NEXT: addss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fadd = fadd float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fadd, i32 0
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_and_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_and_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: andl %eax, %edx
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: andl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl %r8d, %edi
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl %eax, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %res = and <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
+define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: notl %edx
+; X32-NEXT: notl %ecx
+; X32-NEXT: notl %esi
+; X32-NEXT: notl %eax
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_andnot_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: notl %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: notl %ecx
+; X64-NEXT: andl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: notl %esi
+; X64-NEXT: notl %edx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl %r8d, %edx
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl %edi, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = and <4 x i32> %not, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
+define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpeqps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpeqps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp oeq <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpeqss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpeqss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpleps %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpge_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpleps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ole <4 x float> %a1, %a0
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpless %xmm0, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpge_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpless %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
+ %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpltps %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpltps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp olt <4 x float> %a1, %a0
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpltss %xmm0, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpltss %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
+ %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpleps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmple_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpleps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ole <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpless %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmple_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpless %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpltps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpltps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp olt <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpltss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpltss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpneqps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpneq_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpneqps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp une <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpneqss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpneq_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpneqss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpnleps %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnge_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpnleps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ugt <4 x float> %a1, %a0
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpnless %xmm0, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnge_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpnless %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
+ %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpnltps %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpngt_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpnltps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp uge <4 x float> %a1, %a0
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpnltss %xmm0, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpngt_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpnltss %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
+ %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpnleps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnle_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpnleps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ugt <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpnless %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnle_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpnless %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpnltps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnlt_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpnltps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp uge <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpnltss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnlt_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpnltss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpordps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpord_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpordps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ord <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpordss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpord_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpordss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpunordps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpunord_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpunordps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp uno <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpunordss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpunord_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpunordss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
+ ret <4 x float> %res
+}
+
+define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comieq_ss:
+; X32: # BB#0:
+; X32-NEXT: comiss %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comieq_ss:
+; X64: # BB#0:
+; X64-NEXT: comiss %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comige_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comiss %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comige_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comiss %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comigt_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comiss %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comigt_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comiss %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comile_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comiss %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comile_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comiss %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comilt_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comiss %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comilt_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comiss %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comineq_ss:
+; X32: # BB#0:
+; X32-NEXT: comiss %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comineq_ss:
+; X64: # BB#0:
+; X64-NEXT: comiss %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvt_ss2si:
+; X32: # BB#0:
+; X32-NEXT: cvtss2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvt_ss2si:
+; X64: # BB#0:
+; X64-NEXT: cvtss2si %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
+; X32-LABEL: test_mm_cvtsi32_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cvtsi2ssl %eax, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsi32_ss:
+; X64: # BB#0:
+; X64-NEXT: cvtsi2ssl %edi, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cvt = sitofp i32 %a1 to float
+ %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
+define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtss_f32:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtss_f32:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = extractelement <4 x float> %a0, i32 0
+ ret float %res
+}
+
+define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtss_si32:
+; X32: # BB#0:
+; X32-NEXT: cvtss2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtss_si32:
+; X64: # BB#0:
+; X64-NEXT: cvtss2si %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
+ ret i32 %res
+}
+
+define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttss_si:
+; X32: # BB#0:
+; X32-NEXT: cvttss2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttss_si:
+; X64: # BB#0:
+; X64-NEXT: cvttss2si %xmm0, %eax
+; X64-NEXT: retq
+ %cvt = extractelement <4 x float> %a0, i32 0
+ %res = fptosi float %cvt to i32
+ ret i32 %res
+}
+
+define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttss_si32:
+; X32: # BB#0:
+; X32-NEXT: cvttss2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttss_si32:
+; X64: # BB#0:
+; X64-NEXT: cvttss2si %xmm0, %eax
+; X64-NEXT: retq
+ %cvt = extractelement <4 x float> %a0, i32 0
+ %res = fptosi float %cvt to i32
+ ret i32 %res
+}
+
+define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_div_ps:
+; X32: # BB#0:
+; X32-NEXT: divps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_div_ps:
+; X64: # BB#0:
+; X64-NEXT: divps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fdiv <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_div_ss:
+; X32: # BB#0:
+; X32-NEXT: divss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_div_ss:
+; X64: # BB#0:
+; X64-NEXT: divss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fdiv = fdiv float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fdiv, i32 0
+ ret <4 x float> %res
+}
+
+define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
+; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: andl $8064, %eax # imm = 0x1F80
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $8064, %eax # imm = 0x1F80
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ %4 = and i32 %3, 8064
+ ret i32 %4
+}
+declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone
+
+define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
+; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: andl $63, %eax
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $63, %eax
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ %4 = and i32 %3, 63
+ ret i32 %4
+}
+
+define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
+; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: andl $32768, %eax # imm = 0x8000
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $32768, %eax # imm = 0x8000
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ %4 = and i32 %3, 32768
+ ret i32 %4
+}
+
+define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
+; X32-LABEL: test_MM_GET_ROUNDING_MODE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: andl $24576, %eax # imm = 0x6000
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_GET_ROUNDING_MODE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $24576, %eax # imm = 0x6000
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ %4 = and i32 %3, 24576
+ ret i32 %4
+}
+
+define i32 @test_mm_getcsr() nounwind {
+; X32-LABEL: test_mm_getcsr:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_getcsr:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ ret i32 %3
+}
+
+define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm_load_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %res = load <4 x float>, <4 x float>* %arg0, align 16
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
+; X32-LABEL: test_mm_load_ps1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_ps1:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %ld = load float, float* %a0, align 4
+ %res0 = insertelement <4 x float> undef, float %ld, i32 0
+ %res1 = insertelement <4 x float> %res0, float %ld, i32 1
+ %res2 = insertelement <4 x float> %res1, float %ld, i32 2
+ %res3 = insertelement <4 x float> %res2, float %ld, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
+; X32-LABEL: test_mm_load_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_ss:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: retq
+ %ld = load float, float* %a0, align 1
+ %res0 = insertelement <4 x float> undef, float %ld, i32 0
+ %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
+ %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
+ %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm_load1_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load1_ps:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %ld = load float, float* %a0, align 4
+ %res0 = insertelement <4 x float> undef, float %ld, i32 0
+ %res1 = insertelement <4 x float> %res0, float %ld, i32 1
+ %res2 = insertelement <4 x float> %res1, float %ld, i32 2
+ %res3 = insertelement <4 x float> %res2, float %ld, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
+; X32-LABEL: test_mm_loadh_pi:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadh_pi:
+; X64: # BB#0:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a1 to <2 x float>*
+ %ld = load <2 x float>, <2 x float>* %ptr
+ %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
+; X32-LABEL: test_mm_loadl_pi:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadl_pi:
+; X64: # BB#0:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a1 to <2 x float>*
+ %ld = load <2 x float>, <2 x float>* %ptr
+ %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm_loadr_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadr_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %ld = load <4 x float>, <4 x float>* %arg0, align 16
+ %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm_loadu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadu_ps:
+; X64: # BB#0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %res = load <4 x float>, <4 x float>* %arg0, align 1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_max_ps:
+; X32: # BB#0:
+; X32-NEXT: maxps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_ps:
+; X64: # BB#0:
+; X64-NEXT: maxps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_max_ss:
+; X32: # BB#0:
+; X32-NEXT: maxss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_ss:
+; X64: # BB#0:
+; X64-NEXT: maxss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_min_ps:
+; X32: # BB#0:
+; X32-NEXT: minps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_ps:
+; X64: # BB#0:
+; X64-NEXT: minps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_min_ss:
+; X32: # BB#0:
+; X32-NEXT: minss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_ss:
+; X64: # BB#0:
+; X64-NEXT: minss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_move_ss:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_move_ss:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_movehl_ps:
+; X32: # BB#0:
+; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movehl_ps:
+; X64: # BB#0:
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_movelh_ps:
+; X32: # BB#0:
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movelh_ps:
+; X64: # BB#0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %res
+}
+
+define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_movemask_ps:
+; X32: # BB#0:
+; X32-NEXT: movmskps %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movemask_ps:
+; X64: # BB#0:
+; X64-NEXT: movmskps %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_mul_ps:
+; X32: # BB#0:
+; X32-NEXT: mulps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_ps:
+; X64: # BB#0:
+; X64-NEXT: mulps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fmul <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_mul_ss:
+; X32: # BB#0:
+; X32-NEXT: mulss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_ss:
+; X64: # BB#0:
+; X64-NEXT: mulss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fmul = fmul float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fmul, i32 0
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_or_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_or_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: orl %eax, %edx
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: orl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: orl %r8d, %edi
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: orl %eax, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %res = or <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
+define void @test_mm_prefetch(i8* %a0) {
+; X32-LABEL: test_mm_prefetch:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: prefetchnta (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_prefetch:
+; X64: # BB#0:
+; X64-NEXT: prefetchnta (%rdi)
+; X64-NEXT: retq
+ call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
+ ret void
+}
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone
+
+define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_rcp_ps:
+; X32: # BB#0:
+; X32-NEXT: rcpps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rcp_ps:
+; X64: # BB#0:
+; X64-NEXT: rcpps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
+; X32-LABEL: test_mm_rcp_ss:
+; X32: # BB#0:
+; X32-NEXT: rcpss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rcp_ss:
+; X64: # BB#0:
+; X64-NEXT: rcpss %xmm0, %xmm0
+; X64-NEXT: retq
+ %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
+ %ext0 = extractelement <4 x float> %rcp, i32 0
+ %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+ %ext1 = extractelement <4 x float> %a0, i32 1
+ %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+ %ext2 = extractelement <4 x float> %a0, i32 2
+ %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+ %ext3 = extractelement <4 x float> %a0, i32 3
+ %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+ ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_rsqrt_ps:
+; X32: # BB#0:
+; X32-NEXT: rsqrtps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rsqrt_ps:
+; X64: # BB#0:
+; X64-NEXT: rsqrtps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
+; X32-LABEL: test_mm_rsqrt_ss:
+; X32: # BB#0:
+; X32-NEXT: rsqrtss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rsqrt_ss:
+; X64: # BB#0:
+; X64-NEXT: rsqrtss %xmm0, %xmm0
+; X64-NEXT: retq
+ %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
+ %ext0 = extractelement <4 x float> %rsqrt, i32 0
+ %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+ %ext1 = extractelement <4 x float> %a0, i32 1
+ %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+ %ext2 = extractelement <4 x float> %a0, i32 2
+ %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+ %ext3 = extractelement <4 x float> %a0, i32 3
+ %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+ ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
+; X32-LABEL: test_MM_SET_EXCEPTION_MASK:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: stmxcsr (%ecx)
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: andl $-8065, %edx # imm = 0xE07F
+; X32-NEXT: orl %eax, %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_SET_EXCEPTION_MASK:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $-8065, %ecx # imm = 0xE07F
+; X64-NEXT: orl %edi, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1
+ %4 = and i32 %3, -8065
+ %5 = or i32 %4, %a0
+ store i32 %5, i32* %1
+ call void @llvm.x86.sse.ldmxcsr(i8* %2)
+ ret void
+}
+declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
+
+define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
+; X32-LABEL: test_MM_SET_EXCEPTION_STATE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: stmxcsr (%ecx)
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: andl $-64, %edx
+; X32-NEXT: orl %eax, %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_SET_EXCEPTION_STATE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $-64, %ecx
+; X64-NEXT: orl %edi, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1
+ %4 = and i32 %3, -64
+ %5 = or i32 %4, %a0
+ store i32 %5, i32* %1
+ call void @llvm.x86.sse.ldmxcsr(i8* %2)
+ ret void
+}
+
+define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
+; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: stmxcsr (%ecx)
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF
+; X32-NEXT: orl %eax, %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF
+; X64-NEXT: orl %edi, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1
+ %4 = and i32 %3, -32769
+ %5 = or i32 %4, %a0
+ store i32 %5, i32* %1
+ call void @llvm.x86.sse.ldmxcsr(i8* %2)
+ ret void
+}
+
+define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
+; X32-LABEL: test_mm_set_ps:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_ps:
+; X64: # BB#0:
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: movaps %xmm3, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a3, i32 0
+ %res1 = insertelement <4 x float> %res0, float %a2, i32 1
+ %res2 = insertelement <4 x float> %res1, float %a1, i32 2
+ %res3 = insertelement <4 x float> %res2, float %a0, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
+; X32-LABEL: test_mm_set_ps1:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_ps1:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %a0, i32 1
+ %res2 = insertelement <4 x float> %res1, float %a0, i32 2
+ %res3 = insertelement <4 x float> %res2, float %a0, i32 3
+ ret <4 x float> %res3
+}
+
+define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
+; X32-LABEL: test_MM_SET_ROUNDING_MODE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: stmxcsr (%ecx)
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: andl $-24577, %edx # imm = 0x9FFF
+; X32-NEXT: orl %eax, %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_SET_ROUNDING_MODE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $-24577, %ecx # imm = 0x9FFF
+; X64-NEXT: orl %edi, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1
+ %4 = and i32 %3, -24577
+ %5 = or i32 %4, %a0
+ store i32 %5, i32* %1
+ call void @llvm.x86.sse.ldmxcsr(i8* %2)
+ ret void
+}
+
+define <4 x float> @test_mm_set_ss(float %a0) nounwind {
+; X32-LABEL: test_mm_set_ss:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_ss:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a0, i32 0
+ %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
+ %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
+ %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
+; X32-LABEL: test_mm_set1_ps:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %a0, i32 1
+ %res2 = insertelement <4 x float> %res1, float %a0, i32 2
+ %res3 = insertelement <4 x float> %res2, float %a0, i32 3
+ ret <4 x float> %res3
+}
+
+define void @test_mm_setcsr(i32 %a0) nounwind {
+; X32-LABEL: test_mm_setcsr:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setcsr:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %st = alloca i32, align 4
+ store i32 %a0, i32* %st, align 4
+ %bc = bitcast i32* %st to i8*
+ call void @llvm.x86.sse.ldmxcsr(i8* %bc)
+ ret void
+}
+
+define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
+; X32-LABEL: test_mm_setr_ps:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_ps:
+; X64: # BB#0:
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %a1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %a2, i32 2
+ %res3 = insertelement <4 x float> %res2, float %a3, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_setzero_ps() {
+; X32-LABEL: test_mm_setzero_ps:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setzero_ps:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: retq
+ ret <4 x float> zeroinitializer
+}
+
+define void @test_mm_sfence() nounwind {
+; X32-LABEL: test_mm_sfence:
+; X32: # BB#0:
+; X32-NEXT: sfence
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sfence:
+; X64: # BB#0:
+; X64-NEXT: sfence
+; X64-NEXT: retq
+ call void @llvm.x86.sse.sfence()
+ ret void
+}
+declare void @llvm.x86.sse.sfence() nounwind readnone
+
+define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_sqrt_ps:
+; X32: # BB#0:
+; X32-NEXT: sqrtps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_ps:
+; X64: # BB#0:
+; X64-NEXT: sqrtps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
+; X32-LABEL: test_mm_sqrt_ss:
+; X32: # BB#0:
+; X32-NEXT: sqrtss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_ss:
+; X64: # BB#0:
+; X64-NEXT: sqrtss %xmm0, %xmm0
+; X64-NEXT: retq
+ %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
+ %ext0 = extractelement <4 x float> %sqrt, i32 0
+ %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+ %ext1 = extractelement <4 x float> %a0, i32 1
+ %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+ %ext2 = extractelement <4 x float> %a0, i32 2
+ %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+ %ext3 = extractelement <4 x float> %a0, i32 3
+ %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+ ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ps1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ps1:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ss:
+; X64: # BB#0:
+; X64-NEXT: movss %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ store float %ext, float* %a0, align 1
+ ret void
+}
+
+define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store1_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store1_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_storeh_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, 4(%eax)
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeh_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a0 to i64*
+ %bc = bitcast <4 x float> %a1 to <2 x i64>
+ %ext = extractelement <2 x i64> %bc, i32 1
+ store i64 %ext, i64* %ptr
+ ret void
+}
+
+define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_storel_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl (%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, 4(%eax)
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storel_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a0 to i64*
+ %bc = bitcast <4 x float> %a1 to <2 x i64>
+ %ext = extractelement <2 x i64> %bc, i32 0
+ store i64 %ext, i64* %ptr
+ ret void
+}
+
+define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_storer_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storer_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_storeu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeu_ps:
+; X64: # BB#0:
+; X64-NEXT: movups %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_stream_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_ps:
+; X64: # BB#0:
+; X64-NEXT: movntps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
+ ret void
+}
+
+define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_sub_ps:
+; X32: # BB#0:
+; X32-NEXT: subps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_ps:
+; X64: # BB#0:
+; X64-NEXT: subps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fsub <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_sub_ss:
+; X32: # BB#0:
+; X32-NEXT: subss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_ss:
+; X64: # BB#0:
+; X64-NEXT: subss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fsub = fsub float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fsub, i32 0
+ ret <4 x float> %res
+}
+
+define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
+; X32-LABEL: test_MM_TRANSPOSE4_PS:
+; X32: # BB#0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps (%esi), %xmm0
+; X32-NEXT: movaps (%edx), %xmm1
+; X32-NEXT: movaps (%ecx), %xmm2
+; X32-NEXT: movaps (%eax), %xmm3
+; X32-NEXT: movaps %xmm0, %xmm4
+; X32-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X32-NEXT: movaps %xmm2, %xmm5
+; X32-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X32-NEXT: movaps %xmm4, %xmm1
+; X32-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X32-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; X32-NEXT: movaps %xmm0, %xmm3
+; X32-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X32-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X32-NEXT: movaps %xmm1, (%esi)
+; X32-NEXT: movaps %xmm5, (%edx)
+; X32-NEXT: movaps %xmm3, (%ecx)
+; X32-NEXT: movaps %xmm2, (%eax)
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_TRANSPOSE4_PS:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: movaps (%rsi), %xmm1
+; X64-NEXT: movaps (%rdx), %xmm2
+; X64-NEXT: movaps (%rcx), %xmm3
+; X64-NEXT: movaps %xmm0, %xmm4
+; X64-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X64-NEXT: movaps %xmm2, %xmm5
+; X64-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X64-NEXT: movaps %xmm4, %xmm1
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X64-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; X64-NEXT: movaps %xmm0, %xmm3
+; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X64-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X64-NEXT: movaps %xmm1, (%rdi)
+; X64-NEXT: movaps %xmm5, (%rsi)
+; X64-NEXT: movaps %xmm3, (%rdx)
+; X64-NEXT: movaps %xmm2, (%rcx)
+; X64-NEXT: retq
+ %row0 = load <4 x float>, <4 x float>* %a0, align 16
+ %row1 = load <4 x float>, <4 x float>* %a1, align 16
+ %row2 = load <4 x float>, <4 x float>* %a2, align 16
+ %row3 = load <4 x float>, <4 x float>* %a3, align 16
+ %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ store <4 x float> %res0, <4 x float>* %a0, align 16
+ store <4 x float> %res1, <4 x float>* %a1, align 16
+ store <4 x float> %res2, <4 x float>* %a2, align 16
+ store <4 x float> %res3, <4 x float>* %a3, align 16
+ ret void
+}
+
+define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomieq_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomieq_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomige_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomige_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomigt_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomigt_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomile_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomiss %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomile_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomilt_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomiss %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomilt_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomineq_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomineq_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_undefined_ps() {
+; X32-LABEL: test_mm_undefined_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <4 x float> undef
+}
+
+define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_xor_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_xor_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: xorl %eax, %edx
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: xorl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %r8d, %edi
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %eax, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %res = xor <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
new file mode 100644
index 000000000000..2900c277f124
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
+
+define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
+; SSE-LABEL: test_x86_sse_storeu_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movups %xmm0, (%eax)
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_storeu_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vmovups %xmm0, (%eax)
+; KNL-NEXT: retl
+; CHECK-LABEL: test_x86_sse_storeu_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movups %xmm0, (%eax)
+; CHECK-NEXT: retl
+ call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
+ ret void
+}
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
index 0857189be734..1df432185701 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: addss
+; SSE-LABEL: test_x86_sse_add_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_add_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -10,7 +19,15 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: cmpordps
+; SSE-LABEL: test_x86_sse_cmp_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: cmpordps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cmp_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -18,7 +35,15 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: cmpordss
+; SSE-LABEL: test_x86_sse_cmp_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: cmpordss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cmp_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -26,9 +51,23 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comieq_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comieq_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vcomiss %xmm1, %xmm0
+; KNL-NEXT: setnp %al
+; KNL-NEXT: sete %cl
+; KNL-NEXT: andb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -36,9 +75,19 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: setae
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comige_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comige_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomiss %xmm1, %xmm0
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -46,9 +95,19 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comigt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comigt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomiss %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -56,9 +115,19 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: setbe
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comile_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comile_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomiss %xmm0, %xmm1
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -66,8 +135,19 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: sbb
+; SSE-LABEL: test_x86_sse_comilt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comilt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomiss %xmm0, %xmm1
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -75,9 +155,23 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: setne
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comineq_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comineq_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vcomiss %xmm1, %xmm0
+; KNL-NEXT: setp %al
+; KNL-NEXT: setne %cl
+; KNL-NEXT: orb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -85,8 +179,17 @@ declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0) {
- ; CHECK: movl
- ; CHECK: cvtsi2ss
+; SSE-LABEL: test_x86_sse_cvtsi2ss:
+; SSE: ## BB#0:
+; SSE-NEXT: movl $7, %eax
+; SSE-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cvtsi2ss:
+; KNL: ## BB#0:
+; KNL-NEXT: movl $7, %eax
+; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -94,7 +197,15 @@ declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) {
- ; CHECK: cvtss2si
+; SSE-LABEL: test_x86_sse_cvtss2si:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cvtss2si:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtss2si %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -102,7 +213,15 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) {
- ; CHECK: cvttss2si
+; SSE-LABEL: test_x86_sse_cvttss2si:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttss2si %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cvttss2si:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttss2si %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -110,7 +229,15 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: divss
+; SSE-LABEL: test_x86_sse_div_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_div_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -118,8 +245,17 @@ declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind read
define void @test_x86_sse_ldmxcsr(i8* %a0) {
- ; CHECK: movl
- ; CHECK: ldmxcsr
+; SSE-LABEL: test_x86_sse_ldmxcsr:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: ldmxcsr (%eax)
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ldmxcsr:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vldmxcsr (%eax)
+; KNL-NEXT: retl
call void @llvm.x86.sse.ldmxcsr(i8* %a0)
ret void
}
@@ -128,7 +264,15 @@ declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: maxps
+; SSE-LABEL: test_x86_sse_max_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: maxps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_max_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -136,7 +280,15 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: maxss
+; SSE-LABEL: test_x86_sse_max_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: maxss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_max_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -144,7 +296,15 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: minps
+; SSE-LABEL: test_x86_sse_min_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: minps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_min_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vminps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -152,7 +312,15 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: minss
+; SSE-LABEL: test_x86_sse_min_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: minss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_min_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vminss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -160,7 +328,15 @@ declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) {
- ; CHECK: movmskps
+; SSE-LABEL: test_x86_sse_movmsk_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_movmsk_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovmskps %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -169,7 +345,15 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: mulss
+; SSE-LABEL: test_x86_sse_mul_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_mul_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -177,7 +361,15 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
- ; CHECK: rcpps
+; SSE-LABEL: test_x86_sse_rcp_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: rcpps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_rcp_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vrcpps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -185,7 +377,15 @@ declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) {
- ; CHECK: rcpss
+; SSE-LABEL: test_x86_sse_rcp_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: rcpss %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_rcp_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -193,7 +393,15 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
- ; CHECK: rsqrtps
+; SSE-LABEL: test_x86_sse_rsqrt_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: rsqrtps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_rsqrt_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vrsqrtps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -201,7 +409,15 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
- ; CHECK: rsqrtss
+; SSE-LABEL: test_x86_sse_rsqrt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: rsqrtss %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_rsqrt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -209,7 +425,15 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
- ; CHECK: sqrtps
+; SSE-LABEL: test_x86_sse_sqrt_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_sqrt_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vsqrtps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -217,7 +441,15 @@ declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
- ; CHECK: sqrtss
+; SSE-LABEL: test_x86_sse_sqrt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: sqrtss %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_sqrt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -225,25 +457,33 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @test_x86_sse_stmxcsr(i8* %a0) {
- ; CHECK: movl
- ; CHECK: stmxcsr
+; SSE-LABEL: test_x86_sse_stmxcsr:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: stmxcsr (%eax)
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_stmxcsr:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vstmxcsr (%eax)
+; KNL-NEXT: retl
call void @llvm.x86.sse.stmxcsr(i8* %a0)
ret void
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
-define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
- ; CHECK: movl
- ; CHECK: movups
- call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
- ret void
-}
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
-
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: subss
+; SSE-LABEL: test_x86_sse_sub_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: subss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_sub_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -251,9 +491,23 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomieq_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomieq_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: setnp %al
+; KNL-NEXT: sete %cl
+; KNL-NEXT: andb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -261,9 +515,19 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: setae
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomige_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomige_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -271,9 +535,19 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomigt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomigt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -281,9 +555,19 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: setbe
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomile_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomiss %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomile_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomiss %xmm0, %xmm1
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -291,8 +575,19 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: sbbl
+; SSE-LABEL: test_x86_sse_ucomilt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomiss %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomilt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomiss %xmm0, %xmm1
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -300,9 +595,23 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: setne
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomineq_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomineq_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: setp %al
+; KNL-NEXT: setne %cl
+; KNL-NEXT: orb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index fd35e75d71ae..29c041ba7f6c 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -47,3 +47,18 @@ entry:
 %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00> , <4 x float> zeroinitializer
ret <4 x float> %a14
}
+
+; v4i32 isn't legal for SSE1, but this should be cmpps.
+
+define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
+; CHECK-LABEL: PR28044:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: ret
+;
+ %cmp = fcmp oeq <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..f5ecfa444d86
--- /dev/null
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+
+define i64 @test_mm_cvtsd_si64(<2 x double> %a0) nounwind {
+; X64-LABEL: test_mm_cvtsd_si64:
+; X64: # BB#0:
+; X64-NEXT: cvtsd2si %xmm0, %rax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+
+define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
+; X64-LABEL: test_mm_cvtsi128_si64:
+; X64: # BB#0:
+; X64-NEXT: movd %xmm0, %rax
+; X64-NEXT: retq
+ %res = extractelement <2 x i64> %a0, i32 0
+ ret i64 %res
+}
+
+define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
+; X64-LABEL: test_mm_cvtsi64_sd:
+; X64: # BB#0:
+; X64-NEXT: cvtsi2sdq %rdi, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cvt = sitofp i64 %a1 to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
+; X64-LABEL: test_mm_cvtsi64_si128:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
+ ret <2 x i64> %res1
+}
+
+define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind {
+; X64-LABEL: test_mm_cvttsd_si64:
+; X64: # BB#0:
+; X64-NEXT: cvttsd2si %xmm0, %rax
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a0, i32 0
+ %res = fptosi double %ext to i64
+ ret i64 %res
+}
+
+define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
+; X64-LABEL: test_mm_loadu_si64:
+; X64: # BB#0:
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
+ %ld = load i64, i64* %a0, align 1
+ %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
+ ret <2 x i64> %res1
+}
+
+define void @test_mm_stream_si64(i64 *%a0, i64 %a1) {
+; X64-LABEL: test_mm_stream_si64:
+; X64: # BB#0:
+; X64-NEXT: movntiq %rsi, (%rdi)
+; X64-NEXT: retq
+ store i64 %a1, i64* %a0, align 1, !nontemporal !0
+ ret void
+}
+
+!0 = !{i64 1}
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..fa71325d7d6e
--- /dev/null
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -0,0 +1,3849 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+
+define <2 x i64> @test_mm_add_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi8:
+; X32: # BB#0:
+; X32-NEXT: paddb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_epi8:
+; X64: # BB#0:
+; X64-NEXT: paddb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = add <16 x i8> %arg0, %arg1
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi16:
+; X32: # BB#0:
+; X32-NEXT: paddw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_epi16:
+; X64: # BB#0:
+; X64-NEXT: paddw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = add <8 x i16> %arg0, %arg1
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi32:
+; X32: # BB#0:
+; X32-NEXT: paddd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_epi32:
+; X64: # BB#0:
+; X64-NEXT: paddd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = add <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi64:
+; X32: # BB#0:
+; X32-NEXT: paddq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_epi64:
+; X64: # BB#0:
+; X64-NEXT: paddq %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = add <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_add_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_add_pd:
+; X32: # BB#0:
+; X32-NEXT: addpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_pd:
+; X64: # BB#0:
+; X64-NEXT: addpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fadd <2 x double> %a0, %a1
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_add_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_add_sd:
+; X32: # BB#0:
+; X32-NEXT: addsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_sd:
+; X64: # BB#0:
+; X64-NEXT: addsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 0
+ %fadd = fadd double %ext0, %ext1
+ %res = insertelement <2 x double> %a0, double %fadd, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_adds_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epi8:
+; X32: # BB#0:
+; X32-NEXT: paddsb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_adds_epi8:
+; X64: # BB#0:
+; X64-NEXT: paddsb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epi16:
+; X32: # BB#0:
+; X32-NEXT: paddsw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_adds_epi16:
+; X64: # BB#0:
+; X64-NEXT: paddsw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epu8:
+; X32: # BB#0:
+; X32-NEXT: paddusb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_adds_epu8:
+; X64: # BB#0:
+; X64-NEXT: paddusb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epu16:
+; X32: # BB#0:
+; X32-NEXT: paddusw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_adds_epu16:
+; X64: # BB#0:
+; X64-NEXT: paddusw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_and_pd:
+; X32: # BB#0:
+; X32-NEXT: andps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_and_pd:
+; X64: # BB#0:
+; X64-NEXT: andps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+ %res = and <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x double>
+ ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_and_si128:
+; X32: # BB#0:
+; X32-NEXT: andps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_and_si128:
+; X64: # BB#0:
+; X64-NEXT: andps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = and <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_pd:
+; X32: # BB#0:
+; X32-NEXT: andnps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_andnot_pd:
+; X64: # BB#0:
+; X64-NEXT: andnps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+ %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = and <4 x i32> %not, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x double>
+ ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_si128:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-NEXT: pxor %xmm2, %xmm0
+; X32-NEXT: pand %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_andnot_si128:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqd %xmm2, %xmm2
+; X64-NEXT: pxor %xmm2, %xmm0
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %not = xor <2 x i64> %a0, <i64 -1, i64 -1>
+ %res = and <2 x i64> %not, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_avg_epu8:
+; X32: # BB#0:
+; X32-NEXT: pavgb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_avg_epu8:
+; X64: # BB#0:
+; X64-NEXT: pavgb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
+
+define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_avg_epu16:
+; X32: # BB#0:
+; X32-NEXT: pavgw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_avg_epu16:
+; X64: # BB#0:
+; X64-NEXT: pavgw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_bslli_si128:
+; X32: # BB#0:
+; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_bslli_si128:
+; X64: # BB#0:
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_bsrli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_bsrli_si128:
+; X32: # BB#0:
+; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_bsrli_si128:
+; X64: # BB#0:
+; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x float> @test_mm_castpd_ps(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_castpd_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castpd_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <2 x double> %a0 to <4 x float>
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm_castpd_si128(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_castpd_si128:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castpd_si128:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <2 x double> %a0 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_castps_pd(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_castps_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castps_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x float> %a0 to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_castps_si128(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_castps_si128:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castps_si128:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x float> %a0 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_castsi128_pd(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_castsi128_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castsi128_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <2 x i64> %a0 to <2 x double>
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_castsi128_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_castsi128_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castsi128_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <2 x i64> %a0 to <4 x float>
+ ret <4 x float> %res
+}
+
+define void @test_mm_clflush(i8* %a0) nounwind {
+; X32-LABEL: test_mm_clflush:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: clflush (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_clflush:
+; X64: # BB#0:
+; X64-NEXT: clflush (%rdi)
+; X64-NEXT: retq
+ call void @llvm.x86.sse2.clflush(i8* %a0)
+ ret void
+}
+declare void @llvm.x86.sse2.clflush(i8*) nounwind readnone
+
+define <2 x i64> @test_mm_cmpeq_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi8:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_epi8:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp eq <16 x i8> %arg0, %arg1
+ %res = sext <16 x i1> %cmp to <16 x i8>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpeq_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi16:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_epi16:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp eq <8 x i16> %arg0, %arg1
+ %res = sext <8 x i1> %cmp to <8 x i16>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpeq_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi32:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_epi32:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp eq <4 x i32> %arg0, %arg1
+ %res = sext <4 x i1> %cmp to <4 x i32>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmpeq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpeqpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpeqpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp oeq <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpeq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpeqsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpeqsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mm_cmpge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_pd:
+; X32: # BB#0:
+; X32-NEXT: cmplepd %xmm0, %xmm1
+; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpge_pd:
+; X64: # BB#0:
+; X64-NEXT: cmplepd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ole <2 x double> %a1, %a0
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_sd:
+; X32: # BB#0:
+; X32-NEXT: cmplesd %xmm0, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpge_sd:
+; X64: # BB#0:
+; X64-NEXT: cmplesd %xmm0, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
+ %ext0 = extractelement <2 x double> %cmp, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+
+define <2 x i64> @test_mm_cmpgt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi8:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_epi8:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp sgt <16 x i8> %arg0, %arg1
+ %res = sext <16 x i1> %cmp to <16 x i8>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpgt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi16:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_epi16:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp sgt <8 x i16> %arg0, %arg1
+ %res = sext <8 x i1> %cmp to <8 x i16>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpgt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi32:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_epi32:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp sgt <4 x i32> %arg0, %arg1
+ %res = sext <4 x i1> %cmp to <4 x i32>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmpgt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpltpd %xmm0, %xmm1
+; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpltpd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp olt <2 x double> %a1, %a0
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpltsd %xmm0, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpltsd %xmm0, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
+ %ext0 = extractelement <2 x double> %cmp, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+
+define <2 x double> @test_mm_cmple_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_pd:
+; X32: # BB#0:
+; X32-NEXT: cmplepd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmple_pd:
+; X64: # BB#0:
+; X64-NEXT: cmplepd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ole <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmple_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_sd:
+; X32: # BB#0:
+; X32-NEXT: cmplesd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmple_sd:
+; X64: # BB#0:
+; X64-NEXT: cmplesd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 2)
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cmplt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi8:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtb %xmm0, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_epi8:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp sgt <16 x i8> %arg1, %arg0
+ %res = sext <16 x i1> %cmp to <16 x i8>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmplt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi16:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtw %xmm0, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_epi16:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp sgt <8 x i16> %arg1, %arg0
+ %res = sext <8 x i1> %cmp to <8 x i16>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmplt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi32:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtd %xmm0, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_epi32:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp sgt <4 x i32> %arg1, %arg0
+ %res = sext <4 x i1> %cmp to <4 x i32>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmplt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpltpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpltpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp olt <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmplt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpltsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpltsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 1)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpneq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpneqpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpneq_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpneqpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp une <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpneq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpneqsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpneq_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpneqsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpnlepd %xmm0, %xmm1
+; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnge_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpnlepd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ugt <2 x double> %a1, %a0
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpnlesd %xmm0, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnge_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpnlesd %xmm0, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
+ %ext0 = extractelement <2 x double> %cmp, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+
+define <2 x double> @test_mm_cmpngt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpnltpd %xmm0, %xmm1
+; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpngt_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpnltpd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp uge <2 x double> %a1, %a0
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpnltsd %xmm0, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpngt_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpnltsd %xmm0, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
+ %ext0 = extractelement <2 x double> %cmp, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+
+define <2 x double> @test_mm_cmpnle_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpnlepd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnle_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpnlepd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ugt <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnle_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpnlesd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnle_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpnlesd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 6)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnlt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpnltpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnlt_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpnltpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp uge <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnlt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpnltsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnlt_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpnltsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 5)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpordpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpord_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpordpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ord <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpordsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpord_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpordsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpunord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpunordpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpunord_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpunordpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp uno <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpunord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpunordsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpunord_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpunordsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 3)
+ ret <2 x double> %res
+}
+
+define i32 @test_mm_comieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comieq_sd:
+; X32: # BB#0:
+; X32-NEXT: comisd %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comieq_sd:
+; X64: # BB#0:
+; X64-NEXT: comisd %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comige_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comisd %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comige_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comisd %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comigt_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comisd %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comigt_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comisd %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comile_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comisd %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comile_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comisd %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comilt_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comisd %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comilt_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comisd %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comineq_sd:
+; X32: # BB#0:
+; X32-NEXT: comisd %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comineq_sd:
+; X64: # BB#0:
+; X64-NEXT: comisd %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_cvtepi32_pd(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtepi32_pd:
+; X32: # BB#0:
+; X32-NEXT: cvtdq2pd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi32_pd:
+; X64: # BB#0:
+; X64-NEXT: cvtdq2pd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = shufflevector <4 x i32> %arg0, <4 x i32> %arg0, <2 x i32> <i32 0, i32 1>
+ %res = sitofp <2 x i32> %ext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_cvtepi32_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtepi32_ps:
+; X32: # BB#0:
+; X32-NEXT: cvtdq2ps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi32_ps:
+; X64: # BB#0:
+; X64-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %arg0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_cvtpd_epi32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtpd_epi32:
+; X32: # BB#0:
+; X32-NEXT: cvtpd2dq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtpd_epi32:
+; X64: # BB#0:
+; X64-NEXT: cvtpd2dq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_cvtpd_ps(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtpd_ps:
+; X32: # BB#0:
+; X32-NEXT: cvtpd2ps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtpd_ps:
+; X64: # BB#0:
+; X64-NEXT: cvtpd2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_cvtps_epi32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtps_epi32:
+; X32: # BB#0:
+; X32-NEXT: cvtps2dq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtps_epi32:
+; X64: # BB#0:
+; X64-NEXT: cvtps2dq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_cvtps_pd(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtps_pd:
+; X32: # BB#0:
+; X32-NEXT: cvtps2pd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtps_pd:
+; X64: # BB#0:
+; X64-NEXT: cvtps2pd %xmm0, %xmm0
+; X64-NEXT: retq
+ %ext = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
+ %res = fpext <2 x float> %ext to <2 x double>
+ ret <2 x double> %res
+}
+
+define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsd_f64:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: movlps %xmm0, (%esp)
+; X32-NEXT: fldl (%esp)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_f64:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = extractelement <2 x double> %a0, i32 0
+ ret double %res
+}
+
+define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsd_si32:
+; X32: # BB#0:
+; X32-NEXT: cvtsd2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_si32:
+; X64: # BB#0:
+; X64-NEXT: cvtsd2si %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+
+define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsi128_si32:
+; X32: # BB#0:
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsi128_si32:
+; X64: # BB#0:
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = extractelement <4 x i32> %arg0, i32 0
+ ret i32 %res
+}
+
+define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
+; X32-LABEL: test_mm_cvtsi32_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cvtsi2sdl %eax, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsi32_sd:
+; X64: # BB#0:
+; X64-NEXT: cvtsi2sdl %edi, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cvt = sitofp i32 %a1 to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
+; X32-LABEL: test_mm_cvtsi32_si128:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsi32_si128:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 0, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 0, i32 3
+ %res = bitcast <4 x i32> %res3 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cvtss_sd:
+; X32: # BB#0:
+; X32-NEXT: cvtss2sd %xmm1, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtss_sd:
+; X64: # BB#0:
+; X64-NEXT: cvtss2sd %xmm1, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ %cvt = fpext float %ext to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvttpd_epi32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvttpd_epi32:
+; X32: # BB#0:
+; X32-NEXT: cvttpd2dq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttpd_epi32:
+; X64: # BB#0:
+; X64-NEXT: cvttpd2dq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttps_epi32:
+; X32: # BB#0:
+; X32-NEXT: cvttps2dq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttps_epi32:
+; X64: # BB#0:
+; X64-NEXT: cvttps2dq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = fptosi <4 x float> %a0 to <4 x i32>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvttsd_si32:
+; X32: # BB#0:
+; X32-NEXT: cvttsd2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttsd_si32:
+; X64: # BB#0:
+; X64-NEXT: cvttsd2si %xmm0, %eax
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a0, i32 0
+ %res = fptosi double %ext to i32
+ ret i32 %res
+}
+
+define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_div_pd:
+; X32: # BB#0:
+; X32-NEXT: divpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_div_pd:
+; X64: # BB#0:
+; X64-NEXT: divpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fdiv <2 x double> %a0, %a1
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_div_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_div_sd:
+; X32: # BB#0:
+; X32-NEXT: divsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_div_sd:
+; X64: # BB#0:
+; X64-NEXT: divsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 0
+ %fdiv = fdiv double %ext0, %ext1
+ %res = insertelement <2 x double> %a0, double %fdiv, i32 0
+ ret <2 x double> %res
+}
+
+define i32 @test_mm_extract_epi16(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_extract_epi16:
+; X32: # BB#0:
+; X32-NEXT: pextrw $1, %xmm0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_epi16:
+; X64: # BB#0:
+; X64-NEXT: pextrw $1, %xmm0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext = extractelement <8 x i16> %arg0, i32 1
+ %res = zext i16 %ext to i32
+ ret i32 %res
+}
+
+define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
+; X32-LABEL: test_mm_insert_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: pinsrw $1, %eax, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_epi16:
+; X64: # BB#0:
+; X64-NEXT: pinsrw $1, %edi, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %res = insertelement <8 x i16> %arg0, i16 %a1, i32 1
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define void @test_mm_lfence() nounwind {
+; X32-LABEL: test_mm_lfence:
+; X32: # BB#0:
+; X32-NEXT: lfence
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_lfence:
+; X64: # BB#0:
+; X64-NEXT: lfence
+; X64-NEXT: retq
+ call void @llvm.x86.sse2.lfence()
+ ret void
+}
+declare void @llvm.x86.sse2.lfence() nounwind readnone
+
+define <2 x double> @test_mm_load_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_pd:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %res = load <2 x double>, <2 x double>* %arg0, align 16
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_load_sd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_sd:
+; X64: # BB#0:
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
+ %ld = load double, double* %a0, align 1
+ %res0 = insertelement <2 x double> undef, double %ld, i32 0
+ %res1 = insertelement <2 x double> %res0, double 0.0, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x i64> @test_mm_load_si128(<2 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm_load_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_si128:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: retq
+ %res = load <2 x i64>, <2 x i64>* %a0, align 16
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_load1_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load1_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load1_pd:
+; X64: # BB#0:
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %ld = load double, double* %a0, align 8
+ %res0 = insertelement <2 x double> undef, double %ld, i32 0
+ %res1 = insertelement <2 x double> %res0, double %ld, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_loadh_pd(<2 x double> %a0, double* %a1) nounwind {
+; X32-LABEL: test_mm_loadh_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadh_pd:
+; X64: # BB#0:
+; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-NEXT: retq
+ %ld = load double, double* %a1, align 8
+ %res = insertelement <2 x double> %a0, double %ld, i32 1
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, <2 x i64>* %a1) nounwind {
+; X32-LABEL: test_mm_loadl_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadl_epi64:
+; X64: # BB#0:
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
+ %bc = bitcast <2 x i64>* %a1 to i64*
+ %ld = load i64, i64* %bc, align 1
+ %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
+ ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_loadl_pd(<2 x double> %a0, double* %a1) nounwind {
+; X32-LABEL: test_mm_loadl_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadl_pd:
+; X64: # BB#0:
+; X64-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; X64-NEXT: retq
+ %ld = load double, double* %a1, align 8
+ %res = insertelement <2 x double> %a0, double %ld, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_loadr_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_loadr_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movapd (%eax), %xmm0
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadr_pd:
+; X64: # BB#0:
+; X64-NEXT: movapd (%rdi), %xmm0
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %ld = load <2 x double>, <2 x double>* %arg0, align 16
+ %res = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_loadu_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_loadu_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadu_pd:
+; X64: # BB#0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %res = load <2 x double>, <2 x double>* %arg0, align 1
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_loadu_si128(<2 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm_loadu_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadu_si128:
+; X64: # BB#0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: retq
+ %res = load <2 x i64>, <2 x i64>* %a0, align 1
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_madd_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_madd_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmaddwd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_madd_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmaddwd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+
+define void @test_mm_maskmoveu_si128(<2 x i64> %a0, <2 x i64> %a1, i8* %a2) nounwind {
+; X32-LABEL: test_mm_maskmoveu_si128:
+; X32: # BB#0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: maskmovdqu %xmm1, %xmm0
+; X32-NEXT: popl %edi
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskmoveu_si128:
+; X64: # BB#0:
+; X64-NEXT: maskmovdqu %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %arg0, <16 x i8> %arg1, i8* %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind
+
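+; The integer max/min tests below express the operation as an icmp+select
+; pattern rather than calling the llvm.x86.sse2 min/max intrinsics directly;
+; the backend is still expected to match this to a single pmaxsw/pmaxub/
+; pminsw/pminub instruction, as the CHECK lines show.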
+define <2 x i64> @test_mm_max_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_max_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmaxsw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmaxsw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp sgt <8 x i16> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+ %bc = bitcast <8 x i16> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_max_epu8:
+; X32: # BB#0:
+; X32-NEXT: pmaxub %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epu8:
+; X64: # BB#0:
+; X64-NEXT: pmaxub %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp ugt <16 x i8> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+ %bc = bitcast <16 x i8> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_max_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_max_pd:
+; X32: # BB#0:
+; X32-NEXT: maxpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_pd:
+; X64: # BB#0:
+; X64-NEXT: maxpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_max_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_max_sd:
+; X32: # BB#0:
+; X32-NEXT: maxsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_sd:
+; X64: # BB#0:
+; X64-NEXT: maxsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define void @test_mm_mfence() nounwind {
+; X32-LABEL: test_mm_mfence:
+; X32: # BB#0:
+; X32-NEXT: mfence
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mfence:
+; X64: # BB#0:
+; X64-NEXT: mfence
+; X64-NEXT: retq
+ call void @llvm.x86.sse2.mfence()
+ ret void
+}
+declare void @llvm.x86.sse2.mfence() nounwind readnone
+
+define <2 x i64> @test_mm_min_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_min_epi16:
+; X32: # BB#0:
+; X32-NEXT: pminsw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epi16:
+; X64: # BB#0:
+; X64-NEXT: pminsw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp slt <8 x i16> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+ %bc = bitcast <8 x i16> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_min_epu8:
+; X32: # BB#0:
+; X32-NEXT: pminub %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epu8:
+; X64: # BB#0:
+; X64-NEXT: pminub %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp ult <16 x i8> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+ %bc = bitcast <16 x i8> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_min_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_min_pd:
+; X32: # BB#0:
+; X32-NEXT: minpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_pd:
+; X64: # BB#0:
+; X64-NEXT: minpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_min_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_min_sd:
+; X32: # BB#0:
+; X32-NEXT: minsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_sd:
+; X64: # BB#0:
+; X64-NEXT: minsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_move_epi64(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_move_epi64:
+; X32: # BB#0:
+; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_move_epi64:
+; X64: # BB#0:
+; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_move_sd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_move_sd:
+; X64: # BB#0:
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a1, i32 0
+ %res0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %res1 = insertelement <2 x double> %res0, double %ext1, i32 1
+ ret <2 x double> %res1
+}
+
+define i32 @test_mm_movemask_epi8(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_movemask_epi8:
+; X32: # BB#0:
+; X32-NEXT: pmovmskb %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movemask_epi8:
+; X64: # BB#0:
+; X64-NEXT: pmovmskb %xmm0, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %arg0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
+
+define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_movemask_pd:
+; X32: # BB#0:
+; X32-NEXT: movmskpd %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movemask_pd:
+; X64: # BB#0:
+; X64-NEXT: movmskpd %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mul_epu32:
+; X32: # BB#0:
+; X32-NEXT: pmuludq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_epu32:
+; X64: # BB#0:
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %arg0, <4 x i32> %arg1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_mul_pd:
+; X32: # BB#0:
+; X32-NEXT: mulpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_pd:
+; X64: # BB#0:
+; X64-NEXT: mulpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fmul <2 x double> %a0, %a1
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mul_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_mul_sd:
+; X32: # BB#0:
+; X32-NEXT: mulsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_sd:
+; X64: # BB#0:
+; X64-NEXT: mulsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 0
+ %fmul = fmul double %ext0, %ext1
+ %res = insertelement <2 x double> %a0, double %fmul, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_mulhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mulhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmulhw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mulhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmulhw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_mulhi_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mulhi_epu16:
+; X32: # BB#0:
+; X32-NEXT: pmulhuw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mulhi_epu16:
+; X64: # BB#0:
+; X64-NEXT: pmulhuw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_mullo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mullo_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmullw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mullo_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = mul <8 x i16> %arg0, %arg1
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
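+; Bitwise logic on <2 x double> is expressed by bitcasting to an integer
+; vector, performing the integer op, and bitcasting back; a single orps is
+; still expected for both the pd and si128 variants below.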
+define <2 x double> @test_mm_or_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_or_pd:
+; X32: # BB#0:
+; X32-NEXT: orps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_or_pd:
+; X64: # BB#0:
+; X64-NEXT: orps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+ %res = or <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x double>
+ ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_or_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_or_si128:
+; X32: # BB#0:
+; X32-NEXT: orps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_or_si128:
+; X64: # BB#0:
+; X64-NEXT: orps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = or <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_packs_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packs_epi16:
+; X32: # BB#0:
+; X32-NEXT: packsswb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_packs_epi16:
+; X64: # BB#0:
+; X64-NEXT: packsswb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_packs_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packs_epi32:
+; X32: # BB#0:
+; X32-NEXT: packssdw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_packs_epi32:
+; X64: # BB#0:
+; X64-NEXT: packssdw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_packus_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packus_epi16:
+; X32: # BB#0:
+; X32-NEXT: packuswb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_packus_epi16:
+; X64: # BB#0:
+; X64-NEXT: packuswb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define void @test_mm_pause() nounwind {
+; X32-LABEL: test_mm_pause:
+; X32: # BB#0:
+; X32-NEXT: pause
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_pause:
+; X64: # BB#0:
+; X64-NEXT: pause
+; X64-NEXT: retq
+ call void @llvm.x86.sse2.pause()
+ ret void
+}
+declare void @llvm.x86.sse2.pause() nounwind readnone
+
+define <2 x i64> @test_mm_sad_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sad_epu8:
+; X32: # BB#0:
+; X32-NEXT: psadbw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sad_epu8:
+; X64: # BB#0:
+; X64-NEXT: psadbw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %arg0, <16 x i8> %arg1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
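+; Note the argument order: the _mm_set_* tests below insert their operands in
+; reverse (%a15 becomes element 0), while the _mm_setr_* tests further down
+; insert them in lane order (%a0 becomes element 0).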
+define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
+; X32-LABEL: test_mm_set_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm4
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i8> undef, i8 %a15, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %a14, i32 1
+ %res2 = insertelement <16 x i8> %res1, i8 %a13, i32 2
+ %res3 = insertelement <16 x i8> %res2, i8 %a12, i32 3
+ %res4 = insertelement <16 x i8> %res3, i8 %a11, i32 4
+ %res5 = insertelement <16 x i8> %res4, i8 %a10, i32 5
+ %res6 = insertelement <16 x i8> %res5, i8 %a9 , i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %a8 , i32 7
+ %res8 = insertelement <16 x i8> %res7, i8 %a7 , i32 8
+ %res9 = insertelement <16 x i8> %res8, i8 %a6 , i32 9
+ %res10 = insertelement <16 x i8> %res9, i8 %a5 , i32 10
+ %res11 = insertelement <16 x i8> %res10, i8 %a4 , i32 11
+ %res12 = insertelement <16 x i8> %res11, i8 %a3 , i32 12
+ %res13 = insertelement <16 x i8> %res12, i8 %a2 , i32 13
+ %res14 = insertelement <16 x i8> %res13, i8 %a1 , i32 14
+ %res15 = insertelement <16 x i8> %res14, i8 %a0 , i32 15
+ %res = bitcast <16 x i8> %res15 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
+; X32-LABEL: test_mm_set_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm4
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm5
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm6
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm7
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_epi16:
+; X64: # BB#0:
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %r8d, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: movd %esi, %xmm0
+; X64-NEXT: movd %r9d, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %ecx, %xmm3
+; X64-NEXT: movd %r10d, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i16> undef, i16 %a7, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %a6, i32 1
+ %res2 = insertelement <8 x i16> %res1, i16 %a5, i32 2
+ %res3 = insertelement <8 x i16> %res2, i16 %a4, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 %a3, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 %a2, i32 5
+ %res6 = insertelement <8 x i16> %res5, i16 %a1, i32 6
+ %res7 = insertelement <8 x i16> %res6, i16 %a0, i32 7
+ %res = bitcast <8 x i16> %res7 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_set_epi32:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_epi32:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %edx, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movd %esi, %xmm2
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i32> undef, i32 %a3, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %a2, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 %a1, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 %a0, i32 3
+ %res = bitcast <4 x i32> %res3 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+; TODO test_mm_set_epi64
+
+define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind {
+; X32-LABEL: test_mm_set_epi64x:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_epi64x:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm1
+; X64-NEXT: movd %rsi, %xmm0
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x i64> undef, i64 %a1, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %a0, i32 1
+ ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind {
+; X32-LABEL: test_mm_set_pd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_pd:
+; X64: # BB#0:
+; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <2 x double> undef, double %a1, i32 0
+ %res1 = insertelement <2 x double> %res0, double %a0, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_set_sd(double %a0) nounwind {
+; X32-LABEL: test_mm_set_sd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_sd:
+; X64: # BB#0:
+; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT: retq
+ %res0 = insertelement <2 x double> undef, double %a0, i32 0
+ %res1 = insertelement <2 x double> %res0, double 0.0, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
+; X32-LABEL: test_mm_set1_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i8> undef, i8 %a0, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %a0, i32 1
+ %res2 = insertelement <16 x i8> %res1, i8 %a0, i32 2
+ %res3 = insertelement <16 x i8> %res2, i8 %a0, i32 3
+ %res4 = insertelement <16 x i8> %res3, i8 %a0, i32 4
+ %res5 = insertelement <16 x i8> %res4, i8 %a0, i32 5
+ %res6 = insertelement <16 x i8> %res5, i8 %a0, i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %a0, i32 7
+ %res8 = insertelement <16 x i8> %res7, i8 %a0, i32 8
+ %res9 = insertelement <16 x i8> %res8, i8 %a0, i32 9
+ %res10 = insertelement <16 x i8> %res9, i8 %a0, i32 10
+ %res11 = insertelement <16 x i8> %res10, i8 %a0, i32 11
+ %res12 = insertelement <16 x i8> %res11, i8 %a0, i32 12
+ %res13 = insertelement <16 x i8> %res12, i8 %a0, i32 13
+ %res14 = insertelement <16 x i8> %res13, i8 %a0, i32 14
+ %res15 = insertelement <16 x i8> %res14, i8 %a0, i32 15
+ %res = bitcast <16 x i8> %res15 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
+; X32-LABEL: test_mm_set1_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_epi16:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %a0, i32 1
+ %res2 = insertelement <8 x i16> %res1, i16 %a0, i32 2
+ %res3 = insertelement <8 x i16> %res2, i16 %a0, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 %a0, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 %a0, i32 5
+ %res6 = insertelement <8 x i16> %res5, i16 %a0, i32 6
+ %res7 = insertelement <8 x i16> %res6, i16 %a0, i32 7
+ %res = bitcast <8 x i16> %res7 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind {
+; X32-LABEL: test_mm_set1_epi32:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_epi32:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %a0, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 %a0, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 %a0, i32 3
+ %res = bitcast <4 x i32> %res3 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+; TODO test_mm_set1_epi64
+
+define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
+; X32-LABEL: test_mm_set1_epi64x:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_epi64x:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %a0, i32 1
+ ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_set1_pd(double %a0) nounwind {
+; X32-LABEL: test_mm_set1_pd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_pd:
+; X64: # BB#0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x double> undef, double %a0, i32 0
+ %res1 = insertelement <2 x double> %res0, double %a0, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
+; X32-LABEL: test_mm_setr_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm4
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i8> undef, i8 %a0 , i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %a1 , i32 1
+ %res2 = insertelement <16 x i8> %res1, i8 %a2 , i32 2
+ %res3 = insertelement <16 x i8> %res2, i8 %a3 , i32 3
+ %res4 = insertelement <16 x i8> %res3, i8 %a4 , i32 4
+ %res5 = insertelement <16 x i8> %res4, i8 %a5 , i32 5
+ %res6 = insertelement <16 x i8> %res5, i8 %a6 , i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %a7 , i32 7
+ %res8 = insertelement <16 x i8> %res7, i8 %a8 , i32 8
+ %res9 = insertelement <16 x i8> %res8, i8 %a9 , i32 9
+ %res10 = insertelement <16 x i8> %res9, i8 %a10, i32 10
+ %res11 = insertelement <16 x i8> %res10, i8 %a11, i32 11
+ %res12 = insertelement <16 x i8> %res11, i8 %a12, i32 12
+ %res13 = insertelement <16 x i8> %res12, i8 %a13, i32 13
+ %res14 = insertelement <16 x i8> %res13, i8 %a14, i32 14
+ %res15 = insertelement <16 x i8> %res14, i8 %a15, i32 15
+ %res = bitcast <16 x i8> %res15 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
+; X32-LABEL: test_mm_setr_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm4
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm5
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm6
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm7
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_epi16:
+; X64: # BB#0:
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movd %ecx, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %r9d, %xmm0
+; X64-NEXT: movd %esi, %xmm2
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: movd %r10d, %xmm0
+; X64-NEXT: movd %edx, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %r8d, %xmm3
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %a1, i32 1
+ %res2 = insertelement <8 x i16> %res1, i16 %a2, i32 2
+ %res3 = insertelement <8 x i16> %res2, i16 %a3, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 %a4, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 %a5, i32 5
+ %res6 = insertelement <8 x i16> %res5, i16 %a6, i32 6
+ %res7 = insertelement <8 x i16> %res6, i16 %a7, i32 7
+ %res = bitcast <8 x i16> %res7 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_setr_epi32:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_epi32:
+; X64: # BB#0:
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movd %edx, %xmm2
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %a1, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 %a2, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 %a3, i32 3
+ %res = bitcast <4 x i32> %res3 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+; TODO test_mm_setr_epi64
+
+define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
+; X32-LABEL: test_mm_setr_epi64x:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_epi64x:
+; X64: # BB#0:
+; X64-NEXT: movd %rsi, %xmm1
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %a1, i32 1
+ ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_setr_pd(double %a0, double %a1) nounwind {
+; X32-LABEL: test_mm_setr_pd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_pd:
+; X64: # BB#0:
+; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x double> undef, double %a0, i32 0
+ %res1 = insertelement <2 x double> %res0, double %a1, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_setzero_pd() {
+; X32-LABEL: test_mm_setzero_pd:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setzero_pd:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: retq
+ ret <2 x double> zeroinitializer
+}
+
+define <2 x i64> @test_mm_setzero_si128() {
+; X32-LABEL: test_mm_setzero_si128:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setzero_si128:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: retq
+ ret <2 x i64> zeroinitializer
+}
+
+define <2 x i64> @test_mm_shuffle_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_shufflehi_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_shufflehi_epi16:
+; X32: # BB#0:
+; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shufflehi_epi16:
+; X64: # BB#0:
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_shufflelo_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_shufflelo_epi16:
+; X32: # BB#0:
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shufflelo_epi16:
+; X64: # BB#0:
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sll_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sll_epi16:
+; X32: # BB#0:
+; X32-NEXT: psllw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sll_epi16:
+; X64: # BB#0:
+; X64-NEXT: psllw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_sll_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sll_epi32:
+; X32: # BB#0:
+; X32-NEXT: pslld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sll_epi32:
+; X64: # BB#0:
+; X64-NEXT: pslld %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_sll_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sll_epi64:
+; X32: # BB#0:
+; X32-NEXT: psllq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sll_epi64:
+; X64: # BB#0:
+; X64-NEXT: psllq %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_slli_epi16:
+; X32: # BB#0:
+; X32-NEXT: psllw $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_slli_epi16:
+; X64: # BB#0:
+; X64-NEXT: psllw $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_slli_epi32:
+; X32: # BB#0:
+; X32-NEXT: pslld $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_slli_epi32:
+; X64: # BB#0:
+; X64-NEXT: pslld $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_slli_epi64:
+; X32: # BB#0:
+; X32-NEXT: psllq $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_slli_epi64:
+; X64: # BB#0:
+; X64-NEXT: psllq $1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
+
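+; _mm_slli_si128 is modelled as a shufflevector that prepends five zero bytes
+; from a zero vector; this is expected to lower to a single pslldq by 5.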
+define <2 x i64> @test_mm_slli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_slli_si128:
+; X32: # BB#0:
+; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_slli_si128:
+; X64: # BB#0:
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_sqrt_pd(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_sqrt_pd:
+; X32: # BB#0:
+; X32-NEXT: sqrtpd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_pd:
+; X64: # BB#0:
+; X64-NEXT: sqrtpd %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_sqrt_sd:
+; X32: # BB#0:
+; X32-NEXT: sqrtsd %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_sd:
+; X64: # BB#0:
+; X64-NEXT: sqrtsd %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %call = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
+ %ext0 = extractelement <2 x double> %call, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_sra_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sra_epi16:
+; X32: # BB#0:
+; X32-NEXT: psraw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sra_epi16:
+; X64: # BB#0:
+; X64-NEXT: psraw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_sra_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sra_epi32:
+; X32: # BB#0:
+; X32-NEXT: psrad %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sra_epi32:
+; X64: # BB#0:
+; X64-NEXT: psrad %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_srai_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srai_epi16:
+; X32: # BB#0:
+; X32-NEXT: psraw $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srai_epi16:
+; X64: # BB#0:
+; X64-NEXT: psraw $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %arg0, i32 1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srai_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srai_epi32:
+; X32: # BB#0:
+; X32-NEXT: psrad $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srai_epi32:
+; X64: # BB#0:
+; X64-NEXT: psrad $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %arg0, i32 1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srl_epi16:
+; X32: # BB#0:
+; X32-NEXT: psrlw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srl_epi16:
+; X64: # BB#0:
+; X64-NEXT: psrlw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srl_epi32:
+; X32: # BB#0:
+; X32-NEXT: psrld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srl_epi32:
+; X64: # BB#0:
+; X64-NEXT: psrld %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srl_epi64:
+; X32: # BB#0:
+; X32-NEXT: psrlq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srl_epi64:
+; X64: # BB#0:
+; X64-NEXT: psrlq %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srli_epi16:
+; X32: # BB#0:
+; X32-NEXT: psrlw $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srli_epi16:
+; X64: # BB#0:
+; X64-NEXT: psrlw $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %arg0, i32 1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srli_epi32:
+; X32: # BB#0:
+; X32-NEXT: psrld $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srli_epi32:
+; X64: # BB#0:
+; X64-NEXT: psrld $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %arg0, i32 1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srli_epi64:
+; X32: # BB#0:
+; X32-NEXT: psrlq $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srli_epi64:
+; X64: # BB#0:
+; X64-NEXT: psrlq $1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_srli_si128:
+; X32: # BB#0:
+; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srli_si128:
+; X64: # BB#0:
+; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define void @test_mm_store_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_store_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_pd:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ store <2 x double> %a1, <2 x double>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_pd1(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_store_pd1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_pd1:
+; X64: # BB#0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double * %a0 to <2 x double>*
+ %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
+ store <2 x double> %shuf, <2 x double>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_sd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_store_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_sd:
+; X64: # BB#0:
+; X64-NEXT: movsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a1, i32 0
+ store double %ext, double* %a0, align 1
+ ret void
+}
+
+define void @test_mm_store_si128(<2 x i64> *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_store_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_si128:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ store <2 x i64> %a1, <2 x i64>* %a0, align 16
+ ret void
+}
+
+define void @test_mm_store1_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_store1_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store1_pd:
+; X64: # BB#0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double * %a0 to <2 x double>*
+ %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
+ store <2 x double> %shuf, <2 x double>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_storeh_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeh_sd:
+; X64: # BB#0:
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: movsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a1, i32 1
+ store double %ext, double* %a0, align 8
+ ret void
+}
+
+define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_storel_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movlps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storel_epi64:
+; X64: # BB#0:
+; X64-NEXT: movd %xmm0, %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <2 x i64> %a1, i32 0
+ %bc = bitcast <2 x i64> *%a0 to i64*
+ store i64 %ext, i64* %bc, align 8
+ ret void
+}
+
+define void @test_mm_storel_sd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_storel_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storel_sd:
+; X64: # BB#0:
+; X64-NEXT: movsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a1, i32 0
+ store double %ext, double* %a0, align 8
+ ret void
+}
+
+define void @test_mm_storer_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_storer_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movapd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storer_pd:
+; X64: # BB#0:
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: movapd %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ store <2 x double> %shuf, <2 x double>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeu_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_storeu_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeu_pd:
+; X64: # BB#0:
+; X64-NEXT: movups %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ store <2 x double> %a1, <2 x double>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm_storeu_si128(<2 x i64> *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_storeu_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeu_si128:
+; X64: # BB#0:
+; X64-NEXT: movups %xmm0, (%rdi)
+; X64-NEXT: retq
+ store <2 x i64> %a1, <2 x i64>* %a0, align 1
+ ret void
+}
+
+define void @test_mm_stream_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_stream_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_pd:
+; X64: # BB#0:
+; X64-NEXT: movntps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ store <2 x double> %a1, <2 x double>* %arg0, align 16, !nontemporal !0
+ ret void
+}
+
+define void @test_mm_stream_si32(i32 *%a0, i32 %a1) {
+; X32-LABEL: test_mm_stream_si32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movntil %eax, (%ecx)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_si32:
+; X64: # BB#0:
+; X64-NEXT: movntil %esi, (%rdi)
+; X64-NEXT: retq
+ store i32 %a1, i32* %a0, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_mm_stream_si128(<2 x i64> *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_stream_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_si128:
+; X64: # BB#0:
+; X64-NEXT: movntps %xmm0, (%rdi)
+; X64-NEXT: retq
+ store <2 x i64> %a1, <2 x i64>* %a0, align 16, !nontemporal !0
+ ret void
+}
+
+define <2 x i64> @test_mm_sub_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sub_epi8:
+; X32: # BB#0:
+; X32-NEXT: psubb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_epi8:
+; X64: # BB#0:
+; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = sub <16 x i8> %arg0, %arg1
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sub_epi16:
+; X32: # BB#0:
+; X32-NEXT: psubw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_epi16:
+; X64: # BB#0:
+; X64-NEXT: psubw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = sub <8 x i16> %arg0, %arg1
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sub_epi32:
+; X32: # BB#0:
+; X32-NEXT: psubd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_epi32:
+; X64: # BB#0:
+; X64-NEXT: psubd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = sub <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sub_epi64:
+; X32: # BB#0:
+; X32-NEXT: psubq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_epi64:
+; X64: # BB#0:
+; X64-NEXT: psubq %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = sub <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_sub_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_sub_pd:
+; X32: # BB#0:
+; X32-NEXT: subpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_pd:
+; X64: # BB#0:
+; X64-NEXT: subpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fsub <2 x double> %a0, %a1
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_sub_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_sub_sd:
+; X32: # BB#0:
+; X32-NEXT: subsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_sd:
+; X64: # BB#0:
+; X64-NEXT: subsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 0
+ %fsub = fsub double %ext0, %ext1
+ %res = insertelement <2 x double> %a0, double %fsub, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_subs_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_subs_epi8:
+; X32: # BB#0:
+; X32-NEXT: psubsb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_subs_epi8:
+; X64: # BB#0:
+; X64-NEXT: psubsb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_subs_epi16:
+; X32: # BB#0:
+; X32-NEXT: psubsw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_subs_epi16:
+; X64: # BB#0:
+; X64-NEXT: psubsw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_subs_epu8:
+; X32: # BB#0:
+; X32-NEXT: psubusb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_subs_epu8:
+; X64: # BB#0:
+; X64-NEXT: psubusb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_subs_epu16:
+; X32: # BB#0:
+; X32-NEXT: psubusw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_subs_epu16:
+; X64: # BB#0:
+; X64-NEXT: psubusw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomieq_sd:
+; X32: # BB#0:
+; X32-NEXT: ucomisd %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomieq_sd:
+; X64: # BB#0:
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomige_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomisd %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomige_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomigt_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomisd %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomigt_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomile_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomisd %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomile_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomisd %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomilt_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomisd %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomilt_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomisd %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomineq_sd:
+; X32: # BB#0:
+; X32-NEXT: ucomisd %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomineq_sd:
+; X64: # BB#0:
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_undefined_pd() {
+; X32-LABEL: test_mm_undefined_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <2 x double> undef
+}
+
+define <2 x i64> @test_mm_undefined_si128() {
+; X32-LABEL: test_mm_undefined_si128:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_si128:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <2 x i64> undef
+}
+
+define <2 x i64> @test_mm_unpackhi_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = shufflevector <4 x i32> %arg0,<4 x i32> %arg1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_unpackhi_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_unpacklo_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = shufflevector <4 x i32> %arg0,<4 x i32> %arg1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_unpacklo_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_xor_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_xor_pd:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_xor_pd:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+ %res = xor <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x double>
+ ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_xor_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_xor_si128:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_xor_si128:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = xor <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+!0 = !{i32 1}
+
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index b0412b96bdb2..ae6626bb0dc5 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -1,7 +1,11 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=pentium4 -mattr=sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
- ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+; CHECK-LABEL: test_x86_sse2_psll_dq_bs:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -9,14 +13,20 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
- ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+; CHECK-LABEL: test_x86_sse2_psrl_dq_bs:
+; CHECK: ## BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
- ; CHECK: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-LABEL: test_x86_sse2_psll_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -24,8 +34,166 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
- ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-LABEL: test_x86_sse2_psrl_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvtps2pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: cvtps2pd %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvttps2dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
+define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_x86_sse2_storel_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movlps %xmm0, (%eax)
+; CHECK-NEXT: retl
+ call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
+ ret void
+}
+declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
+
+
+define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
+ ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: paddb LCPI8_0, %xmm0
+; CHECK-NEXT: movdqu %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+
+define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
+ ; fadd operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: addpd %xmm0, %xmm1
+; CHECK-NEXT: movupd %xmm1, (%eax)
+; CHECK-NEXT: retl
+ %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
+ call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
+
+define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) {
+; CHECK-LABEL: test_x86_sse2_pshuf_d:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: retl
+entry:
+ %res = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone
+
+define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_pshufl_w:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NEXT: retl
+entry:
+ %res = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone
+
+define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_pshufh_w:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NEXT: retl
+entry:
+ %res = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone
+
+define <16 x i8> @max_epu8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: max_epu8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxub %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <16 x i8> @min_epu8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: min_epu8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminub %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @max_epi16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: max_epi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxsw %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: min_epi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminsw %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
+
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 53132a167fb8..617e30e4b92c 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: addsd
+; SSE-LABEL: test_x86_sse2_add_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: addsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_add_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -10,7 +19,15 @@ declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: cmpordpd
+; SSE-LABEL: test_x86_sse2_cmp_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: cmpordpd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cmp_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -18,7 +35,15 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: cmpordsd
+; SSE-LABEL: test_x86_sse2_cmp_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: cmpordsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cmp_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -26,9 +51,23 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comieq_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: comisd %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comieq_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcomisd %xmm1, %xmm0
+; KNL-NEXT: setnp %al
+; KNL-NEXT: sete %cl
+; KNL-NEXT: andb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -36,9 +75,19 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: setae
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comige_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comisd %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comige_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomisd %xmm1, %xmm0
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -46,9 +95,19 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comigt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comisd %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comigt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomisd %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -56,9 +115,19 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: setbe
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comile_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comisd %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comile_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomisd %xmm0, %xmm1
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -66,9 +135,19 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: sbbl %eax, %eax
- ; CHECK: andl $1, %eax
+; SSE-LABEL: test_x86_sse2_comilt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comisd %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comilt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomisd %xmm0, %xmm1
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -76,25 +155,39 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: setne
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comineq_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: comisd %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comineq_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcomisd %xmm1, %xmm0
+; KNL-NEXT: setp %al
+; KNL-NEXT: setne %cl
+; KNL-NEXT: orb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
- ; CHECK: cvtdq2pd
- %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
-
-
define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
- ; CHECK: cvtdq2ps
+; SSE-LABEL: test_x86_sse2_cvtdq2ps:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtdq2ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -102,7 +195,15 @@ declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) {
- ; CHECK: cvtpd2dq
+; SSE-LABEL: test_x86_sse2_cvtpd2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtpd2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtpd2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2dq %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -110,7 +211,15 @@ declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) {
- ; CHECK: cvtpd2ps
+; SSE-LABEL: test_x86_sse2_cvtpd2ps:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtpd2ps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtpd2ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2ps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -118,23 +227,31 @@ declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) {
- ; CHECK: cvtps2dq
+; SSE-LABEL: test_x86_sse2_cvtps2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtps2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtps2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtps2dq %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
- ; CHECK: cvtps2pd
- %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
-
-
define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) {
- ; CHECK: cvtsd2si
+; SSE-LABEL: test_x86_sse2_cvtsd2si:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtsd2si %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsd2si:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtsd2si %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -142,25 +259,47 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
- ; CHECK: cvtsd2ss
- ; SSE-NOT: cvtsd2ss %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; SSE-LABEL: test_x86_sse2_cvtsd2ss:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtsd2ss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsd2ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0) {
- ; CHECK: movl
- ; CHECK: cvtsi2sd
- %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
+define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
+; SSE-LABEL: test_x86_sse2_cvtsi2sd:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsi2sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; KNL-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
- ; CHECK: cvtss2sd
+; SSE-LABEL: test_x86_sse2_cvtss2sd:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtss2sd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtss2sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -168,23 +307,31 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
- ; CHECK: cvttpd2dq
+; SSE-LABEL: test_x86_sse2_cvttpd2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvttpd2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
-define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
- ; CHECK: cvttps2dq
- %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-
define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
- ; CHECK: cvttsd2si
+; SSE-LABEL: test_x86_sse2_cvttsd2si:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttsd2si %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvttsd2si:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttsd2si %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -192,7 +339,15 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: divsd
+; SSE-LABEL: test_x86_sse2_div_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_div_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -201,7 +356,15 @@ declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: maxpd
+; SSE-LABEL: test_x86_sse2_max_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: maxpd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_max_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -209,7 +372,15 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: maxsd
+; SSE-LABEL: test_x86_sse2_max_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: maxsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_max_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -217,7 +388,15 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: minpd
+; SSE-LABEL: test_x86_sse2_min_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: minpd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_min_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -225,7 +404,15 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: minsd
+; SSE-LABEL: test_x86_sse2_min_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: minsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_min_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -233,7 +420,15 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
- ; CHECK: movmskpd
+; SSE-LABEL: test_x86_sse2_movmsk_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: movmskpd %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_movmsk_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovmskpd %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -243,8 +438,15 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: test_x86_sse2_mul_sd
- ; CHECK: mulsd
+; SSE-LABEL: test_x86_sse2_mul_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: mulsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_mul_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -252,7 +454,15 @@ declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind
define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: packssdw
+; SSE-LABEL: test_x86_sse2_packssdw_128:
+; SSE: ## BB#0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_packssdw_128:
+; KNL: ## BB#0:
+; KNL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -260,7 +470,15 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind rea
define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: packsswb
+; SSE-LABEL: test_x86_sse2_packsswb_128:
+; SSE: ## BB#0:
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_packsswb_128:
+; KNL: ## BB#0:
+; KNL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -268,7 +486,15 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: packuswb
+; SSE-LABEL: test_x86_sse2_packuswb_128:
+; SSE: ## BB#0:
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_packuswb_128:
+; KNL: ## BB#0:
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -276,7 +502,15 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: paddsb
+; SSE-LABEL: test_x86_sse2_padds_b:
+; SSE: ## BB#0:
+; SSE-NEXT: paddsb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_padds_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -284,7 +518,15 @@ declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: paddsw
+; SSE-LABEL: test_x86_sse2_padds_w:
+; SSE: ## BB#0:
+; SSE-NEXT: paddsw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_padds_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -292,7 +534,15 @@ declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: paddusb
+; SSE-LABEL: test_x86_sse2_paddus_b:
+; SSE: ## BB#0:
+; SSE-NEXT: paddusb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_paddus_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -300,7 +550,15 @@ declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: paddusw
+; SSE-LABEL: test_x86_sse2_paddus_w:
+; SSE: ## BB#0:
+; SSE-NEXT: paddusw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_paddus_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -308,7 +566,15 @@ declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pavgb
+; SSE-LABEL: test_x86_sse2_pavg_b:
+; SSE: ## BB#0:
+; SSE-NEXT: pavgb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pavg_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -316,7 +582,15 @@ declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pavgw
+; SSE-LABEL: test_x86_sse2_pavg_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pavgw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pavg_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -324,7 +598,15 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmaddwd
+; SSE-LABEL: test_x86_sse2_pmadd_wd:
+; SSE: ## BB#0:
+; SSE-NEXT: pmaddwd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmadd_wd:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -332,7 +614,15 @@ declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_x86_sse2_pmaxs_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmaxsw
+; SSE-LABEL: test_x86_sse2_pmaxs_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pmaxsw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmaxs_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -340,7 +630,15 @@ declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pmaxu_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pmaxub
+; SSE-LABEL: test_x86_sse2_pmaxu_b:
+; SSE: ## BB#0:
+; SSE-NEXT: pmaxub %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmaxu_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -348,7 +646,15 @@ declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmins_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pminsw
+; SSE-LABEL: test_x86_sse2_pmins_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pminsw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmins_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -356,7 +662,15 @@ declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pminu_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pminub
+; SSE-LABEL: test_x86_sse2_pminu_b:
+; SSE: ## BB#0:
+; SSE-NEXT: pminub %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pminu_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -364,7 +678,15 @@ declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
- ; CHECK: pmovmskb
+; SSE-LABEL: test_x86_sse2_pmovmskb_128:
+; SSE: ## BB#0:
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmovmskb_128:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovmskb %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -372,7 +694,15 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmulhw
+; SSE-LABEL: test_x86_sse2_pmulh_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pmulhw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmulh_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -380,7 +710,15 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmulhuw
+; SSE-LABEL: test_x86_sse2_pmulhu_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pmulhuw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmulhu_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -388,7 +726,15 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pmuludq
+; SSE-LABEL: test_x86_sse2_pmulu_dq:
+; SSE: ## BB#0:
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmulu_dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -396,7 +742,15 @@ declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: psadbw
+; SSE-LABEL: test_x86_sse2_psad_bw:
+; SSE: ## BB#0:
+; SSE-NEXT: psadbw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psad_bw:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -404,7 +758,15 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pslld
+; SSE-LABEL: test_x86_sse2_psll_d:
+; SSE: ## BB#0:
+; SSE-NEXT: pslld %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psll_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -412,7 +774,15 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: psllq
+; SSE-LABEL: test_x86_sse2_psll_q:
+; SSE: ## BB#0:
+; SSE-NEXT: psllq %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psll_q:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -420,7 +790,15 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psllw
+; SSE-LABEL: test_x86_sse2_psll_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psllw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psll_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -428,7 +806,15 @@ declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) {
- ; CHECK: pslld
+; SSE-LABEL: test_x86_sse2_pslli_d:
+; SSE: ## BB#0:
+; SSE-NEXT: pslld $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pslli_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -436,7 +822,15 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) {
- ; CHECK: psllq
+; SSE-LABEL: test_x86_sse2_pslli_q:
+; SSE: ## BB#0:
+; SSE-NEXT: psllq $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pslli_q:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -444,7 +838,15 @@ declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) {
- ; CHECK: psllw
+; SSE-LABEL: test_x86_sse2_pslli_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pslli_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllw $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -452,7 +854,15 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: psrad
+; SSE-LABEL: test_x86_sse2_psra_d:
+; SSE: ## BB#0:
+; SSE-NEXT: psrad %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psra_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -460,7 +870,15 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psraw
+; SSE-LABEL: test_x86_sse2_psra_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psraw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psra_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -468,7 +886,15 @@ declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) {
- ; CHECK: psrad
+; SSE-LABEL: test_x86_sse2_psrai_d:
+; SSE: ## BB#0:
+; SSE-NEXT: psrad $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrai_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrad $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -476,7 +902,15 @@ declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) {
- ; CHECK: psraw
+; SSE-LABEL: test_x86_sse2_psrai_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psraw $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrai_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsraw $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -484,7 +918,15 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: psrld
+; SSE-LABEL: test_x86_sse2_psrl_d:
+; SSE: ## BB#0:
+; SSE-NEXT: psrld %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrl_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -492,7 +934,15 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: psrlq
+; SSE-LABEL: test_x86_sse2_psrl_q:
+; SSE: ## BB#0:
+; SSE-NEXT: psrlq %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrl_q:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -500,7 +950,15 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psrlw
+; SSE-LABEL: test_x86_sse2_psrl_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psrlw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrl_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -508,7 +966,15 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
- ; CHECK: psrld
+; SSE-LABEL: test_x86_sse2_psrli_d:
+; SSE: ## BB#0:
+; SSE-NEXT: psrld $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrli_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrld $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -516,7 +982,15 @@ declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) {
- ; CHECK: psrlq
+; SSE-LABEL: test_x86_sse2_psrli_q:
+; SSE: ## BB#0:
+; SSE-NEXT: psrlq $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrli_q:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrlq $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -524,7 +998,15 @@ declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
- ; CHECK: psrlw
+; SSE-LABEL: test_x86_sse2_psrli_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psrlw $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrli_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrlw $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -532,7 +1014,15 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: psubsb
+; SSE-LABEL: test_x86_sse2_psubs_b:
+; SSE: ## BB#0:
+; SSE-NEXT: psubsb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psubs_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -540,7 +1030,15 @@ declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psubsw
+; SSE-LABEL: test_x86_sse2_psubs_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psubsw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psubs_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -548,7 +1046,15 @@ declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: psubusb
+; SSE-LABEL: test_x86_sse2_psubus_b:
+; SSE: ## BB#0:
+; SSE-NEXT: psubusb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psubus_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -556,7 +1062,15 @@ declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psubusw
+; SSE-LABEL: test_x86_sse2_psubus_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psubusw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psubus_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -564,7 +1078,15 @@ declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
- ; CHECK: sqrtpd
+; SSE-LABEL: test_x86_sse2_sqrt_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: sqrtpd %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_sqrt_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vsqrtpd %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -572,50 +1094,31 @@ declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
- ; CHECK: sqrtsd
+; SSE-LABEL: test_x86_sse2_sqrt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: sqrtsd %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_sqrt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
- ; CHECK: test_x86_sse2_storel_dq
- ; CHECK: movl
- ; CHECK: movlps
- call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
- ret void
-}
-declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
-
-
-define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
- ; CHECK: test_x86_sse2_storeu_dq
- ; CHECK: movl
- ; CHECK: movdqu
- ; add operation forces the execution domain.
- %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
- ret void
-}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
-
-
-define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
- ; CHECK: test_x86_sse2_storeu_pd
- ; CHECK: movl
- ; CHECK: movupd
- ; fadd operation forces the execution domain.
- %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
- call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
- ret void
-}
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
-
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: test_x86_sse2_sub_sd
- ; CHECK: subsd
+; SSE-LABEL: test_x86_sse2_sub_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: subsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_sub_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -623,9 +1126,23 @@ declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomieq_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomieq_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vucomisd %xmm1, %xmm0
+; KNL-NEXT: setnp %al
+; KNL-NEXT: sete %cl
+; KNL-NEXT: andb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -633,9 +1150,19 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: setae
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomige_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomige_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomisd %xmm1, %xmm0
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -643,9 +1170,19 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomigt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomigt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomisd %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -653,9 +1190,19 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: setbe
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomile_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomisd %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomile_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomisd %xmm0, %xmm1
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -663,8 +1210,19 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: sbbl
+; SSE-LABEL: test_x86_sse2_ucomilt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomisd %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomilt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomisd %xmm0, %xmm1
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -672,44 +1230,39 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: setne
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomineq_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomineq_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vucomisd %xmm1, %xmm0
+; KNL-NEXT: setp %al
+; KNL-NEXT: setne %cl
+; KNL-NEXT: orb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
define void @test_x86_sse2_pause() {
- ; CHECK: pause
+; SSE-LABEL: test_x86_sse2_pause:
+; SSE: ## BB#0:
+; SSE-NEXT: pause
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pause:
+; KNL: ## BB#0:
+; KNL-NEXT: pause
+; KNL-NEXT: retl
tail call void @llvm.x86.sse2.pause()
- ret void
+ ret void
}
declare void @llvm.x86.sse2.pause() nounwind
-
-define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) {
-; CHECK-LABEL: test_x86_sse2_pshuf_d:
-; CHECK: pshufd $27
-entry:
- %res = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone
-
-define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) {
-; CHECK-LABEL: test_x86_sse2_pshufl_w:
-; CHECK: pshuflw $27
-entry:
- %res = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone
-
-define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) {
-; CHECK-LABEL: test_x86_sse2_pshufh_w:
-; CHECK: pshufhw $27
-entry:
- %res = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index ed84905b1907..85e57e0dbdd1 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Tests for SSE2 and below, without SSE3+.
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+sse2 -O3 | FileCheck %s
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-LABEL: test1:
@@ -8,7 +8,7 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movapd (%ecx), %xmm0
-; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: retl
%tmp3 = load <2 x double>, <2 x double>* %A, align 16
@@ -24,7 +24,7 @@ define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movapd (%ecx), %xmm0
-; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: retl
%tmp3 = load <2 x double>, <2 x double>* %A, align 16
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
index 79317e4576b9..3f47d987aeda 100644
--- a/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -388,7 +388,7 @@ define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 1
diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll
index c4da546ed77e..17586a811f40 100644
--- a/test/CodeGen/X86/sse3-avx-addsub.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub.ll
@@ -121,8 +121,8 @@ define <16 x float> @test5(<16 x float> %A, <16 x float> %B) {
; AVX512: # BB#0:
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31]
-; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm2[1,3],zmm0[4,6],zmm2[5,7],zmm0[8,10],zmm2[9,11],zmm0[12,14],zmm2[13,15]
+; AVX512-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; AVX512-NEXT: retq
%add = fadd <16 x float> %A, %B
%sub = fsub <16 x float> %A, %B
@@ -149,8 +149,7 @@ define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {
; AVX512: # BB#0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vsubpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,9,2,11,4,13,6,15]
-; AVX512-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm2[1],zmm0[2],zmm2[3],zmm0[4],zmm2[5],zmm0[6],zmm2[7]
; AVX512-NEXT: retq
%add = fadd <8 x double> %A, %B
%sub = fsub <8 x double> %A, %B
diff --git a/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
index 217be9aeae3a..0111de2f5211 100644
--- a/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse3-builtins.c
@@ -94,7 +94,7 @@ define <4 x float> @test_mm_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
-define <2 x i64> @test_mm_lddqu_si128(i8* %a0) {
+define <2 x i64> @test_mm_lddqu_si128(<2 x i64>* %a0) {
; X32-LABEL: test_mm_lddqu_si128:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -105,7 +105,8 @@ define <2 x i64> @test_mm_lddqu_si128(i8* %a0) {
; X64: # BB#0:
; X64-NEXT: lddqu (%rdi), %xmm0
; X64-NEXT: retq
- %call = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0)
+ %bc = bitcast <2 x i64>* %a0 to i8*
+ %call = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %bc)
%res = bitcast <16 x i8> %call to <2 x i64>
ret <2 x i64> %res
}
@@ -115,12 +116,12 @@ define <2 x double> @test_mm_loaddup_pd(double* %a0) {
; X32-LABEL: test_mm_loaddup_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movddup (%eax), %xmm0
+; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loaddup_pd:
; X64: # BB#0:
-; X64-NEXT: movddup (%rdi), %xmm0
+; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
%ld = load double, double* %a0
%res0 = insertelement <2 x double> undef, double %ld, i32 0
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 2c24478706e6..6d51fb54f8b8 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -140,7 +140,7 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
; X64-LABEL: t9:
; X64: ## BB#0:
; X64-NEXT: movapd (%rdi), %xmm0
-; X64-NEXT: movhpd (%rsi), %xmm0
+; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-NEXT: movapd %xmm0, (%rdi)
; X64-NEXT: retq
%tmp = load <4 x float>, <4 x float>* %r
@@ -207,7 +207,7 @@ define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64: ## BB#0: ## %entry
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
@@ -220,7 +220,7 @@ define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64: ## BB#0: ## %entry
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
diff --git a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..16868d854df7
--- /dev/null
+++ b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -0,0 +1,1008 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse41-builtins.c
+
+define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_blend_epi16:
+; X32: # BB#0:
+; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blend_epi16:
+; X64: # BB#0:
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %shuf = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
+ %res = bitcast <8 x i16> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_blend_pd:
+; X32: # BB#0:
+; X32-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blend_pd:
+; X64: # BB#0:
+; X64-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_blend_ps:
+; X32: # BB#0:
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blend_ps:
+; X64: # BB#0:
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_blendv_epi8:
+; X32: # BB#0:
+; X32-NEXT: movdqa %xmm0, %xmm3
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: pblendvb %xmm1, %xmm3
+; X32-NEXT: movdqa %xmm3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blendv_epi8:
+; X64: # BB#0:
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: pblendvb %xmm1, %xmm3
+; X64-NEXT: movdqa %xmm3, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %call = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
+ %res = bitcast <16 x i8> %call to <2 x i64>
+ ret <2 x i64> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm_blendv_pd:
+; X32: # BB#0:
+; X32-NEXT: movapd %xmm0, %xmm3
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: blendvpd %xmm1, %xmm3
+; X32-NEXT: movapd %xmm3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blendv_pd:
+; X64: # BB#0:
+; X64-NEXT: movapd %xmm0, %xmm3
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: blendvpd %xmm1, %xmm3
+; X64-NEXT: movapd %xmm3, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_blendv_ps:
+; X32: # BB#0:
+; X32-NEXT: movaps %xmm0, %xmm3
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: blendvps %xmm1, %xmm3
+; X32-NEXT: movaps %xmm3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blendv_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, %xmm3
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: blendvps %xmm1, %xmm3
+; X64-NEXT: movaps %xmm3, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_ceil_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_ceil_pd:
+; X32: # BB#0:
+; X32-NEXT: roundpd $2, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ceil_pd:
+; X64: # BB#0:
+; X64-NEXT: roundpd $2, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+define <4 x float> @test_mm_ceil_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_ceil_ps:
+; X32: # BB#0:
+; X32-NEXT: roundps $2, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ceil_ps:
+; X64: # BB#0:
+; X64-NEXT: roundps $2, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <2 x double> @test_mm_ceil_sd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_ceil_sd:
+; X32: # BB#0:
+; X32-NEXT: roundsd $2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ceil_sd:
+; X64: # BB#0:
+; X64-NEXT: roundsd $2, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define <4 x float> @test_mm_ceil_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_ceil_ss:
+; X32: # BB#0:
+; X32-NEXT: roundss $2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ceil_ss:
+; X64: # BB#0:
+; X64-NEXT: roundss $2, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpeq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_epi64:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqq %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = icmp eq <2 x i64> %a0, %a1
+ %res = sext <2 x i1> %cmp to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi8_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmovsxbw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi8_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmovsxbw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %sext = sext <8 x i8> %ext0 to <8 x i16>
+ %res = bitcast <8 x i16> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi8_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmovsxbd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi8_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmovsxbd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %sext = sext <4 x i8> %ext0 to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi8_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovsxbq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi8_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovsxbq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %sext = sext <2 x i8> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi16_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmovsxwd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi16_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmovsxwd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %sext = sext <4 x i16> %ext0 to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi16_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovsxwq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi16_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovsxwq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %sext = sext <2 x i16> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi32_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovsxdq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi32_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovsxdq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %sext = sext <2 x i32> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu8_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu8_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %sext = zext <8 x i8> %ext0 to <8 x i16>
+ %res = bitcast <8 x i16> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu8_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu8_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %sext = zext <4 x i8> %ext0 to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu8_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu8_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %sext = zext <2 x i8> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu16_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu16_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %sext = zext <4 x i16> %ext0 to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu16_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu16_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %sext = zext <2 x i16> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu32_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu32_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %sext = zext <2 x i32> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_dp_pd:
+; X32: # BB#0:
+; X32-NEXT: dppd $7, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_dp_pd:
+; X64: # BB#0:
+; X64-NEXT: dppd $7, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @test_mm_dp_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_dp_ps:
+; X32: # BB#0:
+; X32-NEXT: dpps $7, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_dp_ps:
+; X64: # BB#0:
+; X64-NEXT: dpps $7, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_extract_epi8:
+; X32: # BB#0:
+; X32-NEXT: pextrb $1, %xmm0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_epi8:
+; X64: # BB#0:
+; X64-NEXT: pextrb $1, %xmm0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext = extractelement <16 x i8> %arg0, i32 1
+ %res = zext i8 %ext to i32
+ ret i32 %res
+}
+
+define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_extract_epi32:
+; X32: # BB#0:
+; X32-NEXT: pextrd $1, %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_epi32:
+; X64: # BB#0:
+; X64-NEXT: pextrd $1, %xmm0, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = extractelement <4 x i32> %arg0, i32 1
+ ret i32 %ext
+}
+
+define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_extract_epi64:
+; X32: # BB#0:
+; X32-NEXT: pextrd $2, %xmm0, %eax
+; X32-NEXT: pextrd $3, %xmm0, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_epi64:
+; X64: # BB#0:
+; X64-NEXT: pextrq $1, %xmm0, %rax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = extractelement <2 x i64> %a0, i32 1
+ ret i64 %ext
+}
+
+; TODO test_mm_extract_ps
+
+define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_floor_pd:
+; X32: # BB#0:
+; X32-NEXT: roundpd $1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_floor_pd:
+; X64: # BB#0:
+; X64-NEXT: roundpd $1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 1)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_floor_ps:
+; X32: # BB#0:
+; X32-NEXT: roundps $1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_floor_ps:
+; X64: # BB#0:
+; X64-NEXT: roundps $1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 1)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_floor_sd:
+; X32: # BB#0:
+; X32-NEXT: roundsd $1, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_floor_sd:
+; X64: # BB#0:
+; X64-NEXT: roundsd $1, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 1)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_floor_ss:
+; X32: # BB#0:
+; X32-NEXT: roundss $1, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_floor_ss:
+; X64: # BB#0:
+; X64-NEXT: roundss $1, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 1)
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
+; X32-LABEL: test_mm_insert_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pinsrb $1, %eax, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: pinsrb $1, %eax, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = insertelement <16 x i8> %arg0, i8 %a1,i32 1
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
+; X32-LABEL: test_mm_insert_epi32:
+; X32: # BB#0:
+; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_epi32:
+; X64: # BB#0:
+; X64-NEXT: pinsrd $1, %edi, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = insertelement <4 x i32> %arg0, i32 %a1,i32 1
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
+; X32-LABEL: test_mm_insert_epi64:
+; X32: # BB#0:
+; X32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_epi64:
+; X64: # BB#0:
+; X64-NEXT: pinsrq $1, %rdi, %xmm0
+; X64-NEXT: retq
+ %res = insertelement <2 x i64> %a0, i64 %a1,i32 1
+ ret <2 x i64> %res
+}
+
+define <4 x float> @test_mm_insert_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_insert_ps:
+; X32: # BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_ps:
+; X64: # BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 4)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_max_epi8:
+; X32: # BB#0:
+; X32-NEXT: pmaxsb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epi8:
+; X64: # BB#0:
+; X64-NEXT: pmaxsb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp sgt <16 x i8> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+ %bc = bitcast <16 x i8> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_max_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmaxsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmaxsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp sgt <4 x i32> %arg0, %arg1
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
+ %bc = bitcast <4 x i32> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_max_epu16:
+; X32: # BB#0:
+; X32-NEXT: pmaxuw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epu16:
+; X64: # BB#0:
+; X64-NEXT: pmaxuw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp ugt <8 x i16> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+ %bc = bitcast <8 x i16> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_max_epu32:
+; X32: # BB#0:
+; X32-NEXT: pmaxud %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epu32:
+; X64: # BB#0:
+; X64-NEXT: pmaxud %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp ugt <4 x i32> %arg0, %arg1
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
+ %bc = bitcast <4 x i32> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_min_epi8:
+; X32: # BB#0:
+; X32-NEXT: pminsb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epi8:
+; X64: # BB#0:
+; X64-NEXT: pminsb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp slt <16 x i8> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+ %bc = bitcast <16 x i8> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_min_epi32:
+; X32: # BB#0:
+; X32-NEXT: pminsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epi32:
+; X64: # BB#0:
+; X64-NEXT: pminsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp slt <4 x i32> %arg0, %arg1
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
+ %bc = bitcast <4 x i32> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_min_epu16:
+; X32: # BB#0:
+; X32-NEXT: pminuw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epu16:
+; X64: # BB#0:
+; X64-NEXT: pminuw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp ult <8 x i16> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+ %bc = bitcast <8 x i16> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_min_epu32:
+; X32: # BB#0:
+; X32-NEXT: pminud %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epu32:
+; X64: # BB#0:
+; X64-NEXT: pminud %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp ult <4 x i32> %arg0, %arg1
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
+ %bc = bitcast <4 x i32> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_minpos_epu16:
+; X32: # BB#0:
+; X32-NEXT: phminposuw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_minpos_epu16:
+; X64: # BB#0:
+; X64-NEXT: phminposuw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %arg0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_mpsadbw_epu8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mpsadbw_epu8:
+; X32: # BB#0:
+; X32-NEXT: mpsadbw $1, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mpsadbw_epu8:
+; X64: # BB#0:
+; X64-NEXT: mpsadbw $1, %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %arg0, <16 x i8> %arg1, i8 1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mul_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmuldq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmuldq %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %arg0, <4 x i32> %arg1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mullo_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmulld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mullo_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmulld %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = mul <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_packus_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packus_epi32:
+; X32: # BB#0:
+; X32-NEXT: packusdw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_packus_epi32:
+; X64: # BB#0:
+; X64-NEXT: packusdw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_round_pd:
+; X32: # BB#0:
+; X32-NEXT: roundpd $4, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_round_pd:
+; X64: # BB#0:
+; X64-NEXT: roundpd $4, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_round_ps:
+; X32: # BB#0:
+; X32-NEXT: roundps $4, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_round_ps:
+; X64: # BB#0:
+; X64-NEXT: roundps $4, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_round_sd:
+; X32: # BB#0:
+; X32-NEXT: roundsd $4, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_round_sd:
+; X64: # BB#0:
+; X64-NEXT: roundsd $4, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 4)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_round_ss:
+; X32: # BB#0:
+; X32-NEXT: roundss $4, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_round_ss:
+; X64: # BB#0:
+; X64-NEXT: roundss $4, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 4)
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm_stream_load_si128(<2 x i64>* %a0) {
+; X32-LABEL: test_mm_stream_load_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntdqa (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_load_si128:
+; X64: # BB#0:
+; X64-NEXT: movntdqa (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64>* %a0 to i8*
+ %res = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
+
+define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
+; X32-LABEL: test_mm_test_all_ones:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqd %xmm1, %xmm1
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_all_ones:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> <i64 -1, i64 -1>)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define i32 @test_mm_test_all_zeros(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_test_all_zeros:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_all_zeros:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+
+define i32 @test_mm_test_mix_ones_zeros(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_test_mix_ones_zeros:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_mix_ones_zeros:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_testc_si128:
+; X32: # BB#0:
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testc_si128:
+; X64: # BB#0:
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
+
+define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_testnzc_si128:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testnzc_si128:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
+
+define i32 @test_mm_testz_si128(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_testz_si128:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testz_si128:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index 75f69ffd6db9..4f6aa798faf0 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -1,17 +1,25 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse4.1 | FileCheck %s
+
; This test works just like the non-upgrade one except that it only checks
; forms which require auto-upgrading.
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: blendpd
- %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+; CHECK-LABEL: test_x86_sse41_blendpd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: blendps
+; CHECK-LABEL: test_x86_sse41_blendps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -19,7 +27,10 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounw
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: dppd
+; CHECK-LABEL: test_x86_sse41_dppd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: dppd $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -27,7 +38,10 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounw
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: dpps
+; CHECK-LABEL: test_x86_sse41_dpps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: dpps $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -35,15 +49,21 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: insertps
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+; CHECK-LABEL: test_x86_sse41_insertps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
+; CHECK-NEXT: retl
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 17) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: mpsadbw
+; CHECK-LABEL: test_x86_sse41_mpsadbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: mpsadbw $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -51,7 +71,10 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind re
define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pblendw
+; CHECK-LABEL: test_x86_sse41_pblendw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -59,7 +82,10 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind re
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
- ; CHECK: pmovsxbd
+; CHECK-LABEL: test_x86_sse41_pmovsxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxbd %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -67,7 +93,10 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
- ; CHECK: pmovsxbq
+; CHECK-LABEL: test_x86_sse41_pmovsxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxbq %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -75,7 +104,10 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
- ; CHECK: pmovsxbw
+; CHECK-LABEL: test_x86_sse41_pmovsxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxbw %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -83,7 +115,10 @@ declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
- ; CHECK: pmovsxdq
+; CHECK-LABEL: test_x86_sse41_pmovsxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -91,7 +126,10 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
- ; CHECK: pmovsxwd
+; CHECK-LABEL: test_x86_sse41_pmovsxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxwd %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -99,8 +137,166 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
- ; CHECK: pmovsxwq
+; CHECK-LABEL: test_x86_sse41_pmovsxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxwq %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: retl
+ %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+define <16 x i8> @max_epi8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: max_epi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxsb %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <16 x i8> @min_epi8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: min_epi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminsb %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @max_epu16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: max_epu16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxuw %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @min_epu16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: min_epu16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminuw %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @max_epi32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: max_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxsd %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @min_epi32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: min_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminsd %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @max_epu32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: max_epu32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxud %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @min_epu32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: min_epu32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminud %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
index ceff4f9782e9..58eae1057f89 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -1,24 +1,20 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s
-
-define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: blendpd
- %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: blendps
- %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: blendvpd
+; SSE41-LABEL: test_x86_sse41_blendvpd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movapd %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_blendvpd:
+; KNL: ## BB#0:
+; KNL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -26,7 +22,18 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: blendvps
+; SSE41-LABEL: test_x86_sse41_blendvps:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm1, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_blendvps:
+; KNL: ## BB#0:
+; KNL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -34,7 +41,15 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: dppd
+; SSE41-LABEL: test_x86_sse41_dppd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: dppd $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_dppd:
+; KNL: ## BB#0:
+; KNL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -42,7 +57,15 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: dpps
+; SSE41-LABEL: test_x86_sse41_dpps:
+; SSE41: ## BB#0:
+; SSE41-NEXT: dpps $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_dpps:
+; KNL: ## BB#0:
+; KNL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -50,8 +73,16 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: insertps
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
+; SSE41-LABEL: test_x86_sse41_insertps:
+; SSE41: ## BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_insertps:
+; KNL: ## BB#0:
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
+; KNL-NEXT: retl
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
@@ -59,7 +90,15 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: mpsadbw
+; SSE41-LABEL: test_x86_sse41_mpsadbw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: mpsadbw $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_mpsadbw:
+; KNL: ## BB#0:
+; KNL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -67,7 +106,15 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: packusdw
+; SSE41-LABEL: test_x86_sse41_packusdw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_packusdw:
+; KNL: ## BB#0:
+; KNL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -75,23 +122,34 @@ declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readno
define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
- ; CHECK: pblendvb
+; SSE41-LABEL: test_x86_sse41_pblendvb:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pblendvb:
+; KNL: ## BB#0:
+; KNL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pblendw
- %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
-
-
define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
- ; CHECK: phminposuw
+; SSE41-LABEL: test_x86_sse41_phminposuw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: phminposuw %xmm0, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_phminposuw:
+; KNL: ## BB#0:
+; KNL-NEXT: vphminposuw %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -99,7 +157,15 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pmaxsb
+; SSE41-LABEL: test_x86_sse41_pmaxsb:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmaxsb %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmaxsb:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -107,7 +173,15 @@ declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pmaxsd
+; SSE41-LABEL: test_x86_sse41_pmaxsd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmaxsd %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmaxsd:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -115,7 +189,15 @@ declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pmaxud
+; SSE41-LABEL: test_x86_sse41_pmaxud:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmaxud:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -123,7 +205,15 @@ declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmaxuw
+; SSE41-LABEL: test_x86_sse41_pmaxuw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmaxuw %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmaxuw:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -131,7 +221,15 @@ declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pminsb
+; SSE41-LABEL: test_x86_sse41_pminsb:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pminsb %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pminsb:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -139,7 +237,15 @@ declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pminsd
+; SSE41-LABEL: test_x86_sse41_pminsd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pminsd %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pminsd:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -147,7 +253,15 @@ declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pminud
+; SSE41-LABEL: test_x86_sse41_pminud:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pminud %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pminud:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -155,63 +269,31 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pminuw
+; SSE41-LABEL: test_x86_sse41_pminuw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pminuw %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pminuw:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
-define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
- ; CHECK: pmovzxbd
- %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
- ; CHECK: pmovzxbq
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
- ; CHECK: pmovzxbw
- %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
- ; CHECK: pmovzxdq
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
- ; CHECK: pmovzxwd
- %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
- ; CHECK: pmovzxwq
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
-
-
define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pmuldq
+; SSE41-LABEL: test_x86_sse41_pmuldq:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmuldq %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmuldq:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -219,8 +301,19 @@ declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: ptest
- ; CHECK: sbbl
+; SSE41-LABEL: test_x86_sse41_ptestc:
+; SSE41: ## BB#0:
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: sbbl %eax, %eax
+; SSE41-NEXT: andl $1, %eax
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_ptestc:
+; KNL: ## BB#0:
+; KNL-NEXT: vptest %xmm1, %xmm0
+; KNL-NEXT: sbbl %eax, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -228,9 +321,19 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: ptest
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE41-LABEL: test_x86_sse41_ptestnzc:
+; SSE41: ## BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: seta %al
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_ptestnzc:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vptest %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -238,9 +341,19 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: ptest
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE41-LABEL: test_x86_sse41_ptestz:
+; SSE41: ## BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_ptestz:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vptest %xmm1, %xmm0
+; KNL-NEXT: sete %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -248,7 +361,15 @@ declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
- ; CHECK: roundpd
+; SSE41-LABEL: test_x86_sse41_round_pd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $7, %xmm0, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_round_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vroundpd $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -256,7 +377,15 @@ declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readno
define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
- ; CHECK: roundps
+; SSE41-LABEL: test_x86_sse41_round_ps:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $7, %xmm0, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_round_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vroundps $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -264,7 +393,15 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: roundsd
+; SSE41-LABEL: test_x86_sse41_round_sd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundsd $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_round_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -272,7 +409,15 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: roundss
+; SSE41-LABEL: test_x86_sse41_round_ss:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundss $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_round_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll b/test/CodeGen/X86/sse41-pmovxrm.ll
index a7e48d8ac038..756beb995c06 100644
--- a/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll
+++ b/test/CodeGen/X86/sse41-pmovxrm.ll
@@ -109,8 +109,9 @@ define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) {
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %1)
- ret <8 x i16> %2
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = zext <8 x i8> %2 to <8 x i16>
+ ret <8 x i16> %3
}
define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) {
@@ -124,8 +125,9 @@ define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) {
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %1)
- ret <4 x i32> %2
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i8> %2 to <4 x i32>
+ ret <4 x i32> %3
}
define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) {
@@ -139,8 +141,9 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) {
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %1)
- ret <2 x i64> %2
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %3 = zext <2 x i8> %2 to <2 x i64>
+ ret <2 x i64> %3
}
define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) {
@@ -154,8 +157,9 @@ define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) {
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %1)
- ret <4 x i32> %2
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i16> %2 to <4 x i32>
+ ret <4 x i32> %3
}
define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) {
@@ -169,8 +173,9 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) {
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %1)
- ret <2 x i64> %2
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %3 = zext <2 x i16> %2 to <2 x i64>
+ ret <2 x i64> %3
}
define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) {
@@ -184,13 +189,7 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) {
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a, align 1
- %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %1)
- ret <2 x i64> %2
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = zext <2 x i32> %2 to <2 x i64>
+ ret <2 x i64> %3
}
-
-declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>)
-declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>)
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>)
-declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>)
-declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>)
-declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>)
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 0a83a9753b81..3cb754c8f93f 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -141,14 +141,14 @@ define i32 @ext_3(<4 x i32> %v) nounwind {
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_1:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X64-NEXT: retq
- %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
+ %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
ret <4 x float> %tmp1
}
@@ -208,16 +208,16 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32: ## BB#0:
+; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sete %al
-; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
;
; X64-LABEL: ptestz_1:
; X64: ## BB#0:
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sete %al
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
%tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
ret i32 %tmp1
@@ -244,16 +244,16 @@ define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32: ## BB#0:
+; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: seta %al
-; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
;
; X64-LABEL: ptestz_3:
; X64: ## BB#0:
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: seta %al
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
%tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
ret i32 %tmp1
@@ -507,16 +507,12 @@ define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32: ## BB#0:
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X00A:
; X64: ## BB#0:
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -701,16 +697,16 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
; X32: ## BB#0:
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; X32-NEXT: pxor %xmm0, %xmm0
-; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; X32-NEXT: pxor %xmm1, %xmm1
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X00X:
; X64: ## BB#0:
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -850,16 +846,12 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64: ## BB#0:
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = getelementptr inbounds float, float* %fb, i64 %index
%2 = load float, float* %1, align 4
@@ -875,16 +867,12 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movups (%eax), %xmm1
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64: ## BB#0:
-; X64-NEXT: movups (%rdi), %xmm1
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b, align 4
%2 = extractelement <4 x float> %1, i32 0
@@ -896,14 +884,12 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
ret <4 x float> %7
}
-;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_multiple_use:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
@@ -916,7 +902,6 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64: ## BB#0:
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..0a69d2632123
--- /dev/null
+++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse42-builtins.c
+
+define i64 @test_mm_crc64_u8(i64 %a0, i8 %a1) nounwind {
+; X64-LABEL: test_mm_crc64_u8:
+; X64: # BB#0:
+; X64-NEXT: crc32b %sil, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1)
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind readnone
+
+define i64 @test_mm_crc64_u64(i64 %a0, i64 %a1) nounwind {
+; X64-LABEL: test_mm_crc64_u64:
+; X64: # BB#0:
+; X64-NEXT: crc32q %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..53b94e7f0d39
--- /dev/null
+++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
@@ -0,0 +1,401 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse42-builtins.c
+
+define i32 @test_mm_cmpestra(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_cmpestra:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: seta %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestra:
+; X64: # BB#0:
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: seta %r8b
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestrc(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
+; X32-LABEL: test_mm_cmpestrc:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestrc:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestri(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
+; X32-LABEL: test_mm_cmpestri:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestri:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <2 x i64> @test_mm_cmpestrm(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
+; X32-LABEL: test_mm_cmpestrm:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pcmpestrm $7, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestrm:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestrm $7, %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestro(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_cmpestro:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: seto %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestro:
+; X64: # BB#0:
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: seto %r8b
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestrs(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_cmpestrs:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: sets %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestrs:
+; X64: # BB#0:
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: sets %r8b
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestrz(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_cmpestrz:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: sete %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestrz:
+; X64: # BB#0:
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: sete %r8b
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <2 x i64> @test_mm_cmpgt_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpgt_epi64:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_epi64:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = icmp sgt <2 x i64> %a0, %a1
+ %res = sext <2 x i1> %cmp to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define i32 @test_mm_cmpistra(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistra:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistra:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistrc(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistrc:
+; X32: # BB#0:
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistrc:
+; X64: # BB#0:
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistri(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistri:
+; X32: # BB#0:
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistri:
+; X64: # BB#0:
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_cmpistrm(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistrm:
+; X32: # BB#0:
+; X32-NEXT: pcmpistrm $7, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistrm:
+; X64: # BB#0:
+; X64-NEXT: pcmpistrm $7, %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistro(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistro:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: seto %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistro:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: seto %al
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistrs(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistrs:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: sets %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistrs:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: sets %al
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistrz(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistrz:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistrz:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_crc32_u8(i32 %a0, i8 %a1) {
+; X32-LABEL: test_mm_crc32_u8:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32b %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_crc32_u8:
+; X64: # BB#0:
+; X64-NEXT: crc32b %sil, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind readnone
+
+define i32 @test_mm_crc32_u16(i32 %a0, i16 %a1) {
+; X32-LABEL: test_mm_crc32_u16:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32w %cx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_crc32_u16:
+; X64: # BB#0:
+; X64-NEXT: crc32w %si, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind readnone
+
+define i32 @test_mm_crc32_u32(i32 %a0, i32 %a1) {
+; X32-LABEL: test_mm_crc32_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32l {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_crc32_u32:
+; X64: # BB#0:
+; X64-NEXT: crc32l %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86.ll b/test/CodeGen/X86/sse42-intrinsics-x86.ll
index 706c86b71a4a..2b31109ce45c 100644
--- a/test/CodeGen/X86/sse42-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -1,10 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.2 | FileCheck %s
define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl $7
- ; CHECK: movl $7
- ; CHECK: pcmpestri $7
- ; CHECK: movl
+; CHECK-LABEL: test_x86_sse42_pcmpestri128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -12,10 +16,16 @@ declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nou
define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
- ; CHECK: movl $7
- ; CHECK: movl $7
- ; CHECK: pcmpestri $7, (
- ; CHECK: movl
+; CHECK-LABEL: test_x86_sse42_pcmpestri128_load:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movdqa (%eax), %xmm0
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestri $7, (%ecx), %xmm0
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a0
%2 = load <16 x i8>, <16 x i8>* %a2
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
@@ -23,11 +33,18 @@ define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
}
-define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: seta
+define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; CHECK-LABEL: test_x86_sse42_pcmpestria128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: seta %bl
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -35,43 +52,68 @@ declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: sbbl
+; CHECK-LABEL: test_x86_sse42_pcmpestric128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: seto
+define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; CHECK-LABEL: test_x86_sse42_pcmpestrio128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: seto %bl
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: sets
+define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; CHECK-LABEL: test_x86_sse42_pcmpestris128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: sets %bl
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: sete
+define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; CHECK-LABEL: test_x86_sse42_pcmpestriz128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: sete %bl
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -79,10 +121,12 @@ declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestrm
- ; CHECK-NOT: vmov
+; CHECK-LABEL: test_x86_sse42_pcmpestrm128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestrm $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -90,10 +134,13 @@ declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i
define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
- ; CHECK: movl $7
- ; CHECK: movl $7
- ; CHECK: pcmpestrm $7,
- ; CHECK-NOT: vmov
+; CHECK-LABEL: test_x86_sse42_pcmpestrm128_load:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestrm $7, (%ecx), %xmm0
+; CHECK-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a2
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -101,8 +148,11 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2
define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri $7
- ; CHECK: movl
+; CHECK-LABEL: test_x86_sse42_pcmpistri128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -110,8 +160,14 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
- ; CHECK: pcmpistri $7, (
- ; CHECK: movl
+; CHECK-LABEL: test_x86_sse42_pcmpistri128_load:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movdqa (%ecx), %xmm0
+; CHECK-NEXT: pcmpistri $7, (%eax), %xmm0
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a0
%2 = load <16 x i8>, <16 x i8>* %a1
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
@@ -120,8 +176,12 @@ define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: seta
+; CHECK-LABEL: test_x86_sse42_pcmpistria128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -129,8 +189,12 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: sbbl
+; CHECK-LABEL: test_x86_sse42_pcmpistric128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -138,8 +202,12 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: seto
+; CHECK-LABEL: test_x86_sse42_pcmpistrio128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: seto %al
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -147,8 +215,12 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: sets
+; CHECK-LABEL: test_x86_sse42_pcmpistris128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: sets %al
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -156,8 +228,12 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: sete
+; CHECK-LABEL: test_x86_sse42_pcmpistriz128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -165,8 +241,10 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistrm $7
- ; CHECK-NOT: vmov
+; CHECK-LABEL: test_x86_sse42_pcmpistrm128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pcmpistrm $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -174,8 +252,11 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwin
define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
- ; CHECK: pcmpistrm $7, (
- ; CHECK-NOT: vmov
+; CHECK-LABEL: test_x86_sse42_pcmpistrm128_load:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: pcmpistrm $7, (%eax), %xmm0
+; CHECK-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a1
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
diff --git a/test/CodeGen/X86/sse42.ll b/test/CodeGen/X86/sse42.ll
index db51d9973688..2d05f9884c42 100644
--- a/test/CodeGen/X86/sse42.ll
+++ b/test/CodeGen/X86/sse42.ll
@@ -1,39 +1,58 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s --check-prefix=X64
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
+; X32-LABEL: crc32_32_8:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32b {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: crc32_32_8:
+; X64: ## BB#0:
+; X64-NEXT: crc32b %sil, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
ret i32 %tmp
-; X32: _crc32_32_8:
-; X32: crc32b 8(%esp), %eax
-
-; X64: _crc32_32_8:
-; X64: crc32b %sil,
}
define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
+; X32-LABEL: crc32_32_16:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32w {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: crc32_32_16:
+; X64: ## BB#0:
+; X64-NEXT: crc32w %si, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
ret i32 %tmp
-; X32: _crc32_32_16:
-; X32: crc32w 8(%esp), %eax
-
-; X64: _crc32_32_16:
-; X64: crc32w %si,
}
define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
+; X32-LABEL: crc32_32_32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32l {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: crc32_32_32:
+; X64: ## BB#0:
+; X64-NEXT: crc32l %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
ret i32 %tmp
-; X32: _crc32_32_32:
-; X32: crc32l 8(%esp), %eax
-
-; X64: _crc32_32_32:
-; X64: crc32l %esi,
}
diff --git a/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
index f93a16a5eb3d..f45abf1d85df 100644
--- a/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse4a-builtins.c
@@ -65,7 +67,7 @@ define <2 x i64> @test_mm_insert_si64(<2 x i64> %x, <2 x i64> %y) {
}
declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind readnone
-define void @test_stream_sd(i8* %p, <2 x double> %a) {
+define void @test_stream_sd(double* %p, <2 x double> %a) {
; X32-LABEL: test_stream_sd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -76,12 +78,12 @@ define void @test_stream_sd(i8* %p, <2 x double> %a) {
; X64: # BB#0:
; X64-NEXT: movntsd %xmm0, (%rdi)
; X64-NEXT: retq
- call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
+ %1 = extractelement <2 x double> %a, i64 0
+ store double %1, double* %p, align 1, !nontemporal !1
ret void
}
-declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>) nounwind readnone
-define void @test_mm_stream_ss(i8* %p, <4 x float> %a) {
+define void @test_mm_stream_ss(float* %p, <4 x float> %a) {
; X32-LABEL: test_mm_stream_ss:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -92,7 +94,9 @@ define void @test_mm_stream_ss(i8* %p, <4 x float> %a) {
; X64: # BB#0:
; X64-NEXT: movntss %xmm0, (%rdi)
; X64-NEXT: retq
- call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
+ %1 = extractelement <4 x float> %a, i64 0
+ store float %1, float* %p, align 1, !nontemporal !1
ret void
}
-declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>) nounwind readnone
+
+!1 = !{i32 1}
diff --git a/test/CodeGen/X86/sse4a-upgrade.ll b/test/CodeGen/X86/sse4a-upgrade.ll
new file mode 100644
index 000000000000..a129c658f4b9
--- /dev/null
+++ b/test/CodeGen/X86/sse4a-upgrade.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
+
+define void @test_movntss(i8* %p, <4 x float> %a) nounwind optsize ssp {
+; X32-LABEL: test_movntss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_movntss:
+; X64: # BB#0:
+; X64-NEXT: movntss %xmm0, (%rdi)
+; X64-NEXT: retq
+ tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
+define void @test_movntsd(i8* %p, <2 x double> %a) nounwind optsize ssp {
+; X32-LABEL: test_movntsd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_movntsd:
+; X64: # BB#0:
+; X64-NEXT: movntsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
diff --git a/test/CodeGen/X86/sse4a.ll b/test/CodeGen/X86/sse4a.ll
index f8fa125f98e7..1f582fb4ed9d 100644
--- a/test/CodeGen/X86/sse4a.ll
+++ b/test/CodeGen/X86/sse4a.ll
@@ -1,36 +1,35 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4a | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=sse4a | FileCheck %s
-
-define void @test1(i8* %p, <4 x float> %a) nounwind optsize ssp {
-; CHECK-LABEL: test1:
-; CHECK: movntss
- tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
- ret void
-}
-
-declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
-
-define void @test2(i8* %p, <2 x double> %a) nounwind optsize ssp {
-; CHECK-LABEL: test2:
-; CHECK: movntsd
- tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
- ret void
-}
-
-declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
-define <2 x i64> @test3(<2 x i64> %x) nounwind uwtable ssp {
-; CHECK-LABEL: test3:
-; CHECK: extrq
+define <2 x i64> @test_extrqi(<2 x i64> %x) nounwind uwtable ssp {
+; X32-LABEL: test_extrqi:
+; X32: # BB#0:
+; X32-NEXT: extrq $2, $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_extrqi:
+; X64: # BB#0:
+; X64-NEXT: extrq $2, $3, %xmm0
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
ret <2 x i64> %1
}
declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
-define <2 x i64> @test4(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
-; CHECK-LABEL: test4:
-; CHECK: extrq
+define <2 x i64> @test_extrq(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; X32-LABEL: test_extrq:
+; X32: # BB#0:
+; X32-NEXT: extrq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_extrq:
+; X64: # BB#0:
+; X64-NEXT: extrq %xmm1, %xmm0
+; X64-NEXT: retq
%1 = bitcast <2 x i64> %y to <16 x i8>
%2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
ret <2 x i64> %2
@@ -38,18 +37,32 @@ define <2 x i64> @test4(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
-define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
-; CHECK-LABEL: test5:
-; CHECK: insertq
+define <2 x i64> @test_insertqi(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; X32-LABEL: test_insertqi:
+; X32: # BB#0:
+; X32-NEXT: insertq $6, $5, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_insertqi:
+; X64: # BB#0:
+; X64-NEXT: insertq $6, $5, %xmm1, %xmm0
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 5, i8 6)
ret <2 x i64> %1
}
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
-define <2 x i64> @test6(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
-; CHECK-LABEL: test6:
-; CHECK: insertq
+define <2 x i64> @test_insertq(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; X32-LABEL: test_insertq:
+; X32: # BB#0:
+; X32-NEXT: insertq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_insertq:
+; X64: # BB#0:
+; X64-NEXT: insertq %xmm1, %xmm0
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
ret <2 x i64> %1
}
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
index 8d61428420f6..51359d1790af 100644
--- a/test/CodeGen/X86/sse_partial_update.ll
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
; rdar: 12558838
@@ -8,14 +9,15 @@
; destination of each scalar unary op are the same.
define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp {
-entry:
; CHECK-LABEL: rsqrtss:
-; CHECK: rsqrtss %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movshdup
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movap
-; CHECK-NEXT: jmp
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: rsqrtss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
+; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: jmp _callee ## TAILCALL
+entry:
%0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %0, i32 0
@@ -29,14 +31,15 @@ declare void @callee(double, double)
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define void @rcpss(<4 x float> %a) nounwind uwtable ssp {
-entry:
; CHECK-LABEL: rcpss:
-; CHECK: rcpss %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movshdup
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movap
-; CHECK-NEXT: jmp
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: rcpss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
+; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: jmp _callee ## TAILCALL
+entry:
%0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %0, i32 0
@@ -49,14 +52,15 @@ entry:
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {
-entry:
; CHECK-LABEL: sqrtss:
-; CHECK: sqrtss %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movshdup
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movap
-; CHECK-NEXT: jmp
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: sqrtss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
+; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: jmp _callee ## TAILCALL
+entry:
%0 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %0, i32 0
@@ -69,14 +73,15 @@ entry:
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
-entry:
; CHECK-LABEL: sqrtsd:
-; CHECK: sqrtsd %xmm0, %xmm0
-; CHECK-NEXT: cvtsd2ss %xmm0
-; CHECK-NEXT: shufpd
-; CHECK-NEXT: cvtsd2ss %xmm0
-; CHECK-NEXT: movap
-; CHECK-NEXT: jmp
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: sqrtsd %xmm0, %xmm0
+; CHECK-NEXT: cvtsd2ss %xmm0, %xmm2
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: jmp _callee2 ## TAILCALL
+entry:
%0 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a) nounwind
%a0 = extractelement <2 x double> %0, i32 0
@@ -92,10 +97,11 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define <2 x double> @load_fold_cvtss2sd_int(<4 x float> *%a) {
; CHECK-LABEL: load_fold_cvtss2sd_int:
-; CHECK: movaps (%rdi), %xmm1
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK: ## BB#0:
+; CHECK-NEXT: movaps (%rdi), %xmm1
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm1, %xmm0
+; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
@@ -103,9 +109,10 @@ define <2 x double> @load_fold_cvtss2sd_int(<4 x float> *%a) {
define <2 x double> @load_fold_cvtss2sd_int_optsize(<4 x float> *%a) optsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_optsize:
-; CHECK: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
-; CHECK-NEXT: retq
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
+; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
@@ -113,9 +120,10 @@ define <2 x double> @load_fold_cvtss2sd_int_optsize(<4 x float> *%a) optsize {
define <2 x double> @load_fold_cvtss2sd_int_minsize(<4 x float> *%a) minsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_minsize:
-; CHECK: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
-; CHECK-NEXT: retq
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
+; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
diff --git a/test/CodeGen/X86/ssp-data-layout.ll b/test/CodeGen/X86/ssp-data-layout.ll
index 4a63aceb7ccf..e954d9c1042a 100644
--- a/test/CodeGen/X86/ssp-data-layout.ll
+++ b/test/CodeGen/X86/ssp-data-layout.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s
; This test is fairly fragile. The goal is to ensure that "large" stack
; objects are allocated closest to the stack protector (i.e., farthest away
; from the Stack Pointer.) In standard SSP mode this means that large (>=
diff --git a/test/CodeGen/X86/ssp-guard-spill.ll b/test/CodeGen/X86/ssp-guard-spill.ll
new file mode 100644
index 000000000000..7364dee4f080
--- /dev/null
+++ b/test/CodeGen/X86/ssp-guard-spill.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "x86_64-apple-macosx10.4.0"
+
+; __stack_chk_guard must be loaded twice, once for setting up the canary and
+; again for performing the check, because if we reuse the same stack guard
+; value it may get spilled to the stack, where the for loop could then
+; corrupt it.
+;
+; bool Bar(int*);
+; bool Foo(int n) {
+; int a[10];
+; for (int i = 0; i < n; i++) {
+; a[i] = 0;
+; }
+; return Bar(a);
+; }
+;
+; CHECK: movq ___stack_chk_guard
+; CHECK: movq ___stack_chk_guard
+define zeroext i1 @_Z3Fooi(i32 %n) sspstrong {
+entry:
+ %n.addr = alloca i32, align 4
+ %a = alloca [10 x i32], align 16
+ %i = alloca i32, align 4
+ store i32 %n, i32* %n.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %tmp = load i32, i32* %i, align 4
+ %tmp1 = load i32, i32* %n.addr, align 4
+ %cmp = icmp slt i32 %tmp, %tmp1
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %tmp2 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %tmp2 to i64
+ %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 %idxprom
+ store i32 0, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %tmp3 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %tmp3, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %a, i32 0, i32 0
+ %call = call zeroext i1 @_Z3BarPi(i32* %arraydecay)
+ ret i1 %call
+}
+
+declare zeroext i1 @_Z3BarPi(i32*)
diff --git a/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll b/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
index 4f7ff20c6e0d..163dc0bc9a0c 100644
--- a/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/ssse3-builtins.c
@@ -57,13 +58,13 @@ declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
define <2 x i64> @test_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_alignr_epi8:
; X32: # BB#0:
-; X32-NEXT: palignr {{.*#}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
+; X32-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_alignr_epi8:
; X64: # BB#0:
-; X64-NEXT: palignr {{.*#}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
+; X64-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -73,6 +74,25 @@ define <2 x i64> @test_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
+define <2 x i64> @test2_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test2_mm_alignr_epi8:
+; X32: # BB#0:
+; X32-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test2_mm_alignr_epi8:
+; X64: # BB#0:
+; X64-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+ %res = bitcast <16 x i8> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
define <2 x i64> @test_mm_hadd_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_hadd_epi16:
; X32: # BB#0:
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll
index 0cff95f266a9..04bae023984f 100644
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -61,3 +61,31 @@ entry:
; CHECK-NOT: and
; CHECK: ret
}
+
+%struct.sixteen = type { [16 x i8] }
+
+; Accessing stack parameters shouldn't assume stack alignment. Here we should
+; emit two 8-byte loads, followed by two 8-byte stores.
+define x86_stdcallcc void @test5(%struct.sixteen* byval nocapture readonly align 4 %s) #0 {
+ %d.sroa.0 = alloca [16 x i8], align 1
+ %1 = getelementptr inbounds [16 x i8], [16 x i8]* %d.sroa.0, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 16, i8* %1)
+ %2 = getelementptr inbounds %struct.sixteen, %struct.sixteen* %s, i32 0, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %2, i32 16, i32 1, i1 true)
+ call void @llvm.lifetime.end(i64 16, i8* %1)
+ ret void
+; CHECK-LABEL: test5:
+; CHECK: and
+; CHECK: movsd
+; CHECK-NEXT: movsd
+; CHECK-NEXT: movsd
+; CHECK-NEXT: movsd
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) argmemonly nounwind
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) argmemonly nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) argmemonly nounwind
+
+attributes #0 = { nounwind alignstack=16 "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
diff --git a/test/CodeGen/X86/stack-align2.ll b/test/CodeGen/X86/stack-align2.ll
index 18cce7266d13..7239198000c9 100644
--- a/test/CodeGen/X86/stack-align2.ll
+++ b/test/CodeGen/X86/stack-align2.ll
@@ -1,7 +1,9 @@
; RUN: llc < %s -mcpu=generic -mtriple=i386-linux | FileCheck %s -check-prefix=LINUX-I386
+; RUN: llc < %s -mcpu=generic -mtriple=i386-kfreebsd | FileCheck %s -check-prefix=KFREEBSD-I386
; RUN: llc < %s -mcpu=generic -mtriple=i386-netbsd | FileCheck %s -check-prefix=NETBSD-I386
; RUN: llc < %s -mcpu=generic -mtriple=i686-apple-darwin8 | FileCheck %s -check-prefix=DARWIN-I386
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s -check-prefix=LINUX-X86_64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-kfreebsd | FileCheck %s -check-prefix=KFREEBSD-X86_64
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-netbsd | FileCheck %s -check-prefix=NETBSD-X86_64
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin8 | FileCheck %s -check-prefix=DARWIN-X86_64
@@ -11,6 +13,7 @@ entry:
ret i32 0
; LINUX-I386: subl $12, %esp
+; KFREEBSD-I386: subl $12, %esp
; DARWIN-I386: subl $12, %esp
; NETBSD-I386-NOT: subl {{.*}}, %esp
@@ -20,6 +23,8 @@ entry:
; DARWIN-X86_64-NOT: subq {{.*}}, %rsp
; NETBSD-X86_64: pushq %{{.*}}
; NETBSD-X86_64-NOT: subq {{.*}}, %rsp
+; KFREEBSD-X86_64: pushq %{{.*}}
+; KFREEBSD-X86_64-NOT: subq {{.*}}, %rsp
}
declare void @test2()
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index b86ec0ea22ff..5dfdf4b98adf 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -384,6 +384,14 @@ define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_cvtdq2pd
;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = sitofp <2 x i32> %2 to <2 x double>
+ ret <2 x double> %3
+}
+define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd_int
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
ret <2 x double> %2
}
@@ -393,6 +401,14 @@ define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = sitofp <4 x i32> %a0 to <4 x double>
+ ret <4 x double> %2
+}
+
+define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0)
ret <4 x double> %2
}
@@ -488,6 +504,15 @@ define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvtps2pd
;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ %3 = fpext <2 x float> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtps2pd_int
+ ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
ret <2 x double> %2
}
@@ -497,6 +522,14 @@ define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvtps2pd_ymm
;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fpext <4 x float> %a0 to <4 x double>
+ ret <4 x double> %2
+}
+
+define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtps2pd_ymm_int
+ ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0)
ret <4 x double> %2
}
@@ -524,7 +557,7 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
;CHECK-LABEL: stack_fold_cvtsd2si_int
- ;CHECK: cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
ret i32 %2
@@ -535,7 +568,7 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
;CHECK-LABEL: stack_fold_cvtsd2si64_int
- ;CHECK: cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
ret i64 %2
@@ -546,7 +579,7 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) {
;CHECK-LABEL: stack_fold_cvtsd2ss_int
- ;CHECK: cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
ret <4 x float> %2
@@ -555,7 +588,7 @@ declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind
define double @stack_fold_cvtsi2sd(i32 %a0) {
;CHECK-LABEL: stack_fold_cvtsi2sd
- ;CHECK: cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ ;CHECK: vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = sitofp i32 %a0 to double
ret double %2
@@ -563,7 +596,7 @@ define double @stack_fold_cvtsi2sd(i32 %a0) {
define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
;CHECK-LABEL: stack_fold_cvtsi2sd_int
- ;CHECK: cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ ;CHECK: vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 0x0, double 0x0>, i32 %a0)
ret <2 x double> %2
@@ -572,7 +605,7 @@ declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnon
define double @stack_fold_cvtsi642sd(i64 %a0) {
;CHECK-LABEL: stack_fold_cvtsi642sd
- ;CHECK: cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ ;CHECK: vcvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = sitofp i64 %a0 to double
ret double %2
@@ -580,7 +613,7 @@ define double @stack_fold_cvtsi642sd(i64 %a0) {
define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
;CHECK-LABEL: stack_fold_cvtsi642sd_int
- ;CHECK: cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ ;CHECK: vcvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> <double 0x0, double 0x0>, i64 %a0)
ret <2 x double> %2
@@ -589,7 +622,7 @@ declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readn
define float @stack_fold_cvtsi2ss(i32 %a0) {
;CHECK-LABEL: stack_fold_cvtsi2ss
- ;CHECK: cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ ;CHECK: vcvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = sitofp i32 %a0 to float
ret float %2
@@ -597,7 +630,7 @@ define float @stack_fold_cvtsi2ss(i32 %a0) {
define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
;CHECK-LABEL: stack_fold_cvtsi2ss_int
- ;CHECK: cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ ;CHECK: vcvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i32 %a0)
ret <4 x float> %2
@@ -606,7 +639,7 @@ declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define float @stack_fold_cvtsi642ss(i64 %a0) {
;CHECK-LABEL: stack_fold_cvtsi642ss
- ;CHECK: cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ ;CHECK: vcvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = sitofp i64 %a0 to float
ret float %2
@@ -614,7 +647,7 @@ define float @stack_fold_cvtsi642ss(i64 %a0) {
define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
;CHECK-LABEL: stack_fold_cvtsi642ss_int
- ;CHECK: cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ ;CHECK: vcvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i64 %a0)
ret <4 x float> %2
@@ -625,7 +658,7 @@ declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvtss2sd_int
- ;CHECK: cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
ret <2 x double> %2
@@ -748,7 +781,7 @@ define i64 @stack_fold_cvttss2si64(float %a0) {
define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvttss2si64_int
- ;CHECK: cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
ret i64 %2
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
new file mode 100644
index 000000000000..3ab96e3f4629
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -0,0 +1,137 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define double @stack_fold_addsd(double %a0, double %a1) {
+ ;CHECK-LABEL: stack_fold_addsd
+ ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fadd double %a0, %a1
+ ret double %2
+}
+
+define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_addsd_int
+ ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_addss(float %a0, float %a1) {
+ ;CHECK-LABEL: stack_fold_addss
+ ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fadd float %a0, %a1
+ ret float %2
+}
+
+define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_addss_int
+ ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_divsd_int
+ ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_divss(float %a0, float %a1) {
+ ;CHECK-LABEL: stack_fold_divss
+ ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fdiv float %a0, %a1
+ ret float %2
+}
+
+define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_divss_int
+ ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define double @stack_fold_mulsd(double %a0, double %a1) {
+ ;CHECK-LABEL: stack_fold_mulsd
+ ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fmul double %a0, %a1
+ ret double %2
+}
+
+define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_mulsd_int
+ ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_mulss(float %a0, float %a1) {
+ ;CHECK-LABEL: stack_fold_mulss
+ ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fmul float %a0, %a1
+ ret float %2
+}
+
+define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_mulss_int
+ ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define double @stack_fold_subsd(double %a0, double %a1) {
+ ;CHECK-LABEL: stack_fold_subsd
+ ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fsub double %a0, %a1
+ ret double %2
+}
+
+define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_subsd_int
+ ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_subss(float %a0, float %a1) {
+ ;CHECK-LABEL: stack_fold_subss
+ ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fsub float %a0, %a1
+ ret float %2
+}
+
+define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_subss_int
+ ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 9f689cfe85e5..4c675356df6d 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -244,6 +244,15 @@ define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_cvtdq2pd
;CHECK: cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = sitofp <2 x i32> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd_int
+ ;CHECK: cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
ret <2 x double> %2
}
@@ -287,6 +296,15 @@ define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvtps2pd
;CHECK: cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ %3 = fpext <2 x float> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtps2pd_int
+ ;CHECK: cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
ret <2 x double> %2
}
diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll
index 235a10ed4678..ef7fa2217145 100644
--- a/test/CodeGen/X86/stack-folding-int-avx2.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx2.ll
@@ -253,7 +253,9 @@ define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK: vpblendd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
- ret <4 x i32> %2
+ ; add forces execution domain
+ %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %3
}
define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
@@ -261,7 +263,9 @@ define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK: vpblendd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i32> %2
+ ; add forces execution domain
+ %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %3
}
define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) {
@@ -658,19 +662,19 @@ define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbd
;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0)
- ret <8 x i32> %2
+ %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = sext <8 x i8> %2 to <8 x i32>
+ ret <8 x i32> %3
}
-declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbq
;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0)
- ret <4 x i64> %2
+ %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
}
-declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbw
@@ -700,64 +704,61 @@ define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovsxwq
;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0)
- ret <4 x i64> %2
+ %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
}
-declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbd
;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0)
- ret <8 x i32> %2
+ %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = zext <8 x i8> %2 to <8 x i32>
+ ret <8 x i32> %3
}
-declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbq
;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0)
- ret <4 x i64> %2
+ %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
}
-declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbw
;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0)
+ %2 = zext <16 x i8> %a0 to <16 x i16>
ret <16 x i16> %2
}
-declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_pmovzxdq
;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0)
+ %2 = zext <4 x i32> %a0 to <4 x i64>
ret <4 x i64> %2
}
-declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovzxwd
;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0)
+ %2 = zext <8 x i16> %a0 to <8 x i32>
ret <8 x i32> %2
}
-declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovzxwq
;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0)
- ret <4 x i64> %2
+ %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
}
-declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_pmuldq
diff --git a/test/CodeGen/X86/stack-folding-xop.ll b/test/CodeGen/X86/stack-folding-xop.ll
index d0c48b400804..115d1a9cad3a 100644
--- a/test/CodeGen/X86/stack-folding-xop.ll
+++ b/test/CodeGen/X86/stack-folding-xop.ll
@@ -166,69 +166,69 @@ define <8 x i16> @stack_fold_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
}
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
-define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
;CHECK-LABEL: stack_fold_vpermil2pd_rm
;CHECK: vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 0)
+ %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
ret <2 x double> %2
}
-define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x i64> %a1, <2 x double> %a2) {
;CHECK-LABEL: stack_fold_vpermil2pd_mr
;CHECK: vpermil2pd $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x double> %a1, i8 0)
+ %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x i64> %a1, i8 0)
ret <2 x double> %2
}
-declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
-define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
;CHECK-LABEL: stack_fold_vpermil2pd_rm
;CHECK: vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 0)
+ %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
ret <4 x double> %2
}
-define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x double> %a2) {
;CHECK-LABEL: stack_fold_vpermil2pd_mr
;CHECK: vpermil2pd $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x double> %a1, i8 0)
+ %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x i64> %a1, i8 0)
ret <4 x double> %2
}
-declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
-define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
;CHECK-LABEL: stack_fold_vpermil2ps_rm
;CHECK: vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 0)
+ %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 0)
ret <4 x float> %2
}
-define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x i32> %a1, <4 x float> %a2) {
;CHECK-LABEL: stack_fold_vpermil2ps_mr
;CHECK: vpermil2ps $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x float> %a1, i8 0)
+ %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x i32> %a1, i8 0)
ret <4 x float> %2
}
-declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
-define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
;CHECK-LABEL: stack_fold_vpermil2ps_rm
;CHECK: vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 0)
+ %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 0)
ret <8 x float> %2
}
-define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x float> %a2) {
;CHECK-LABEL: stack_fold_vpermil2ps_mr
;CHECK: vpermil2ps $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x float> %a1, i8 0)
+ %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x i32> %a1, i8 0)
ret <8 x float> %2
}
-declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_vphaddbd
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index 237b96603c00..8413b8ef82cb 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -25,7 +25,7 @@ attributes #0 = { sspreq }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!21, !72}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.4 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !5, subprograms: !8, globals: !20, imports: !5)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.4 ", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !5, globals: !20, imports: !5)
!1 = !DIFile(filename: "<unknown>", directory: "/Users/matt/ryan_bug")
!2 = !{!3}
!3 = !DICompositeType(tag: DW_TAG_enumeration_type, line: 20, size: 32, align: 32, file: !1, scope: !4, elements: !6)
@@ -33,8 +33,7 @@ attributes #0 = { sspreq }
!5 = !{}
!6 = !{!7}
!7 = !DIEnumerator(name: "max_frame_size", value: 0) ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
-!8 = !{!9, !24, !41, !65}
-!9 = distinct !DISubprogram(name: "read_response_size", linkageName: "_Z18read_response_sizev", line: 27, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 27, file: !1, scope: !10, type: !11, variables: !14)
+!9 = distinct !DISubprogram(name: "read_response_size", linkageName: "_Z18read_response_sizev", line: 27, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 27, file: !1, scope: !10, type: !11, variables: !14)
!10 = !DIFile(filename: "<unknown>", directory: "/Users/matt/ryan_bug")
!11 = !DISubroutineType(types: !12)
!12 = !{!13}
@@ -49,7 +48,7 @@ attributes #0 = { sspreq }
!21 = !{i32 2, !"Dwarf Version", i32 2}
!22 = !{i64* getelementptr inbounds ({ i64, [56 x i8] }, { i64, [56 x i8] }* @a, i32 0, i32 0)}
!23 = !DILocalVariable(name: "p2", line: 12, arg: 2, scope: !24, file: !10, type: !32)
-!24 = distinct !DISubprogram(name: "min<unsigned long long>", linkageName: "_ZN3__13minIyEERKT_S3_RS1_", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 12, file: !1, scope: !25, type: !27, templateParams: !33, variables: !35)
+!24 = distinct !DISubprogram(name: "min<unsigned long long>", linkageName: "_ZN3__13minIyEERKT_S3_RS1_", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 12, file: !1, scope: !25, type: !27, templateParams: !33, variables: !35)
!25 = !DINamespace(name: "__1", line: 1, file: !26, scope: null)
!26 = !DIFile(filename: "main.cpp", directory: "/Users/matt/ryan_bug")
!27 = !DISubroutineType(types: !28)
@@ -66,7 +65,7 @@ attributes #0 = { sspreq }
!38 = !DILocation(line: 33, scope: !9)
!39 = !DILocation(line: 12, scope: !24, inlinedAt: !38)
!40 = !DILocation(line: 9, scope: !41, inlinedAt: !59)
-!41 = distinct !DISubprogram(name: "min<unsigned long long, __1::A>", linkageName: "_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 8, file: !1, scope: !25, type: !42, templateParams: !53, variables: !55)
+!41 = distinct !DISubprogram(name: "min<unsigned long long, __1::A>", linkageName: "_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 8, file: !1, scope: !25, type: !42, templateParams: !53, variables: !55)
!42 = !DISubroutineType(types: !43)
!43 = !{!29, !29, !32, !44}
!44 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", size: 8, align: 8, file: !1, scope: !25, elements: !45)
@@ -86,7 +85,7 @@ attributes #0 = { sspreq }
!59 = !DILocation(line: 13, scope: !24, inlinedAt: !38)
!63 = !{i32 undef}
!64 = !DILocalVariable(name: "p1", line: 1, arg: 2, scope: !65, file: !10, type: !50)
-!65 = distinct !DISubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !25, type: !47, declaration: !46, variables: !66)
+!65 = distinct !DISubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2, file: !1, scope: !25, type: !47, declaration: !46, variables: !66)
!66 = !{!67, !69, !70}
!67 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !65, type: !68)
!68 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !44)
diff --git a/test/CodeGen/X86/stack-protector-msvc.ll b/test/CodeGen/X86/stack-protector-msvc.ll
new file mode 100644
index 000000000000..5eccc65f2dec
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-msvc.ll
@@ -0,0 +1,40 @@
+
+; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-64 %s
+
+; MSVC-I386: movl ___security_cookie, %[[REG1:[a-z]*]]
+; MSVC-I386: movl %[[REG1]], [[SLOT:[0-9]*]](%esp)
+; MSVC-I386: calll _strcpy
+; MSVC-I386: movl [[SLOT]](%esp), %ecx
+; MSVC-I386: calll @__security_check_cookie@4
+; MSVC-I386: retl
+
+; MSVC-64: movq __security_cookie(%rip), %[[REG1:[a-z]*]]
+; MSVC-64: movq %[[REG1]], [[SLOT:[0-9]*]](%rsp)
+; MSVC-64: callq strcpy
+; MSVC-64: movq [[SLOT]](%rsp), %rcx
+; MSVC-64: callq __security_check_cookie
+
+@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1]
+
+define void @test(i8* %a) nounwind ssp {
+entry:
+ %a_addr = alloca i8* ; <i8**> [#uses=2]
+ %buf = alloca [8 x i8] ; <[8 x i8]*> [#uses=2]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ store i8* %a, i8** %a_addr
+ %buf1 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1]
+ %0 = load i8*, i8** %a_addr, align 4 ; <i8*> [#uses=1]
+ %1 = call i8* @strcpy(i8* %buf1, i8* %0) nounwind ; <i8*> [#uses=0]
+ %buf2 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1]
+ %2 = call i32 (i8*, ...) @printf(i8* getelementptr ([11 x i8], [11 x i8]* @"\01LC", i32 0, i32 0), i8* %buf2) nounwind ; <i32> [#uses=0]
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+declare i8* @strcpy(i8*, i8*) nounwind
+
+declare i32 @printf(i8*, ...) nounwind
+
diff --git a/test/CodeGen/X86/stack-protector-target.ll b/test/CodeGen/X86/stack-protector-target.ll
new file mode 100644
index 000000000000..66e45055b2b5
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-target.ll
@@ -0,0 +1,27 @@
+; Test target-specific stack cookie location.
+; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
+; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
+; RUN: llc -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+; RUN: llc -mtriple=i386-kfreebsd < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
+; RUN: llc -mtriple=x86_64-kfreebsd < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+
+define void @_Z1fv() sspreq {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @_Z7CapturePi(i32* nonnull %x)
+ ret void
+}
+
+declare void @_Z7CapturePi(i32*)
+
+; LINUX-X64: movq %fs:40, %[[B:.*]]
+; LINUX-X64: movq %[[B]], 16(%rsp)
+; LINUX-X64: movq %fs:40, %[[C:.*]]
+; LINUX-X64: cmpq 16(%rsp), %[[C]]
+
+; LINUX-I386: movl %gs:20, %[[B:.*]]
+; LINUX-I386: movl %[[B]], 8(%esp)
+; LINUX-I386: movl %gs:20, %[[C:.*]]
+; LINUX-I386: cmpl 8(%esp), %[[C]]
diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll
index dea66d28e3dd..58c6c713941d 100644
--- a/test/CodeGen/X86/stack-protector-weight.ll
+++ b/test/CodeGen/X86/stack-protector-weight.ll
@@ -1,17 +1,31 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=true %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=SELDAG
-; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
+; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=true %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DARWIN-SELDAG
+; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DARWIN-IR
+; RUN: llc -mtriple=i386-pc-windows-msvc -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=true %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=MSVC-SELDAG
+; RUN: llc -mtriple=i386-pc-windows-msvc -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=MSVC-IR
-; SELDAG: # Machine code for function test_branch_weights:
-; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
-; SELDAG: BB#[[FAILURE]]:
-; SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
-; SELDAG: BB#[[SUCCESS]]:
+; DARWIN-SELDAG: # Machine code for function test_branch_weights:
+; DARWIN-SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
+; DARWIN-SELDAG: BB#[[FAILURE]]:
+; DARWIN-SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
+; DARWIN-SELDAG: BB#[[SUCCESS]]:
-; IR: # Machine code for function test_branch_weights:
-; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
-; IR: BB#[[SUCCESS]]:
-; IR: BB#[[FAILURE]]:
-; IR: CALL64pcrel32 <ga:@__stack_chk_fail>
+; DARWIN-IR: # Machine code for function test_branch_weights:
+; DARWIN-IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
+; DARWIN-IR: BB#[[SUCCESS]]:
+; DARWIN-IR: BB#[[FAILURE]]:
+; DARWIN-IR: CALL64pcrel32 <ga:@__stack_chk_fail>
+
+; MSVC-SELDAG: # Machine code for function test_branch_weights:
+; MSVC-SELDAG: mem:Volatile LD4[@__security_cookie]
+; MSVC-SELDAG: ST4[FixedStack0]
+; MSVC-SELDAG: LD4[FixedStack0]
+; MSVC-SELDAG: CALLpcrel32 <ga:@__security_check_cookie>
+
+; MSVC-IR: # Machine code for function test_branch_weights:
+; MSVC-IR: mem:Volatile LD4[@__security_cookie]
+; MSVC-IR: ST4[FixedStack0]
+; MSVC-IR: LD4[%StackGuardSlot]
+; MSVC-IR: CALLpcrel32 <ga:@__security_check_cookie>
define i32 @test_branch_weights(i32 %n) #0 {
entry:
@@ -33,4 +47,4 @@ declare void @foo2(i32*)
declare void @llvm.lifetime.end(i64, i8* nocapture)
-attributes #0 = { ssp "stack-protector-buffer-size"="8" }
+attributes #0 = { sspstrong "stack-protector-buffer-size"="8" }
diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll
index 398b8548747b..ddfb14ca8cfe 100644
--- a/test/CodeGen/X86/stack-protector.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -3,6 +3,7 @@
; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-KERNEL-X64 %s
; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | FileCheck --check-prefix=DARWIN-X64 %s
; RUN: llc -mtriple=amd64-pc-openbsd < %s -o - | FileCheck --check-prefix=OPENBSD-AMD64 %s
+; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s
%struct.foo = type { [16 x i8] }
%struct.foo.0 = type { [4 x i8] }
@@ -40,6 +41,10 @@ entry:
; DARWIN-X64-LABEL: test1a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test1a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%buf = alloca [16 x i8], align 16
store i8* %a, i8** %a.addr, align 8
@@ -76,6 +81,10 @@ entry:
; OPENBSD-AMD64-LABEL: test1b:
; OPENBSD-AMD64: movq __guard_local(%rip)
; OPENBSD-AMD64: callq __stack_smash_handler
+
+; MSVC-I386-LABEL: test1b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [16 x i8], align 16
store i8* %a, i8** %a.addr, align 8
@@ -108,6 +117,10 @@ entry:
; DARWIN-X64-LABEL: test1c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test1c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [16 x i8], align 16
store i8* %a, i8** %a.addr, align 8
@@ -140,6 +153,10 @@ entry:
; DARWIN-X64-LABEL: test1d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test1d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [16 x i8], align 16
store i8* %a, i8** %a.addr, align 8
@@ -171,6 +188,10 @@ entry:
; DARWIN-X64-LABEL: test2a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test2a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo, align 1
store i8* %a, i8** %a.addr, align 8
@@ -239,6 +260,10 @@ entry:
; DARWIN-X64-LABEL: test2c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test2c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo, align 1
store i8* %a, i8** %a.addr, align 8
@@ -273,6 +298,10 @@ entry:
; DARWIN-X64-LABEL: test2d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test2d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo, align 1
store i8* %a, i8** %a.addr, align 8
@@ -306,6 +335,10 @@ entry:
; DARWIN-X64-LABEL: test3a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test3a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%buf = alloca [4 x i8], align 1
store i8* %a, i8** %a.addr, align 8
@@ -338,6 +371,10 @@ entry:
; DARWIN-X64-LABEL: test3b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test3b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%buf = alloca [4 x i8], align 1
store i8* %a, i8** %a.addr, align 8
@@ -370,6 +407,10 @@ entry:
; DARWIN-X64-LABEL: test3c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test3c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [4 x i8], align 1
store i8* %a, i8** %a.addr, align 8
@@ -402,6 +443,10 @@ entry:
; DARWIN-X64-LABEL: test3d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test3d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [4 x i8], align 1
store i8* %a, i8** %a.addr, align 8
@@ -433,6 +478,10 @@ entry:
; DARWIN-X64-LABEL: test4a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test4a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo.0, align 1
store i8* %a, i8** %a.addr, align 8
@@ -467,6 +516,10 @@ entry:
; DARWIN-X64-LABEL: test4b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test4b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo.0, align 1
store i8* %a, i8** %a.addr, align 8
@@ -501,6 +554,10 @@ entry:
; DARWIN-X64-LABEL: test4c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test4c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo.0, align 1
store i8* %a, i8** %a.addr, align 8
@@ -535,6 +592,10 @@ entry:
; DARWIN-X64-LABEL: test4d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test4d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo.0, align 1
store i8* %a, i8** %a.addr, align 8
@@ -568,6 +629,10 @@ entry:
; DARWIN-X64-LABEL: test5a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test5a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
store i8* %a, i8** %a.addr, align 8
%0 = load i8*, i8** %a.addr, align 8
@@ -596,6 +661,10 @@ entry:
; DARWIN-X64-LABEL: test5b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test5b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
store i8* %a, i8** %a.addr, align 8
%0 = load i8*, i8** %a.addr, align 8
@@ -624,6 +693,10 @@ entry:
; DARWIN-X64-LABEL: test5c:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test5c:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
store i8* %a, i8** %a.addr, align 8
%0 = load i8*, i8** %a.addr, align 8
@@ -652,6 +725,10 @@ entry:
; DARWIN-X64-LABEL: test5d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test5d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
store i8* %a, i8** %a.addr, align 8
%0 = load i8*, i8** %a.addr, align 8
@@ -679,6 +756,10 @@ entry:
; DARWIN-X64-LABEL: test6a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test6a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%retval = alloca i32, align 4
%a = alloca i32, align 4
%j = alloca i32*, align 8
@@ -711,6 +792,11 @@ entry:
; DARWIN-X64-LABEL: test6b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+
+; MSVC-I386-LABEL: test6b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%retval = alloca i32, align 4
%a = alloca i32, align 4
%j = alloca i32*, align 8
@@ -743,6 +829,10 @@ entry:
; DARWIN-X64-LABEL: test6c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test6c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%retval = alloca i32, align 4
%a = alloca i32, align 4
%j = alloca i32*, align 8
@@ -775,6 +865,10 @@ entry:
; DARWIN-X64-LABEL: test6d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test6d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%retval = alloca i32, align 4
%a = alloca i32, align 4
%j = alloca i32*, align 8
@@ -806,6 +900,10 @@ entry:
; DARWIN-X64-LABEL: test7a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test7a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%0 = ptrtoint i32* %a to i64
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
@@ -833,6 +931,10 @@ entry:
; DARWIN-X64-LABEL: test7b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test7b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%0 = ptrtoint i32* %a to i64
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
@@ -860,6 +962,10 @@ entry:
; DARWIN-X64-LABEL: test7c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test7c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%0 = ptrtoint i32* %a to i64
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
@@ -887,6 +993,10 @@ entry:
; DARWIN-X64-LABEL: test7d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test7d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%0 = ptrtoint i32* %a to i64
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
@@ -913,6 +1023,10 @@ entry:
; DARWIN-X64-LABEL: test8a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test8a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%b = alloca i32, align 4
call void @funcall(i32* %b)
ret void
@@ -939,6 +1053,10 @@ entry:
; DARWIN-X64-LABEL: test8b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test8b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%b = alloca i32, align 4
call void @funcall(i32* %b)
ret void
@@ -965,6 +1083,10 @@ entry:
; DARWIN-X64-LABEL: test8c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test8c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%b = alloca i32, align 4
call void @funcall(i32* %b)
ret void
@@ -991,6 +1113,10 @@ entry:
; DARWIN-X64-LABEL: test8d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test8d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%b = alloca i32, align 4
call void @funcall(i32* %b)
ret void
@@ -1016,6 +1142,10 @@ entry:
; DARWIN-X64-LABEL: test9a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test9a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1046,6 +1176,10 @@ entry:
; DARWIN-X64-LABEL: test9b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test9b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1076,6 +1210,10 @@ entry:
; DARWIN-X64-LABEL: test9c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test9c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1106,6 +1244,10 @@ entry:
; DARWIN-X64-LABEL: test9d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test9d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1135,6 +1277,10 @@ entry:
; DARWIN-X64-LABEL: test10a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test10a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1180,6 +1326,10 @@ entry:
; DARWIN-X64-LABEL: test10b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test10b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1225,6 +1375,10 @@ entry:
; DARWIN-X64-LABEL: test10c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test10c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1270,6 +1424,10 @@ entry:
; DARWIN-X64-LABEL: test10d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test10d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1314,6 +1472,10 @@ entry:
; DARWIN-X64-LABEL: test11a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test11a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1344,6 +1506,10 @@ entry:
; DARWIN-X64-LABEL: test11b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test11b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1374,6 +1540,10 @@ entry:
; DARWIN-X64-LABEL: test11c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test11c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1404,6 +1574,10 @@ entry:
; DARWIN-X64-LABEL: test11d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test11d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1433,6 +1607,10 @@ entry:
; DARWIN-X64-LABEL: test12a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test12a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1462,6 +1640,10 @@ entry:
; DARWIN-X64-LABEL: test12b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test12b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1490,6 +1672,10 @@ entry:
; DARWIN-X64-LABEL: test12c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test12c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1519,6 +1705,10 @@ entry:
; DARWIN-X64-LABEL: test12d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test12d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1547,6 +1737,10 @@ entry:
; DARWIN-X64-LABEL: test13a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test13a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
@@ -1574,6 +1768,10 @@ entry:
; DARWIN-X64-LABEL: test13b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test13b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
@@ -1601,6 +1799,10 @@ entry:
; DARWIN-X64-LABEL: test13c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test13c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
@@ -1628,6 +1830,10 @@ entry:
; DARWIN-X64-LABEL: test13d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test13d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
@@ -1654,6 +1860,10 @@ entry:
; DARWIN-X64-LABEL: test14a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test14a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
@@ -1681,6 +1891,10 @@ entry:
; DARWIN-X64-LABEL: test14b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test14b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
@@ -1708,6 +1922,10 @@ entry:
; DARWIN-X64-LABEL: test14c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test14c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
@@ -1735,6 +1953,10 @@ entry:
; DARWIN-X64-LABEL: test14d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test14d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
@@ -1762,6 +1984,10 @@ entry:
; DARWIN-X64-LABEL: test15a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test15a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%b = alloca float*, align 8
store i32 0, i32* %a, align 4
@@ -1794,6 +2020,10 @@ entry:
; DARWIN-X64-LABEL: test15b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test15b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%b = alloca float*, align 8
store i32 0, i32* %a, align 4
@@ -1826,6 +2056,10 @@ entry:
; DARWIN-X64-LABEL: test15c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test15c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%b = alloca float*, align 8
store i32 0, i32* %a, align 4
@@ -1858,6 +2092,10 @@ entry:
; DARWIN-X64-LABEL: test15d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test15d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%b = alloca float*, align 8
store i32 0, i32* %a, align 4
@@ -1889,6 +2127,10 @@ entry:
; DARWIN-X64-LABEL: test16a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test16a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
@@ -1918,6 +2160,10 @@ entry:
; DARWIN-X64-LABEL: test16b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test16b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
@@ -1947,6 +2193,10 @@ entry:
; DARWIN-X64-LABEL: test16c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test16c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
@@ -1976,6 +2226,10 @@ entry:
; DARWIN-X64-LABEL: test16d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test16d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
@@ -2003,6 +2257,10 @@ entry:
; DARWIN-X64-LABEL: test17a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test17a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
@@ -2031,6 +2289,10 @@ entry:
; DARWIN-X64-LABEL: test17b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test17b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
@@ -2059,6 +2321,10 @@ entry:
; DARWIN-X64-LABEL: test17c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test17c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
@@ -2087,6 +2353,10 @@ entry:
; DARWIN-X64-LABEL: test17d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test17d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
@@ -2114,6 +2384,10 @@ entry:
; DARWIN-X64-LABEL: test18a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test18a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2151,6 +2425,10 @@ entry:
; DARWIN-X64-LABEL: test18b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test18b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2188,6 +2466,10 @@ entry:
; DARWIN-X64-LABEL: test18c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test18c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2225,6 +2507,10 @@ entry:
; DARWIN-X64-LABEL: test18d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test18d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2261,6 +2547,10 @@ entry:
; DARWIN-X64-LABEL: test19a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test19a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2301,6 +2591,10 @@ entry:
; DARWIN-X64-LABEL: test19b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test19b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2341,6 +2635,10 @@ entry:
; DARWIN-X64-LABEL: test19c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test19c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2385,6 +2683,10 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
; DARWIN-X64-NOT: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test19d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2423,6 +2725,10 @@ entry:
; DARWIN-X64-LABEL: test20a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test20a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32*, align 8
%b = alloca i32**, align 8
%call = call i32* @getp()
@@ -2454,6 +2760,10 @@ entry:
; DARWIN-X64-LABEL: test20b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test20b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32*, align 8
%b = alloca i32**, align 8
%call = call i32* @getp()
@@ -2485,6 +2795,10 @@ entry:
; DARWIN-X64-LABEL: test20c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test20c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32*, align 8
%b = alloca i32**, align 8
%call = call i32* @getp()
@@ -2516,6 +2830,10 @@ entry:
; DARWIN-X64-LABEL: test20d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test20d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32*, align 8
%b = alloca i32**, align 8
%call = call i32* @getp()
@@ -2546,6 +2864,10 @@ entry:
; DARWIN-X64-LABEL: test21a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test21a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32*, align 8
%b = alloca float**, align 8
%call = call i32* @getp()
@@ -2578,6 +2900,10 @@ entry:
; DARWIN-X64-LABEL: test21b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test21b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32*, align 8
%b = alloca float**, align 8
%call = call i32* @getp()
@@ -2610,6 +2936,10 @@ entry:
; DARWIN-X64-LABEL: test21c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test21c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32*, align 8
%b = alloca float**, align 8
%call = call i32* @getp()
@@ -2642,6 +2972,10 @@ entry:
; DARWIN-X64-LABEL: test21d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test21d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32*, align 8
%b = alloca float**, align 8
%call = call i32* @getp()
@@ -2673,6 +3007,10 @@ entry:
; DARWIN-X64-LABEL: test22a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test22a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca %class.A, align 1
%array = getelementptr inbounds %class.A, %class.A* %a, i32 0, i32 0
%arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %array, i32 0, i64 0
@@ -2701,6 +3039,10 @@ entry:
; DARWIN-X64-LABEL: test22b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test22b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca %class.A, align 1
%array = getelementptr inbounds %class.A, %class.A* %a, i32 0, i32 0
%arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %array, i32 0, i64 0
@@ -2729,6 +3071,10 @@ entry:
; DARWIN-X64-LABEL: test22c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test22c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca %class.A, align 1
%array = getelementptr inbounds %class.A, %class.A* %a, i32 0, i32 0
%arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %array, i32 0, i64 0
@@ -2757,6 +3103,10 @@ entry:
; DARWIN-X64-LABEL: test22d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test22d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca %class.A, align 1
%array = getelementptr inbounds %class.A, %class.A* %a, i32 0, i32 0
%arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %array, i32 0, i64 0
@@ -2784,6 +3134,10 @@ entry:
; DARWIN-X64-LABEL: test23a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test23a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca %struct.deep, align 1
%b = getelementptr inbounds %struct.deep, %struct.deep* %x, i32 0, i32 0
%c = bitcast %union.anon* %b to %struct.anon*
@@ -2816,6 +3170,10 @@ entry:
; DARWIN-X64-LABEL: test23b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test23b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca %struct.deep, align 1
%b = getelementptr inbounds %struct.deep, %struct.deep* %x, i32 0, i32 0
%c = bitcast %union.anon* %b to %struct.anon*
@@ -2848,6 +3206,10 @@ entry:
; DARWIN-X64-LABEL: test23c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test23c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca %struct.deep, align 1
%b = getelementptr inbounds %struct.deep, %struct.deep* %x, i32 0, i32 0
%c = bitcast %union.anon* %b to %struct.anon*
@@ -2880,6 +3242,10 @@ entry:
; DARWIN-X64-LABEL: test23d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test23d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca %struct.deep, align 1
%b = getelementptr inbounds %struct.deep, %struct.deep* %x, i32 0, i32 0
%c = bitcast %union.anon* %b to %struct.anon*
@@ -2911,6 +3277,10 @@ entry:
; DARWIN-X64-LABEL: test24a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test24a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%n.addr = alloca i32, align 4
%a = alloca i32*, align 8
store i32 %n, i32* %n.addr, align 4
@@ -2943,6 +3313,10 @@ entry:
; DARWIN-X64-LABEL: test24b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test24b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%n.addr = alloca i32, align 4
%a = alloca i32*, align 8
store i32 %n, i32* %n.addr, align 4
@@ -2975,6 +3349,10 @@ entry:
; DARWIN-X64-LABEL: test24c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test24c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%n.addr = alloca i32, align 4
%a = alloca i32*, align 8
store i32 %n, i32* %n.addr, align 4
@@ -3007,6 +3385,10 @@ entry:
; DARWIN-X64-LABEL: test24d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test24d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%n.addr = alloca i32, align 4
%a = alloca i32*, align 8
store i32 %n, i32* %n.addr, align 4
@@ -3038,6 +3420,10 @@ entry:
; DARWIN-X64-LABEL: test25a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test25a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca [4 x i32], align 16
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 0
%0 = load i32, i32* %arrayidx, align 4
@@ -3065,6 +3451,10 @@ entry:
; DARWIN-X64-LABEL: test25b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test25b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca [4 x i32], align 16
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 0
%0 = load i32, i32* %arrayidx, align 4
@@ -3092,6 +3482,10 @@ entry:
; DARWIN-X64-LABEL: test25c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test25c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca [4 x i32], align 16
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 0
%0 = load i32, i32* %arrayidx, align 4
@@ -3119,6 +3513,10 @@ entry:
; DARWIN-X64-LABEL: test25d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test25d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca [4 x i32], align 16
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 0
%0 = load i32, i32* %arrayidx, align 4
@@ -3148,6 +3546,10 @@ entry:
; DARWIN-X64-LABEL: test26:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test26:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.nest, align 4
%b = getelementptr inbounds %struct.nest, %struct.nest* %c, i32 0, i32 1
%_a = getelementptr inbounds %struct.pair, %struct.pair* %b, i32 0, i32 0
@@ -3180,6 +3582,10 @@ bb:
; DARWIN-X64-LABEL: test27:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test27:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%tmp = alloca %struct.small*, align 8
%tmp1 = call i32 (...) @dummy(%struct.small** %tmp)
%tmp2 = load %struct.small*, %struct.small** %tmp, align 8
@@ -3233,6 +3639,10 @@ entry:
; DARWIN-X64-LABEL: test28a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test28a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%test = alloca [32 x i8], align 16
%arraydecay = getelementptr inbounds [32 x i8], [32 x i8]* %test, i32 0, i32 0
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
@@ -3259,6 +3669,10 @@ entry:
; DARWIN-X64-LABEL: test28b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test28b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%test = alloca [33 x i8], align 16
%arraydecay = getelementptr inbounds [33 x i8], [33 x i8]* %test, i32 0, i32 0
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
@@ -3285,6 +3699,10 @@ entry:
; DARWIN-X64-LABEL: test29a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test29a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%test = alloca [4 x i8], align 1
%arraydecay = getelementptr inbounds [4 x i8], [4 x i8]* %test, i32 0, i32 0
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
@@ -3311,6 +3729,10 @@ entry:
; DARWIN-X64-LABEL: test29b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test29b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%test = alloca [5 x i8], align 1
%arraydecay = getelementptr inbounds [5 x i8], [5 x i8]* %test, i32 0, i32 0
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
@@ -3338,6 +3760,10 @@ entry:
; DARWIN-X64-LABEL: test30a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test30a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%test = alloca %struct.small_char, align 4
%test.coerce = alloca { i64, i8 }
%0 = bitcast { i64, i8 }* %test.coerce to i8*
@@ -3372,6 +3798,10 @@ entry:
; DARWIN-X64-LABEL: test30b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test30b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%test = alloca %struct.small_char, align 4
%test.coerce = alloca { i64, i8 }
%0 = bitcast { i64, i8 }* %test.coerce to i8*
@@ -3406,6 +3836,10 @@ entry:
; DARWIN-X64-LABEL: test31a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test31a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%test = alloca i8*, align 8
%0 = alloca i8, i64 4
store i8* %0, i8** %test, align 8
@@ -3434,6 +3868,10 @@ entry:
; DARWIN-X64-LABEL: test31b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test31b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%test = alloca i8*, align 8
%0 = alloca i8, i64 5
store i8* %0, i8** %test, align 8
@@ -3442,6 +3880,17 @@ entry:
ret i32 %call
}
+define void @__stack_chk_fail() #1 !dbg !6 {
+entry:
+ ret void
+}
+
+define void @test32() #1 !dbg !7 {
+entry:
+ %0 = alloca [5 x i8], align 1
+ ret void
+}
+
declare double @testi_aux()
declare i8* @strcpy(i8*, i8*)
declare i32 @printf(i8*, ...)
@@ -3461,3 +3910,16 @@ attributes #2 = { sspreq }
attributes #3 = { ssp "stack-protector-buffer-size"="33" }
attributes #4 = { ssp "stack-protector-buffer-size"="5" }
attributes #5 = { ssp "stack-protector-buffer-size"="6" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version x.y.z"}
+!6 = distinct !DISubprogram(name: "__stack_chk_fail", scope: !1, unit: !0)
+!7 = distinct !DISubprogram(name: "foo", scope: !1, unit: !0)
diff --git a/test/CodeGen/X86/stack_guard_remat.ll b/test/CodeGen/X86/stack_guard_remat.ll
index 90ac2cc601fa..d38c68a8a5bb 100644
--- a/test/CodeGen/X86/stack_guard_remat.ll
+++ b/test/CodeGen/X86/stack_guard_remat.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
;CHECK: foo2
;CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), [[R0:%[a-z0-9]+]]
diff --git a/test/CodeGen/X86/stackguard-internal.ll b/test/CodeGen/X86/stackguard-internal.ll
new file mode 100644
index 000000000000..328e04b9a718
--- /dev/null
+++ b/test/CodeGen/X86/stackguard-internal.ll
@@ -0,0 +1,15 @@
+; Check that the backend doesn't crash.
+; RUN: llc -mtriple=x86_64-pc-freebsd %s -o - | FileCheck %s
+
+@__stack_chk_guard = internal global [8 x i64] zeroinitializer, align 16
+
+define void @f() sspstrong {
+ %tbl = alloca [4 x i64], align 16
+ ret void
+}
+
+; CHECK: movq __stack_chk_guard(%rip), %rax
+; CHECK: movq __stack_chk_guard(%rip), %rax
+; CHECK: .type __stack_chk_guard,@object
+; CHECK: .local __stack_chk_guard
+; CHECK: .comm __stack_chk_guard,64,16
diff --git a/test/CodeGen/X86/stackmap-frame-setup.ll b/test/CodeGen/X86/stackmap-frame-setup.ll
index 076e2482f8ba..b83a8d61f6a2 100644
--- a/test/CodeGen/X86/stackmap-frame-setup.ll
+++ b/test/CodeGen/X86/stackmap-frame-setup.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=x86_64-apple-darwin -mcpu=corei7 -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
-; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
+; RUN: llc -o - -verify-machineinstrs -mtriple=x86_64-apple-darwin -mcpu=corei7 -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
+; RUN: llc -o - -verify-machineinstrs -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
define void @caller_meta_leaf() {
entry:
diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll
index a38b9209a1cf..0143a4e0fbc8 100644
--- a/test/CodeGen/X86/stackmap-large-constants.ll
+++ b/test/CodeGen/X86/stackmap-large-constants.ll
@@ -46,7 +46,7 @@
; NumLiveOuts
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
declare void @llvm.experimental.stackmap(i64, i32, ...)
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 599b6265abfa..d2dd263a6174 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -34,7 +34,7 @@ entry:
; Num LiveOut Entries: 0
; CHECK-NEXT: .short 0
; Align
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; StackMap 1 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
@@ -49,7 +49,7 @@ entry:
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 12, i8* null, i32 0)
%a2 = call i64 asm sideeffect "", "={r8}"() nounwind
%a3 = call i8 asm sideeffect "", "={ah}"() nounwind
@@ -65,7 +65,7 @@ entry:
; Num LiveOut Entries: 0
; CHECK-NEXT: .short 0
; Align
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; StackMap 2 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
@@ -96,7 +96,7 @@ entry:
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 2, i32 12, i8* null, i32 0)
call void asm sideeffect "", "{r8},{ah},{ymm0},{ymm1}"(i64 %a2, i8 %a3, <4 x double> %a4, <4 x double> %a5) nounwind
@@ -109,7 +109,7 @@ entry:
; Num LiveOut Entries: 0
; CHECK-NEXT: .short 0
; Align
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; StackMap 3 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
@@ -128,7 +128,7 @@ entry:
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 12, i8* null, i32 0)
call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
ret void
@@ -146,7 +146,7 @@ entry:
; Num LiveOut Entries: 0
; PATCH-NEXT: .short 0
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
; StackMap 5 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_mixed_liveness
@@ -165,7 +165,7 @@ entry:
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 5)
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 5, i32 0, i8* null, i32 0)
call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
diff --git a/test/CodeGen/X86/statepoint-allocas.ll b/test/CodeGen/X86/statepoint-allocas.ll
index fa2621e7d2fe..040ab614d0a8 100644
--- a/test/CodeGen/X86/statepoint-allocas.ll
+++ b/test/CodeGen/X86/statepoint-allocas.ll
@@ -96,7 +96,7 @@ declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i3
; No Padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
; The Deopt one
; CHECK: .long .Ltmp3-test2
@@ -126,5 +126,5 @@ declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i3
; No Padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 1d38b2facc73..3e8b8ca49f1d 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -41,7 +41,7 @@ exceptional_return:
; CHECK: .long .Ltmp{{[0-9]+}}-.Ltmp{{[0-9]+}}
; CHECK: .long .Ltmp{{[0-9]+}}-.Lfunc_begin{{[0-9]+}}
; CHECK: .byte 0
-; CHECK: .align 4
+; CHECK: .p2align 4
define i64 addrspace(1)* @test_result(i64 addrspace(1)* %obj,
i64 addrspace(1)* %obj1)
@@ -71,7 +71,7 @@ exceptional_return:
; CHECK: .long .Ltmp{{[0-9]+}}-.Ltmp{{[0-9]+}}
; CHECK: .long .Ltmp{{[0-9]+}}-.Lfunc_begin{{[0-9]+}}
; CHECK: .byte 0
-; CHECK: .align 4
+; CHECK: .p2align 4
define i64 addrspace(1)* @test_same_val(i1 %cond, i64 addrspace(1)* %val1, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3)
gc "statepoint-example" personality i32 ()* @"personality_function" {
diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll
index d4784212810f..d4bc7d47f669 100644
--- a/test/CodeGen/X86/statepoint-stack-usage.ll
+++ b/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll
index 4f8b2ce6efd9..2b1357a1179a 100644
--- a/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ b/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
-; RUN: llc < %s -mtriple="x86_64-pc-unknown-elf" | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-unknown-elf" | FileCheck %s
; This test is a sanity check to ensure statepoints are generating StackMap
; sections correctly. This is not intended to be a rigorous test of the
@@ -168,7 +168,7 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
; No Padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
;
; test_derived_arg
@@ -235,7 +235,7 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
; No Padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
; Records for the test_id function:
@@ -275,5 +275,5 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
; No padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
diff --git a/test/CodeGen/X86/statepoint-uniqueing.ll b/test/CodeGen/X86/statepoint-uniqueing.ll
new file mode 100644
index 000000000000..e791bc6b2333
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-uniqueing.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s | FileCheck %s
+
+; Checks for a crash we had when two gc.relocate calls would
+; relocate identical values.
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare void @f()
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32) #3
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) #3
+
+define void @test_gcrelocate_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test_gcrelocate_uniqueing
+ %tok = tail call token (i64, i32, void ()*, i32, i32, ...)
+ @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 undef, i32 addrspace(1)* %ptr, i32 addrspace(1)* %ptr)
+ %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 9, i32 9)
+ %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 10, i32 10)
+ ret void
+}
+
+define void @test_gcptr_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test_gcptr_uniqueing
+ %ptr2 = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
+ %tok = tail call token (i64, i32, void ()*, i32, i32, ...)
+ @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 undef, i32 addrspace(1)* %ptr, i8 addrspace(1)* %ptr2)
+ %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 9, i32 9)
+ %b = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 10, i32 10)
+ ret void
+}
diff --git a/test/CodeGen/X86/statepoint-vector-bad-spill.ll b/test/CodeGen/X86/statepoint-vector-bad-spill.ll
new file mode 100644
index 000000000000..848988589cb0
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-vector-bad-spill.ll
@@ -0,0 +1,39 @@
+; RUN: llc -O3 < %s | FileCheck %s
+
+; This is checking for a crash.
+
+target triple = "x86_64-pc-linux-gnu"
+
+define <2 x i8 addrspace(1)*> @test0(i8 addrspace(1)* %el, <2 x i8 addrspace(1)*>* %vec_ptr) gc "statepoint-example" {
+; CHECK-LABEL: test0:
+
+entry:
+ %tok0 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %el)
+ %el.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok0, i32 7, i32 7)
+
+ %obj.pre = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*>* %vec_ptr
+ %obj = insertelement <2 x i8 addrspace(1)*> %obj.pre, i8 addrspace(1)* %el.relocated, i32 0 ; No real objective here, except to use %el
+
+ %tok1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i8 addrspace(1)*> %obj)
+ %obj.relocated = call <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %tok1, i32 7, i32 7)
+ ret <2 x i8 addrspace(1)*> %obj.relocated
+}
+
+define i8 addrspace(1)* @test1(<2 x i8 addrspace(1)*> %obj) gc "statepoint-example" {
+; CHECK-LABEL: test1:
+
+entry:
+ %tok1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i8 addrspace(1)*> %obj)
+ %obj.relocated = call <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %tok1, i32 7, i32 7)
+
+ %el = extractelement <2 x i8 addrspace(1)*> %obj.relocated, i32 0
+ %tok0 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %el)
+ %el.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok0, i32 7, i32 7)
+ ret i8 addrspace(1)* %el.relocated
+}
+
+declare void @do_safepoint()
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32)
+declare <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token, i32, i32)
diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll
index 9d80e9217b49..21e7b204a070 100644
--- a/test/CodeGen/X86/statepoint-vector.ll
+++ b/test/CodeGen/X86/statepoint-vector.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=core-avx -debug-only=stackmaps < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mcpu=nehalem -debug-only=stackmaps < %s | FileCheck %s
; REQUIRES: asserts
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/CodeGen/X86/stdarg.ll b/test/CodeGen/X86/stdarg.ll
index 42cbcb1008d3..7b4f4e845fce 100644
--- a/test/CodeGen/X86/stdarg.ll
+++ b/test/CodeGen/X86/stdarg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s -mtriple=x86_64-linux | FileCheck %s
%struct.__va_list_tag = type { i32, i32, i8*, i8* }
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 6c1c56e43a4c..16f152d169d3 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -1,6 +1,7 @@
; rdar://7860110
; RUN: llc -asm-verbose=false < %s | FileCheck %s -check-prefix=X64
-; RUN: llc -march=x86 -asm-verbose=false < %s | FileCheck %s -check-prefix=X32
+; RUN: llc -march=x86 -asm-verbose=false -fixup-byte-word-insts=1 < %s | FileCheck %s -check-prefix=X32 -check-prefix=X32-BWON
+; RUN: llc -march=x86 -asm-verbose=false -fixup-byte-word-insts=0 < %s | FileCheck %s -check-prefix=X32 -check-prefix=X32-BWOFF
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.2"
@@ -50,7 +51,8 @@ entry:
; X64: movw %si, (%rdi)
; X32-LABEL: test3:
-; X32: movw 8(%esp), %ax
+; X32-BWON: movzwl 8(%esp), %eax
+; X32-BWOFF: movw 8(%esp), %ax
; X32: movw %ax, (%{{.*}})
}
@@ -67,7 +69,8 @@ entry:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test4:
-; X32: movw 8(%esp), %[[REG:[abcd]]]x
+; X32-BWON: movzwl 8(%esp), %e[[REG:[abcd]]]x
+; X32-BWOFF: movw 8(%esp), %[[REG:[abcd]]]x
; X32: movw %[[REG]]x, 2(%{{.*}})
}
@@ -84,7 +87,8 @@ entry:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test5:
-; X32: movw 8(%esp), %[[REG:[abcd]]]x
+; X32-BWON: movzwl 8(%esp), %e[[REG:[abcd]]]x
+; X32-BWOFF: movw 8(%esp), %[[REG:[abcd]]]x
; X32: movw %[[REG]]x, 2(%{{.*}})
}
diff --git a/test/CodeGen/X86/store-zero-and-minus-one.ll b/test/CodeGen/X86/store-zero-and-minus-one.ll
new file mode 100644
index 000000000000..14790018e050
--- /dev/null
+++ b/test/CodeGen/X86/store-zero-and-minus-one.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK32 --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK64 --check-prefix=CHECK
+
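+; A brief note on what the CHECK lines below exercise: under optsize the
+; stores of 0 and -1 are expected to stay plain mov instructions, while under
+; minsize they should be shrunk to and/or with an immediate for a smaller
+; encoding.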
+define void @zero_optsize(i32* %p) optsize {
+entry:
+ store i32 0, i32* %p
+ ret void
+
+; CHECK-LABEL: zero_optsize:
+; CHECK: movl $0
+; CHECK: ret
+}
+
+define void @minus_one_optsize(i32* %p) optsize {
+entry:
+ store i32 -1, i32* %p
+ ret void
+
+; CHECK-LABEL: minus_one_optsize:
+; CHECK: movl $-1
+; CHECK: ret
+}
+
+
+define void @zero_64(i64* %p) minsize {
+entry:
+ store i64 0, i64* %p
+ ret void
+
+; CHECK-LABEL: zero_64:
+; CHECK32: andl $0
+; CHECK32: andl $0
+; CHECK64: andq $0
+; CHECK: ret
+}
+
+define void @zero_32(i32* %p) minsize {
+entry:
+ store i32 0, i32* %p
+ ret void
+
+; CHECK-LABEL: zero_32:
+; CHECK: andl $0
+; CHECK: ret
+}
+
+define void @zero_16(i16* %p) minsize {
+entry:
+ store i16 0, i16* %p
+ ret void
+
+; CHECK-LABEL: zero_16:
+; CHECK: andw $0
+; CHECK: ret
+}
+
+
+define void @minus_one_64(i64* %p) minsize {
+entry:
+ store i64 -1, i64* %p
+ ret void
+
+; CHECK-LABEL: minus_one_64:
+; CHECK32: orl $-1
+; CHECK32: orl $-1
+; CHECK64: orq $-1
+; CHECK: ret
+}
+
+define void @minus_one_32(i32* %p) minsize {
+entry:
+ store i32 -1, i32* %p
+ ret void
+
+; CHECK-LABEL: minus_one_32:
+; CHECK: orl $-1
+; CHECK: ret
+}
+
+define void @minus_one_16(i16* %p) minsize {
+entry:
+ store i16 -1, i16* %p
+ ret void
+
+; CHECK-LABEL: minus_one_16:
+; CHECK: orw $-1
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/swift-return.ll b/test/CodeGen/X86/swift-return.ll
new file mode 100644
index 000000000000..cd028d0c16ad
--- /dev/null
+++ b/test/CodeGen/X86/swift-return.ll
@@ -0,0 +1,206 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-unknown-unknown -O0 | FileCheck --check-prefix=CHECK-O0 %s
+
+@var = global i32 0
+
+; Test how LLVM handles a return type of {i16, i8}. The return value will be
+; passed in %eax and %dl.
+; CHECK-LABEL: test:
+; CHECK: movl %edi
+; CHECK: callq gen
+; CHECK: movsbl %dl
+; CHECK: addl %{{.*}}, %eax
+; CHECK-O0-LABEL: test
+; CHECK-O0: movl %edi
+; CHECK-O0: callq gen
+; CHECK-O0: movswl %ax
+; CHECK-O0: movsbl %dl
+; CHECK-O0: addl
+; CHECK-O0: movw %{{.*}}, %ax
+define i16 @test(i32 %key) {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i16, i8 } @gen(i32 %0)
+ %v3 = extractvalue { i16, i8 } %call, 0
+ %v1 = sext i16 %v3 to i32
+ %v5 = extractvalue { i16, i8 } %call, 1
+ %v2 = sext i8 %v5 to i32
+ %add = add nsw i32 %v1, %v2
+ %conv = trunc i32 %add to i16
+ ret i16 %conv
+}
+
+declare swiftcc { i16, i8 } @gen(i32)
+
+; If we can't pass every return value in registers, we will pass everything
+; in memory. The caller provides space for the return value and passes
+; the address in %rax. The first input argument will be in %rdi.
+; CHECK-LABEL: test2:
+; CHECK: leaq (%rsp), %rax
+; CHECK: callq gen2
+; CHECK: movl (%rsp)
+; CHECK-DAG: addl 4(%rsp)
+; CHECK-DAG: addl 8(%rsp)
+; CHECK-DAG: addl 12(%rsp)
+; CHECK-DAG: addl 16(%rsp)
+; CHECK-O0-LABEL: test2:
+; CHECK-O0-DAG: leaq (%rsp), %rax
+; CHECK-O0: callq gen2
+; CHECK-O0-DAG: movl (%rsp)
+; CHECK-O0-DAG: movl 4(%rsp)
+; CHECK-O0-DAG: movl 8(%rsp)
+; CHECK-O0-DAG: movl 12(%rsp)
+; CHECK-O0-DAG: movl 16(%rsp)
+; CHECK-O0: addl
+; CHECK-O0: addl
+; CHECK-O0: addl
+; CHECK-O0: addl
+; CHECK-O0: movl %{{.*}}, %eax
+define i32 @test2(i32 %key) #0 {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i32, i32, i32, i32, i32 } @gen2(i32 %0)
+
+ %v3 = extractvalue { i32, i32, i32, i32, i32 } %call, 0
+ %v5 = extractvalue { i32, i32, i32, i32, i32 } %call, 1
+ %v6 = extractvalue { i32, i32, i32, i32, i32 } %call, 2
+ %v7 = extractvalue { i32, i32, i32, i32, i32 } %call, 3
+ %v8 = extractvalue { i32, i32, i32, i32, i32 } %call, 4
+
+ %add = add nsw i32 %v3, %v5
+ %add1 = add nsw i32 %add, %v6
+ %add2 = add nsw i32 %add1, %v7
+ %add3 = add nsw i32 %add2, %v8
+ ret i32 %add3
+}
+
+; The address of the return value is passed in %rax.
+; On return, we don't keep the address in %rax.
+; CHECK-LABEL: gen2:
+; CHECK: movl %edi, 16(%rax)
+; CHECK: movl %edi, 12(%rax)
+; CHECK: movl %edi, 8(%rax)
+; CHECK: movl %edi, 4(%rax)
+; CHECK: movl %edi, (%rax)
+; CHECK-O0-LABEL: gen2:
+; CHECK-O0-DAG: movl %edi, 16(%rax)
+; CHECK-O0-DAG: movl %edi, 12(%rax)
+; CHECK-O0-DAG: movl %edi, 8(%rax)
+; CHECK-O0-DAG: movl %edi, 4(%rax)
+; CHECK-O0-DAG: movl %edi, (%rax)
+define swiftcc { i32, i32, i32, i32, i32 } @gen2(i32 %key) {
+ %Y = insertvalue { i32, i32, i32, i32, i32 } undef, i32 %key, 0
+ %Z = insertvalue { i32, i32, i32, i32, i32 } %Y, i32 %key, 1
+ %Z2 = insertvalue { i32, i32, i32, i32, i32 } %Z, i32 %key, 2
+ %Z3 = insertvalue { i32, i32, i32, i32, i32 } %Z2, i32 %key, 3
+ %Z4 = insertvalue { i32, i32, i32, i32, i32 } %Z3, i32 %key, 4
+ ret { i32, i32, i32, i32, i32 } %Z4
+}
+
+; The return value {i32, i32, i32, i32} will be returned via registers %eax,
+; %edx, %ecx, %r8d.
+; CHECK-LABEL: test3:
+; CHECK: callq gen3
+; CHECK: addl %edx, %eax
+; CHECK: addl %ecx, %eax
+; CHECK: addl %r8d, %eax
+; CHECK-O0-LABEL: test3:
+; CHECK-O0: callq gen3
+; CHECK-O0: addl %edx, %eax
+; CHECK-O0: addl %ecx, %eax
+; CHECK-O0: addl %r8d, %eax
+define i32 @test3(i32 %key) #0 {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i32, i32, i32, i32 } @gen3(i32 %0)
+
+ %v3 = extractvalue { i32, i32, i32, i32 } %call, 0
+ %v5 = extractvalue { i32, i32, i32, i32 } %call, 1
+ %v6 = extractvalue { i32, i32, i32, i32 } %call, 2
+ %v7 = extractvalue { i32, i32, i32, i32 } %call, 3
+
+ %add = add nsw i32 %v3, %v5
+ %add1 = add nsw i32 %add, %v6
+ %add2 = add nsw i32 %add1, %v7
+ ret i32 %add2
+}
+
+declare swiftcc { i32, i32, i32, i32 } @gen3(i32 %key)
+
+; The return value {float, float, float, float} will be returned via registers
+; %xmm0, %xmm1, %xmm2, %xmm3.
+; CHECK-LABEL: test4:
+; CHECK: callq gen4
+; CHECK: addss %xmm1, %xmm0
+; CHECK: addss %xmm2, %xmm0
+; CHECK: addss %xmm3, %xmm0
+; CHECK-O0-LABEL: test4:
+; CHECK-O0: callq gen4
+; CHECK-O0: addss %xmm1, %xmm0
+; CHECK-O0: addss %xmm2, %xmm0
+; CHECK-O0: addss %xmm3, %xmm0
+define float @test4(float %key) #0 {
+entry:
+ %key.addr = alloca float, align 4
+ store float %key, float* %key.addr, align 4
+ %0 = load float, float* %key.addr, align 4
+ %call = call swiftcc { float, float, float, float } @gen4(float %0)
+
+ %v3 = extractvalue { float, float, float, float } %call, 0
+ %v5 = extractvalue { float, float, float, float } %call, 1
+ %v6 = extractvalue { float, float, float, float } %call, 2
+ %v7 = extractvalue { float, float, float, float } %call, 3
+
+ %add = fadd float %v3, %v5
+ %add1 = fadd float %add, %v6
+ %add2 = fadd float %add1, %v7
+ ret float %add2
+}
+
+declare swiftcc { float, float, float, float } @gen4(float %key)
+
+; CHECK-LABEL: consume_i1_ret:
+; CHECK: callq produce_i1_ret
+; CHECK: andb $1, %al
+; CHECK: andb $1, %dl
+; CHECK: andb $1, %cl
+; CHECK: andb $1, %r8b
+; CHECK-O0-LABEL: consume_i1_ret:
+; CHECK-O0: callq produce_i1_ret
+; CHECK-O0: andb $1, %al
+; CHECK-O0: andb $1, %dl
+; CHECK-O0: andb $1, %cl
+; CHECK-O0: andb $1, %r8b
+define void @consume_i1_ret() {
+ %call = call swiftcc { i1, i1, i1, i1 } @produce_i1_ret()
+ %v3 = extractvalue { i1, i1, i1, i1 } %call, 0
+ %v5 = extractvalue { i1, i1, i1, i1 } %call, 1
+ %v6 = extractvalue { i1, i1, i1, i1 } %call, 2
+ %v7 = extractvalue { i1, i1, i1, i1 } %call, 3
+ %val = zext i1 %v3 to i32
+ store i32 %val, i32* @var
+ %val2 = zext i1 %v5 to i32
+ store i32 %val2, i32* @var
+ %val3 = zext i1 %v6 to i32
+ store i32 %val3, i32* @var
+ %val4 = zext i1 %v7 to i32
+ store i32 %val4, i32* @var
+ ret void
+}
+
+declare swiftcc { i1, i1, i1, i1 } @produce_i1_ret()
+
+; CHECK-LABEL: foo:
+; CHECK: movq %rdi, (%rax)
+; CHECK-O0-LABEL: foo:
+; CHECK-O0: movq %rdi, (%rax)
+define swiftcc void @foo(i64* sret %agg.result, i64 %val) {
+ store i64 %val, i64* %agg.result
+ ret void
+}
diff --git a/test/CodeGen/X86/swifterror.ll b/test/CodeGen/X86/swifterror.ll
new file mode 100644
index 000000000000..d8db36b09c25
--- /dev/null
+++ b/test/CodeGen/X86/swifterror.ll
@@ -0,0 +1,359 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-APPLE %s
+; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-O0 %s
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+%swift_error = type {i64, i8}
+
+; This tests the basic usage of a swifterror parameter. "foo" is the function
+; that takes a swifterror parameter and "caller" is the caller of "foo".
+define float @foo(%swift_error** swifterror %error_ptr_ref) {
+; CHECK-APPLE-LABEL: foo:
+; CHECK-APPLE: movl $16, %edi
+; CHECK-APPLE: malloc
+; CHECK-APPLE: movb $1, 8(%rax)
+; CHECK-APPLE: movq %rax, %r12
+
+; CHECK-O0-LABEL: foo:
+; CHECK-O0: movl $16
+; CHECK-O0: malloc
+; CHECK-O0: movb $1, 8(%rax)
+; CHECK-O0: movq %{{.*}}, %r12
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+}
+
+; "caller" calls "foo" that takes a swifterror parameter.
+define float @caller(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller:
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12)
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; CHECK-O0-LABEL: caller:
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: callq {{.*}}foo
+; CHECK-O0: jne
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "caller2" is the caller of "foo", it calls "foo" inside a loop.
+define float @caller2(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller2:
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; CHECK-APPLE: ucomiss
+; CHECK-APPLE: jbe
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12)
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; CHECK-O0-LABEL: caller2:
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: callq {{.*}}foo
+; CHECK-O0: movq %r12, [[ID:%[a-z]+]]
+; CHECK-O0: cmpq $0, [[ID]]
+; CHECK-O0: jne
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ br label %bb_loop
+bb_loop:
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %cmp = fcmp ogt float %call, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "foo_if" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition.
+define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) {
+; CHECK-APPLE-LABEL: foo_if:
+; CHECK-APPLE: testl %edi, %edi
+; CHECK-APPLE: je
+; CHECK-APPLE: movl $16, %edi
+; CHECK-APPLE: malloc
+; CHECK-APPLE: movb $1, 8(%rax)
+; CHECK-APPLE: movq %rax, %r12
+; CHECK-APPLE-NOT: %r12
+; CHECK-APPLE: ret
+
+; CHECK-O0-LABEL: foo_if:
+; CHECK-O0: cmpl $0
+; spill to stack
+; CHECK-O0: movq %r12, {{.*}}(%rsp)
+; CHECK-O0: je
+; CHECK-O0: movl $16,
+; CHECK-O0: malloc
+; CHECK-O0: movq %rax, [[ID:%[a-z]+]]
+; CHECK-O0-DAG: movb $1, 8(%rax)
+; CHECK-O0-DAG: movq [[ID]], %r12
+; CHECK-O0: ret
+; reload from stack
+; CHECK-O0: movq {{.*}}(%rsp), %r12
+; CHECK-O0: ret
+entry:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %normal
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+
+normal:
+ ret float 0.0
+}
+
+; "foo_loop" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition inside a loop.
+define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float %cc2) {
+; CHECK-APPLE-LABEL: foo_loop:
+; CHECK-APPLE: movq %r12, %rax
+; CHECK-APPLE: testl
+; CHECK-APPLE: je
+; CHECK-APPLE: movl $16, %edi
+; CHECK-APPLE: malloc
+; CHECK-APPLE: movb $1, 8(%rax)
+; CHECK-APPLE: ucomiss
+; CHECK-APPLE: jbe
+; CHECK-APPLE: movq %rax, %r12
+; CHECK-APPLE: ret
+
+; CHECK-O0-LABEL: foo_loop:
+; spill to stack
+; CHECK-O0: movq %r12, {{.*}}(%rsp)
+; CHECK-O0: cmpl $0
+; CHECK-O0: je
+; CHECK-O0: movl $16,
+; CHECK-O0: malloc
+; CHECK-O0: movq %rax, [[ID:%[a-z]+]]
+; CHECK-O0: movb $1, 8([[ID]])
+; CHECK-O0: jbe
+; reload from stack
+; CHECK-O0: movq {{.*}}(%rsp), %r12
+; CHECK-O0: ret
+entry:
+ br label %bb_loop
+
+bb_loop:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %bb_cont
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ br label %bb_cont
+
+bb_cont:
+ %cmp = fcmp ogt float %cc2, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ ret float 0.0
+}
+
+%struct.S = type { i32, i32, i32, i32, i32, i32 }
+
+; "foo_sret" is a function that takes a swifterror parameter, it also has a sret
+; parameter.
+define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swifterror %error_ptr_ref) {
+; CHECK-APPLE-LABEL: foo_sret:
+; CHECK-APPLE: movq %rdi, %{{.*}}
+; CHECK-APPLE: movl $16, %edi
+; CHECK-APPLE: malloc
+; CHECK-APPLE: movb $1, 8(%rax)
+; CHECK-APPLE: movl %{{.*}}, 4(%{{.*}})
+; CHECK-APPLE: movq %rax, %r12
+; CHECK-APPLE: movq %{{.*}}, %rax
+; CHECK-APPLE-NOT: x19
+
+; CHECK-O0-LABEL: foo_sret:
+; CHECK-O0: movl $16,
+; spill sret to stack
+; CHECK-O0: movq %rdi,
+; CHECK-O0: movq {{.*}}, %rdi
+; CHECK-O0: malloc
+; CHECK-O0: movb $1, 8(%rax)
+; CHECK-O0: movl %{{.*}}, 4(%{{.*}})
+; CHECK-O0: movq %{{.*}}, %r12
+; reload sret from stack
+; CHECK-O0: movq {{.*}}(%rsp), %rax
+; CHECK-O0: ret
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ %v2 = getelementptr inbounds %struct.S, %struct.S* %agg.result, i32 0, i32 1
+ store i32 %val1, i32* %v2
+ ret void
+}
+
+; "caller3" calls "foo_sret" that takes a swifterror parameter.
+define float @caller3(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller3:
+; CHECK-APPLE: movl $1, %esi
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo_sret
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12),
+; CHECK-APPLE: movb %{{.*}},
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; CHECK-O0-LABEL: caller3:
+; CHECK-O0: xorl
+; CHECK-O0: movl {{.*}}, %r12d
+; CHECK-O0: movl $1, %esi
+; CHECK-O0: movq {{.*}}, %rdi
+; CHECK-O0: callq {{.*}}foo_sret
+; CHECK-O0: movq %r12,
+; CHECK-O0: cmpq $0
+; CHECK-O0: jne
+; Access part of the error object and save it to error_ref
+; CHECK-O0: movb 8(%{{.*}}),
+; CHECK-O0: movb %{{.*}},
+; reload from stack
+; CHECK-O0: movq {{.*}}(%rsp), %rdi
+; CHECK-O0: callq {{.*}}free
+entry:
+ %s = alloca %struct.S, align 8
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ call void @foo_sret(%struct.S* sret %s, i32 1, %swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; This is a caller with multiple swifterror values; it calls "foo" twice, each
+; time with a different swifterror value, from "alloca swifterror".
+define float @caller_with_multiple_swifterror_values(i8* %error_ref, i8* %error_ref2) {
+; CHECK-APPLE-LABEL: caller_with_multiple_swifterror_values:
+
+; The first swifterror value:
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12)
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; The second swifterror value:
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12)
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; CHECK-O0-LABEL: caller_with_multiple_swifterror_values:
+
+; The first swifterror value:
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: callq {{.*}}foo
+; CHECK-O0: jne
+
+; The second swifterror value:
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: callq {{.*}}foo
+; CHECK-O0: jne
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+
+ %error_ptr_ref2 = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref2
+ %call2 = call float @foo(%swift_error** swifterror %error_ptr_ref2)
+ %error_from_foo2 = load %swift_error*, %swift_error** %error_ptr_ref2
+ %had_error_from_foo2 = icmp ne %swift_error* %error_from_foo2, null
+ %bitcast2 = bitcast %swift_error* %error_from_foo2 to i8*
+ br i1 %had_error_from_foo2, label %handler2, label %cont2
+cont2:
+ %v2 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo2, i64 0, i32 1
+ %t2 = load i8, i8* %v2
+ store i8 %t2, i8* %error_ref2
+ br label %handler2
+handler2:
+ call void @free(i8* %bitcast2)
+
+ ret float 1.0
+}
diff --git a/test/CodeGen/X86/swiftself.ll b/test/CodeGen/X86/swiftself.ll
new file mode 100644
index 000000000000..c5e905945605
--- /dev/null
+++ b/test/CodeGen/X86/swiftself.ll
@@ -0,0 +1,62 @@
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s
+
+; Parameter with swiftself should be allocated to r13.
+; CHECK-LABEL: swiftself_param:
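+; The guard here is an oversized internal global; the checks below also verify
+; that it is loaded directly through its symbol and emitted as a local 64-byte
+; object via .comm.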
+; CHECK: movq %r13, %rax
+define i8 *@swiftself_param(i8* swiftself %addr0) {
+ ret i8 *%addr0
+}
+
+; Check that r13 is used to pass a swiftself argument.
+; CHECK-LABEL: call_swiftself:
+; CHECK: movq %rdi, %r13
+; CHECK: callq {{_?}}swiftself_param
+define i8 *@call_swiftself(i8* %arg) {
+ %res = call i8 *@swiftself_param(i8* swiftself %arg)
+ ret i8 *%res
+}
+
+; r13 should be saved by the callee even if used for swiftself
+; CHECK-LABEL: swiftself_clobber:
+; CHECK: pushq %r13
+; ...
+; CHECK: popq %r13
+define i8 *@swiftself_clobber(i8* swiftself %addr0) {
+ call void asm sideeffect "nop", "~{r13}"()
+ ret i8 *%addr0
+}
+
+; Demonstrate that we do not need any movs when calling multiple functions
+; with swiftself argument.
+; CHECK-LABEL: swiftself_passthrough:
+; OPT-NOT: mov{{.*}}r13
+; OPT: callq {{_?}}swiftself_param
+; OPT-NOT: mov{{.*}}r13
+; OPT-NEXT: callq {{_?}}swiftself_param
+define void @swiftself_passthrough(i8* swiftself %addr0) {
+ call i8 *@swiftself_param(i8* swiftself %addr0)
+ call i8 *@swiftself_param(i8* swiftself %addr0)
+ ret void
+}
+
+; We can use a tail call if the callee swiftself is the same as the caller one.
+; CHECK-LABEL: swiftself_tail:
+; OPT: jmp {{_?}}swiftself_param
+; OPT-NOT: ret
+define i8* @swiftself_tail(i8* swiftself %addr0) {
+ call void asm sideeffect "", "~{r13}"()
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr0)
+ ret i8* %res
+}
+
+; We cannot use a tail call if the callee swiftself is not the same as the
+; caller one.
+; CHECK-LABEL: swiftself_notail:
+; CHECK: movq %rdi, %r13
+; CHECK: callq {{_?}}swiftself_param
+; CHECK: retq
+define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind {
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr1)
+ ret i8* %res
+}
diff --git a/test/CodeGen/X86/switch-bt.ll b/test/CodeGen/X86/switch-bt.ll
index 6a2cbe1ec6ca..e4fbbeb26c3a 100644
--- a/test/CodeGen/X86/switch-bt.ll
+++ b/test/CodeGen/X86/switch-bt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=x86-64 -asm-verbose=false < %s -jump-table-density=40 | FileCheck %s
; This switch should use bit tests, and the third bit test case is just
; testing for one possible value, so it doesn't need a bt.
diff --git a/test/CodeGen/X86/switch-density.ll b/test/CodeGen/X86/switch-density.ll
new file mode 100644
index 000000000000..52216fb4d7c2
--- /dev/null
+++ b/test/CodeGen/X86/switch-density.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -jump-table-density=25 | FileCheck %s --check-prefix=DENSE --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -jump-table-density=10 | FileCheck %s --check-prefix=SPARSE --check-prefix=CHECK
+
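+; -jump-table-density sets the minimum case density (in percent) required to
+; build a jump table, so the two runs below model a stricter (25) and a more
+; permissive (10) lowering threshold.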
+declare void @g(i32)
+
+define void @sparse(i32 %x) {
+entry:
+ switch i32 %x, label %return [
+ i32 300, label %bb0
+ i32 100, label %bb1
+ i32 400, label %bb1
+ i32 500, label %bb2
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+return: ret void
+
+; Should pivot around 400 for two subtrees with two cases each.
+; CHECK-LABEL: sparse
+; CHECK-NOT: cmpl
+; CHECK: cmpl $399
+; CHECK: cmpl $100
+; CHECK: cmpl $300
+; CHECK: cmpl $400
+; CHECK: cmpl $500
+}
+
+define void @med(i32 %x) {
+entry:
+ switch i32 %x, label %return [
+ i32 30, label %bb0
+ i32 10, label %bb1
+ i32 40, label %bb1
+ i32 50, label %bb2
+ i32 20, label %bb3
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+bb3: tail call void @g(i32 2) br label %return
+return: ret void
+
+; Lowered as a jump table when sparse, and branches when dense.
+; CHECK-LABEL: med
+; SPARSE: addl $-10
+; SPARSE: cmpl $40
+; SPARSE: ja
+; SPARSE: jmpq *.LJTI
+; DENSE-NOT: cmpl
+; DENSE: cmpl $29
+; DENSE-DAG: cmpl $10
+; DENSE-DAG: cmpl $20
+; DENSE-DAG: cmpl $30
+; DENSE-DAG: cmpl $40
+; DENSE-DAG: cmpl $50
+; DENSE: retq
+}
+
+define void @dense(i32 %x) {
+entry:
+ switch i32 %x, label %return [
+ i32 12, label %bb0
+ i32 4, label %bb1
+ i32 16, label %bb1
+ i32 20, label %bb2
+ i32 8, label %bb3
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+bb3: tail call void @g(i32 2) br label %return
+return: ret void
+
+; Lowered as a jump table when sparse, and branches when dense.
+; CHECK-LABEL: dense
+; CHECK: addl $-4
+; CHECK: cmpl $16
+; CHECK: ja
+; CHECK: jmpq *.LJTI
+}
diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll
index b8cb7b1280ad..3679433c372f 100644
--- a/test/CodeGen/X86/switch-edge-weight.ll
+++ b/test/CodeGen/X86/switch-edge-weight.ll
@@ -233,11 +233,11 @@ entry:
; block.
switch i32 %x, label %sw.default [
- i32 1, label %sw.bb
- i32 5, label %sw.bb2
- i32 7, label %sw.bb3
- i32 9, label %sw.bb4
- i32 31, label %sw.bb5
+ i32 4, label %sw.bb
+ i32 20, label %sw.bb2
+ i32 28, label %sw.bb3
+ i32 36, label %sw.bb4
+ i32 124, label %sw.bb5
], !prof !2
sw.bb:
@@ -272,7 +272,7 @@ sw.epilog:
;
; CHECK: BB#0:
; BB#0 to BB#6: [10, UINT32_MAX] (15)
-; BB#0 to BB#8: [1, 5, 7, 9] (jump table) (45)
+; BB#0 to BB#8: [4, 20, 28, 36] (jump table) (45)
; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%)
}
diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll
index 896a067da230..6393c688e282 100644
--- a/test/CodeGen/X86/switch-jump-table.ll
+++ b/test/CodeGen/X86/switch-jump-table.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s
; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-PROB
diff --git a/test/CodeGen/X86/switch.ll b/test/CodeGen/X86/switch.ll
index 46587341ea74..5d52f95e71cc 100644
--- a/test/CodeGen/X86/switch.ll
+++ b/test/CodeGen/X86/switch.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -O0 | FileCheck --check-prefix=NOOPT %s
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -jump-table-density=40 -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -O0 -jump-table-density=40 -verify-machineinstrs | FileCheck --check-prefix=NOOPT %s
declare void @g(i32)
@@ -30,6 +30,47 @@ return: ret void
; NOOPT: jmpq
}
+; Should never be lowered as a jump table because of the attribute
+define void @basic_nojumptable(i32 %x) "no-jump-tables"="true" {
+entry:
+ switch i32 %x, label %return [
+ i32 3, label %bb0
+ i32 1, label %bb1
+ i32 4, label %bb1
+ i32 5, label %bb2
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+return: ret void
+
+; Not lowered as a jump table, either with or without optimization.
+; CHECK-LABEL: basic_nojumptable
+; CHECK-NOT: jmpq *.LJTI
+}
+
+; Should be lowered as a jump table because of the attribute
+define void @basic_nojumptable_false(i32 %x) "no-jump-tables"="false" {
+entry:
+ switch i32 %x, label %return [
+ i32 3, label %bb0
+ i32 1, label %bb1
+ i32 4, label %bb1
+ i32 5, label %bb2
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+return: ret void
+
+; Lowered as a jump table, both with and without optimization.
+; CHECK-LABEL: basic_nojumptable_false
+; CHECK: decl
+; CHECK: cmpl $4
+; CHECK: ja
+; CHECK: jmpq *.LJTI
+}
+
define void @simple_ranges(i32 %x) {
entry:
@@ -47,6 +88,8 @@ bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
return: ret void
+
+
; Should be lowered to two range checks.
; CHECK-LABEL: simple_ranges
; CHECK: leal -100
@@ -705,3 +748,33 @@ return: ret void
; Don't assert due to truncating the bitwidth (64) to i4 when checking
; that the bit-test range fits in a word.
}
+
+
+define i32 @pr27135(i32 %i) {
+entry:
+ br i1 undef, label %sw, label %end
+sw:
+ switch i32 %i, label %end [
+ i32 99, label %sw.bb
+ i32 98, label %sw.bb
+ i32 101, label %sw.bb
+ i32 97, label %sw.bb2
+ i32 96, label %sw.bb2
+ i32 100, label %sw.bb2
+ ]
+sw.bb:
+ unreachable
+sw.bb2:
+ unreachable
+end:
+ %p = phi i32 [ 1, %sw ], [ 0, %entry ]
+ ret i32 %p
+
+; CHECK-LABEL: pr27135:
+; The switch is lowered with bit tests. Since the case range is contiguous, the
+; second bit test is redundant and can be skipped. Check that we don't update
+; the phi node with an incoming value from the MBB of the skipped bit test
+; (-verify-machineinstrs catches this).
+; CHECK: btl
+; CHECK-NOT: btl
+}
diff --git a/test/CodeGen/X86/tail-call-attrs.ll b/test/CodeGen/X86/tail-call-attrs.ll
index 17ebe997c8c1..90f1346de9aa 100644
--- a/test/CodeGen/X86/tail-call-attrs.ll
+++ b/test/CodeGen/X86/tail-call-attrs.ll
@@ -13,11 +13,11 @@ define zeroext i1 @test_bool() {
; Here, there's more zero extension to be done between the call and the return,
; so a tail call is impossible (well, according to current Clang practice
; anyway. The AMD64 ABI isn't crystal clear on the matter).
+; FIXME: The high 24 bits returned from test_i32 are undefined; do tail call!
declare zeroext i32 @give_i32()
define zeroext i8 @test_i32() {
; CHECK-LABEL: test_i32:
; CHECK: callq _give_i32
-; CHECK: movzbl %al, %eax
; CHECK: ret
%call = tail call zeroext i32 @give_i32()
@@ -27,11 +27,11 @@ define zeroext i8 @test_i32() {
; Here, one function is zeroext and the other is signext. To the extent that
; these both mean something they are incompatible so no tail call is possible.
+; FIXME: The high 16 bits returned are undefined; do tail call!
declare zeroext i16 @give_unsigned_i16()
define signext i16 @test_incompatible_i16() {
; CHECK-LABEL: test_incompatible_i16:
; CHECK: callq _give_unsigned_i16
-; CHECK: cwtl
; CHECK: ret
%call = tail call zeroext i16 @give_unsigned_i16()
diff --git a/test/CodeGen/X86/tail-call-casts.ll b/test/CodeGen/X86/tail-call-casts.ll
new file mode 100644
index 000000000000..5421b498e1ea
--- /dev/null
+++ b/test/CodeGen/X86/tail-call-casts.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -o - %s | FileCheck %s
+
+declare void @g_bool(i1 zeroext)
+define void @f_bool(i1 zeroext %x) {
+entry:
+ tail call void @g_bool(i1 zeroext %x)
+ ret void
+
+; Forwarding a bool in a tail call works.
+; CHECK-LABEL: f_bool:
+; CHECK-NOT: movz
+; CHECK: jmp g_bool
+}
+
+
+declare void @g_float(float)
+define void @f_i32(i32 %x) {
+entry:
+ %0 = bitcast i32 %x to float
+ tail call void @g_float(float %0)
+ ret void
+
+; Forwarding a bitcasted value works too.
+; CHECK-LABEL: f_i32
+; CHECK-NOT: mov
+; CHECK: jmp g_float
+}
diff --git a/test/CodeGen/X86/tail-call-parameter-attrs-mismatch.ll b/test/CodeGen/X86/tail-call-parameter-attrs-mismatch.ll
new file mode 100644
index 000000000000..73ce3b781f9d
--- /dev/null
+++ b/test/CodeGen/X86/tail-call-parameter-attrs-mismatch.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -o - %s | FileCheck %s
+
+declare void @f(i16 signext)
+declare void @g(i32 signext)
+
+
+define void @flags_match(i16 signext %x) {
+entry:
+ tail call void @f(i16 signext %x)
+ ret void
+
+; The parameter flags match; do the tail call.
+; CHECK-LABEL: flags_match:
+; CHECK: jmp f
+}
+
+define void @flags_mismatch(i16 zeroext %x) {
+entry:
+ tail call void @f(i16 signext %x)
+ ret void
+
+; The parameter flags mismatch. %x has not been sign-extended,
+; so a tail call is not possible.
+; CHECK-LABEL: flags_mismatch:
+; CHECK: movswl
+; CHECK: calll f
+}
+
+
+define void @mismatch_doesnt_matter(i32 zeroext %x) {
+entry:
+ tail call void @g(i32 signext %x)
+ ret void
+
+; The parameter flags mismatch, but the type is wide enough that
+; no extension takes place in practice, so do the tail call.
+
+; CHECK-LABEL: mismatch_doesnt_matter:
+; CHECK: jmp g
+}
diff --git a/test/CodeGen/X86/tail-merge-unreachable.ll b/test/CodeGen/X86/tail-merge-unreachable.ll
new file mode 100644
index 000000000000..7b2c0f727215
--- /dev/null
+++ b/test/CodeGen/X86/tail-merge-unreachable.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s
+
+define i32 @tail_merge_unreachable(i32 %i) {
+entry:
+ br i1 undef, label %sw, label %end
+sw:
+ switch i32 %i, label %end [
+ i32 99, label %sw.bb
+ i32 98, label %sw.bb
+ i32 101, label %sw.bb
+ i32 97, label %sw.bb2
+ i32 96, label %sw.bb2
+ i32 100, label %sw.bb2
+ ]
+sw.bb:
+ unreachable
+sw.bb2:
+ unreachable
+end:
+ %p = phi i32 [ 1, %sw ], [ 0, %entry ]
+ ret i32 %p
+
+; CHECK-LABEL: tail_merge_unreachable:
+; Range Check
+; CHECK: addl $-96
+; CHECK: cmpl $5
+; CHECK: jbe [[JUMP_TABLE_BLOCK:[.][A-Za-z0-9_]+]]
+; CHECK: retq
+; CHECK: [[JUMP_TABLE_BLOCK]]:
+; CHECK: btl
+; CHECK: jae [[UNREACHABLE_BLOCK:[.][A-Za-z0-9_]+]]
+; CHECK: [[UNREACHABLE_BLOCK]]:
+; CHECK: .Lfunc_end0
+}
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll
index bf778e5bad2b..12c90c1a5fa9 100644
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -376,7 +376,7 @@ return:
; CHECK-LABEL: two_minsize:
; CHECK-NOT: XYZ
; CHECK: ret
-; CHECK: movl $0, XYZ(%rip)
+; CHECK: andl $0, XYZ(%rip)
; CHECK: movl $1, XYZ(%rip)
; CHECK-NOT: XYZ
diff --git a/test/CodeGen/X86/tailcall-stackalign.ll b/test/CodeGen/X86/tailcall-stackalign.ll
index d3f811cff248..256477d52cde 100644
--- a/test/CodeGen/X86/tailcall-stackalign.ll
+++ b/test/CodeGen/X86/tailcall-stackalign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-unknown-linux -tailcallopt | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux -tailcallopt -no-x86-call-frame-opt | FileCheck %s
; Linux has 8 byte alignment so the params cause stack size 20 when tailcallopt
; is enabled, ensure that a normal fastcc call has matching stack size
diff --git a/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..f6c49cab71b2
--- /dev/null
+++ b/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+tbm | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/tbm-builtins.c
+
+define i64 @test__bextri_u64(i64 %a0) {
+; X64-LABEL: test__bextri_u64:
+; X64: # BB#0:
+; X64-NEXT: bextr $1, %rdi, %rax
+; X64-NEXT: retq
+ %1 = call i64 @llvm.x86.tbm.bextri.u64(i64 %a0, i64 1)
+ ret i64 %1
+}
+
+define i64 @test__blcfill_u64(i64 %a0) {
+; X64-LABEL: test__blcfill_u64:
+; X64: # BB#0:
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %1 = add i64 %a0, 1
+ %2 = and i64 %a0, %1
+ ret i64 %2
+}
+
+define i64 @test__blci_u64(i64 %a0) {
+; X64-LABEL: test__blci_u64:
+; X64: # BB#0:
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: retq
+ %1 = add i64 %a0, 1
+ %2 = xor i64 %1, -1
+ %3 = or i64 %a0, %2
+ ret i64 %3
+}
+
+define i64 @test__blcic_u64(i64 %a0) {
+; X64-LABEL: test__blcic_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: addq $1, %rdi
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %1 = xor i64 %a0, -1
+ %2 = add i64 %a0, 1
+ %3 = and i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @test__blcmsk_u64(i64 %a0) {
+; X64-LABEL: test__blcmsk_u64:
+; X64: # BB#0:
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: xorq %rdi, %rax
+; X64-NEXT: retq
+ %1 = add i64 %a0, 1
+ %2 = xor i64 %a0, %1
+ ret i64 %2
+}
+
+define i64 @test__blcs_u64(i64 %a0) {
+; X64-LABEL: test__blcs_u64:
+; X64: # BB#0:
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: retq
+ %1 = add i64 %a0, 1
+ %2 = or i64 %a0, %1
+ ret i64 %2
+}
+
+define i64 @test__blsfill_u64(i64 %a0) {
+; X64-LABEL: test__blsfill_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: retq
+ %1 = sub i64 %a0, 1
+ %2 = or i64 %a0, %1
+ ret i64 %2
+}
+
+define i64 @test__blsic_u64(i64 %a0) {
+; X64-LABEL: test__blsic_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: subq $1, %rdi
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %1 = xor i64 %a0, -1
+ %2 = sub i64 %a0, 1
+ %3 = or i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @test__t1mskc_u64(i64 %a0) {
+; X64-LABEL: test__t1mskc_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: addq $1, %rdi
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %1 = xor i64 %a0, -1
+ %2 = add i64 %a0, 1
+ %3 = or i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @test__tzmsk_u64(i64 %a0) {
+; X64-LABEL: test__tzmsk_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: subq $1, %rdi
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %1 = xor i64 %a0, -1
+ %2 = sub i64 %a0, 1
+ %3 = and i64 %1, %2
+ ret i64 %3
+}
+
+declare i64 @llvm.x86.tbm.bextri.u64(i64, i64)
diff --git a/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll b/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..a264adffe790
--- /dev/null
+++ b/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+tbm | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+tbm | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/tbm-builtins.c
+
+define i32 @test__bextri_u32(i32 %a0) {
+; X32-LABEL: test__bextri_u32:
+; X32: # BB#0:
+; X32-NEXT: bextr $1, {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__bextri_u32:
+; X64: # BB#0:
+; X64-NEXT: bextr $1, %edi, %eax
+; X64-NEXT: retq
+ %1 = call i32 @llvm.x86.tbm.bextri.u32(i32 %a0, i32 1)
+ ret i32 %1
+}
+
+define i32 @test__blcfill_u32(i32 %a0) {
+; X32-LABEL: test__blcfill_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: leal 1(%ecx), %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blcfill_u32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal 1(%rdi), %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %1 = add i32 %a0, 1
+ %2 = and i32 %a0, %1
+ ret i32 %2
+}
+
+define i32 @test__blci_u32(i32 %a0) {
+; X32-LABEL: test__blci_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: leal 1(%ecx), %eax
+; X32-NEXT: xorl $-1, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blci_u32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal 1(%rdi), %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: retq
+ %1 = add i32 %a0, 1
+ %2 = xor i32 %1, -1
+ %3 = or i32 %a0, %2
+ ret i32 %3
+}
+
+define i32 @test__blcic_u32(i32 %a0) {
+; X32-LABEL: test__blcic_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: xorl $-1, %ecx
+; X32-NEXT: addl $1, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blcic_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: addl $1, %edi
+; X64-NEXT: andl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %1 = xor i32 %a0, -1
+ %2 = add i32 %a0, 1
+ %3 = and i32 %1, %2
+ ret i32 %3
+}
+
+define i32 @test__blcmsk_u32(i32 %a0) {
+; X32-LABEL: test__blcmsk_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: leal 1(%ecx), %eax
+; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blcmsk_u32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal 1(%rdi), %eax
+; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: retq
+ %1 = add i32 %a0, 1
+ %2 = xor i32 %a0, %1
+ ret i32 %2
+}
+
+define i32 @test__blcs_u32(i32 %a0) {
+; X32-LABEL: test__blcs_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: leal 1(%ecx), %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blcs_u32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal 1(%rdi), %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: retq
+ %1 = add i32 %a0, 1
+ %2 = or i32 %a0, %1
+ ret i32 %2
+}
+
+define i32 @test__blsfill_u32(i32 %a0) {
+; X32-LABEL: test__blsfill_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsfill_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: retq
+ %1 = sub i32 %a0, 1
+ %2 = or i32 %a0, %1
+ ret i32 %2
+}
+
+define i32 @test__blsic_u32(i32 %a0) {
+; X32-LABEL: test__blsic_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: xorl $-1, %ecx
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsic_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: subl $1, %edi
+; X64-NEXT: orl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %1 = xor i32 %a0, -1
+ %2 = sub i32 %a0, 1
+ %3 = or i32 %1, %2
+ ret i32 %3
+}
+
+define i32 @test__t1mskc_u32(i32 %a0) {
+; X32-LABEL: test__t1mskc_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: xorl $-1, %ecx
+; X32-NEXT: addl $1, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__t1mskc_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: addl $1, %edi
+; X64-NEXT: orl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %1 = xor i32 %a0, -1
+ %2 = add i32 %a0, 1
+ %3 = or i32 %1, %2
+ ret i32 %3
+}
+
+define i32 @test__tzmsk_u32(i32 %a0) {
+; X32-LABEL: test__tzmsk_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: xorl $-1, %ecx
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__tzmsk_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: subl $1, %edi
+; X64-NEXT: andl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %1 = xor i32 %a0, -1
+ %2 = sub i32 %a0, 1
+ %3 = and i32 %1, %2
+ ret i32 %3
+}
+
+declare i32 @llvm.x86.tbm.bextri.u32(i32, i32)
diff --git a/test/CodeGen/X86/tls-android.ll b/test/CodeGen/X86/tls-android.ll
index 4156c7b3f5b9..53717f564fac 100644
--- a/test/CodeGen/X86/tls-android.ll
+++ b/test/CodeGen/X86/tls-android.ll
@@ -37,7 +37,7 @@ entry:
; CHECK-NOT: __emutls_v.external_x:
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK-LABEL: __emutls_v.external_y:
; CHECK-NEXT: .long 4
; CHECK-NEXT: .long 4
@@ -46,7 +46,7 @@ entry:
; CHECK-LABEL: __emutls_t.external_y:
; CHECK-NEXT: .long 7
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK-LABEL: __emutls_v.internal_y:
; CHECK-NEXT: .long 4
; CHECK-NEXT: .long 4
@@ -70,7 +70,7 @@ entry:
; X64-NOT: __emutls_v.external_x:
-; X64: .align 8
+; X64: .p2align 3
; X64-LABEL: __emutls_v.external_y:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 4
@@ -79,7 +79,7 @@ entry:
; X64-LABEL: __emutls_t.external_y:
; X64-NEXT: .long 7
-; X64: .align 8
+; X64: .p2align 3
; X64-LABEL: __emutls_v.internal_y:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 4
diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll
index 235230e3c6a8..30c219d691e6 100644
--- a/test/CodeGen/X86/tls-pie.ll
+++ b/test/CodeGen/X86/tls-pie.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
@i = thread_local global i32 15
@@ -79,3 +79,7 @@ define i32* @f4() {
entry:
ret i32* @i2
}
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"PIC Level", i32 1}
+!1 = !{i32 1, !"PIE Level", i32 1}
diff --git a/test/CodeGen/X86/tls-windows-itanium.ll b/test/CodeGen/X86/tls-windows-itanium.ll
new file mode 100644
index 000000000000..20ac09901969
--- /dev/null
+++ b/test/CodeGen/X86/tls-windows-itanium.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple i686-windows-itanium -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-ASM
+; RUN: llc -mtriple i686-windows-itanium -filetype obj -o - %s | llvm-readobj -relocations - | FileCheck %s -check-prefix CHECK-OBJ
+
+@get_count_incremented.count = internal thread_local unnamed_addr global i32 0, align 4
+
+define i32 @get_count_incremented() {
+entry:
+ %0 = load i32, i32* @get_count_incremented.count, align 4
+ %inc = add i32 %0, 1
+ store i32 %inc, i32* @get_count_incremented.count, align 4
+ ret i32 %inc
+}
+
+; CHECK-ASM-LABEL: _get_count_incremented:
+; CHECK-ASM: movl __tls_index, %eax
+; CHECK-ASM: movl %fs:__tls_array, %ecx
+; CHECK-ASM: movl (%ecx,%eax,4), %ecx
+; CHECK-ASM: _get_count_incremented.count@SECREL32(%ecx), %eax
+; CHECK-ASM: incl %eax
+; CHECK-ASM: movl %eax, _get_count_incremented.count@SECREL32(%ecx)
+; CHECK-ASM: retl
+
+; CHECK-OBJ: Relocations [
+; CHECK-OBJ: Section ({{[0-9]+}}) .text {
+; CHECK-OBJ: 0x1 IMAGE_REL_I386_DIR32 __tls_index
+; CHECK-OBJ: 0x8 IMAGE_REL_I386_DIR32 __tls_array
+; CHECK-OBJ: 0x11 IMAGE_REL_I386_SECREL _get_count_incremented.count
+; CHECK-OBJ: 0x18 IMAGE_REL_I386_SECREL _get_count_incremented.count
+; CHECK-OBJ: }
+; CHECK-OBJ: ]
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index 0f3d3adec4c3..85c51e618b2a 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -10,6 +10,7 @@
@i3 = internal thread_local global i32 15
@i4 = hidden thread_local global i32 15
@i5 = external hidden thread_local global i32
+@i6 = external protected thread_local global i32
@s1 = thread_local global i16 15
@b1 = thread_local global i8 0
@b2 = thread_local(localexec) global i8 0
@@ -438,3 +439,17 @@ entry:
ret i8* @b2
}
+
+define i32* @f16() {
+; X32_LINUX-LABEL: f16:
+; X32_LINUX: movl %gs:0, %eax
+; X32_LINUX-NEXT: leal i6@NTPOFF(%eax), %eax
+; X32_LINUX-NEXT: ret
+
+; X64_LINUX-LABEL: f16:
+; X64_LINUX: movq %fs:0, %rax
+; X64_LINUX-NEXT: leaq i6@TPOFF(%rax), %rax
+; X64_LINUX-NEXT: ret
+
+ ret i32* @i6
+}
diff --git a/test/CodeGen/X86/trunc-to-bool.ll b/test/CodeGen/X86/trunc-to-bool.ll
index 3dd98eea7fa9..3c99928824bc 100644
--- a/test/CodeGen/X86/trunc-to-bool.ll
+++ b/test/CodeGen/X86/trunc-to-bool.ll
@@ -1,14 +1,14 @@
; An integer truncation to i1 should be done with an and instruction to make
; sure only the LSBit survives. Test that this is the case both for a returned
; value and as the operand of a branch.
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
define zeroext i1 @test1(i32 %X) nounwind {
%Y = trunc i32 %X to i1
ret i1 %Y
}
; CHECK-LABEL: test1:
-; CHECK: andl $1, %eax
+; CHECK: andb $1, %al
define i1 @test2(i32 %val, i32 %mask) nounwind {
entry:
diff --git a/test/CodeGen/X86/twoaddr-coalesce.ll b/test/CodeGen/X86/twoaddr-coalesce.ll
index 38685ec27c02..c727f34cc9a5 100644
--- a/test/CodeGen/X86/twoaddr-coalesce.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 4
+; RUN: llc < %s -march=x86 | grep mov | count 2
; rdar://6523745
@"\01LC" = internal constant [4 x i8] c"%d\0A\00" ; <[4 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll
index 4b594f7c62ab..d2b78a8886f0 100644
--- a/test/CodeGen/X86/uint_to_fp-2.ll
+++ b/test/CodeGen/X86/uint_to_fp-2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mattr=+sse2 | FileCheck %s
; rdar://6504833
@@ -5,8 +6,8 @@ define float @test1(i32 %x) nounwind readnone {
; CHECK-LABEL: test1:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: pushl %eax
-; CHECK-NEXT: movsd .LCPI0_0, %xmm0
-; CHECK-NEXT: movd {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: orpd %xmm0, %xmm1
; CHECK-NEXT: subsd %xmm0, %xmm1
; CHECK-NEXT: xorps %xmm0, %xmm0
@@ -16,8 +17,8 @@ define float @test1(i32 %x) nounwind readnone {
; CHECK-NEXT: popl %eax
; CHECK-NEXT: retl
entry:
- %0 = uitofp i32 %x to float
- ret float %0
+ %0 = uitofp i32 %x to float
+ ret float %0
}
; PR10802
@@ -26,8 +27,8 @@ define float @test2(<4 x i32> %x) nounwind readnone ssp {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: movss %xmm0, %xmm1
-; CHECK-NEXT: movsd .LCPI1_0, %xmm0
+; CHECK-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: orps %xmm0, %xmm1
; CHECK-NEXT: subsd %xmm0, %xmm1
; CHECK-NEXT: xorps %xmm0, %xmm0
diff --git a/test/CodeGen/X86/uint_to_fp.ll b/test/CodeGen/X86/uint_to_fp.ll
index 0536eb05222c..a2784fdcbbdd 100644
--- a/test/CodeGen/X86/uint_to_fp.ll
+++ b/test/CodeGen/X86/uint_to_fp.ll
@@ -1,14 +1,27 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep "sub.*esp"
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep cvtsi2ss
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin8 -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin8 -mattr=+sse2 | FileCheck %s --check-prefix=X64
; rdar://6034396
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin8"
-
-define void @test(i32 %x, float* %y) nounwind {
+define void @test(i32 %x, float* %y) nounwind {
+; X32-LABEL: test:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: shrl $23, %ecx
+; X32-NEXT: cvtsi2ssl %ecx, %xmm0
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: shrl $23, %edi
+; X64-NEXT: cvtsi2ssl %edi, %xmm0
+; X64-NEXT: movss %xmm0, (%rsi)
+; X64-NEXT: retq
entry:
- lshr i32 %x, 23 ; <i32>:0 [#uses=1]
- uitofp i32 %0 to float ; <float>:1 [#uses=1]
- store float %1, float* %y
- ret void
+ lshr i32 %x, 23
+ uitofp i32 %0 to float
+ store float %1, float* %y
+ ret void
}
diff --git a/test/CodeGen/X86/umul-with-overflow.ll b/test/CodeGen/X86/umul-with-overflow.ll
index ba5a790f4380..29cecbe5a0f6 100644
--- a/test/CodeGen/X86/umul-with-overflow.ll
+++ b/test/CodeGen/X86/umul-with-overflow.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
define zeroext i1 @a(i32 %x) nounwind {
@@ -9,7 +9,6 @@ define zeroext i1 @a(i32 %x) nounwind {
; CHECK-LABEL: a:
; CHECK: mull
; CHECK: seto %al
-; CHECK: movzbl %al, %eax
; CHECK: ret
}
diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll
index ffbbcff2e5d6..644a36447302 100644
--- a/test/CodeGen/X86/unaligned-load.ll
+++ b/test/CodeGen/X86/unaligned-load.ll
@@ -30,8 +30,8 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
; COREI7: movups _.str3
; CORE2: .section
-; CORE2: .align 3
+; CORE2: .p2align 3
; CORE2-NEXT: _.str1:
; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
-; CORE2: .align 3
+; CORE2: .p2align 3
; CORE2-NEXT: _.str3:
diff --git a/test/CodeGen/X86/unaligned-spill-folding.ll b/test/CodeGen/X86/unaligned-spill-folding.ll
index dee94bce15a5..935c0b967f9e 100644
--- a/test/CodeGen/X86/unaligned-spill-folding.ll
+++ b/test/CodeGen/X86/unaligned-spill-folding.ll
@@ -34,7 +34,7 @@ middle.block:
; doesn't force stack realignment though
; UNALIGNED-LABEL: @test1
; UNALIGNED-NOT: andl $-{{..}}, %esp
-; UNALIGNED: movdqu {{.*}} # 16-byte Folded Spill
+; UNALIGNED: movdqu {{.*}} # 16-byte Spill
; UNALIGNED-NOT: paddd {{.*}} # 16-byte Folded Reload
; ALIGNED-LABEL: @test1
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index c41e529aa954..1058994d0ee1 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -22,15 +22,14 @@ entry:
!llvm.module.flags = !{!12}
!0 = !DILocalVariable(name: "x", line: 1, arg: 2, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 1, file: !10, scope: !2, type: !4)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 1, file: !10, scope: !2, type: !4)
!2 = !DIFile(filename: "test.c", directory: "/dir")
-!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "producer", isOptimized: false, emissionKind: 0, file: !10, enums: !11, retainedTypes: !11, subprograms: !9)
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "producer", isOptimized: false, emissionKind: FullDebug, file: !10, enums: !11, retainedTypes: !11)
!4 = !DISubroutineType(types: !5)
!5 = !{!6}
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!7 = distinct !DILexicalBlock(line: 1, column: 30, file: !10, scope: !1)
!8 = !DILocation(line: 4, column: 3, scope: !7)
-!9 = !{!1}
!10 = !DIFile(filename: "test.c", directory: "/dir")
!11 = !{}
!12 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/unreachableblockelim.ll b/test/CodeGen/X86/unreachableblockelim.ll
new file mode 100644
index 000000000000..49a075c32811
--- /dev/null
+++ b/test/CodeGen/X86/unreachableblockelim.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S < %s -unreachableblockelim | FileCheck %s
+; RUN: opt -S < %s -passes=unreachableblockelim | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @abort()
+
+; CHECK-LABEL: @foo(
+; CHECK-NOT: return:
+define void @foo(i32* %p) {
+entry:
+ %p.addr = alloca i32*, align 8
+ call void @abort()
+ unreachable
+
+return: ; No predecessors!
+ store i32* %p, i32** %p.addr, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/X86/unused_stackslots.ll b/test/CodeGen/X86/unused_stackslots.ll
new file mode 100644
index 000000000000..0bb904130f1c
--- /dev/null
+++ b/test/CodeGen/X86/unused_stackslots.ll
@@ -0,0 +1,246 @@
+; PR26374: Check that no stack slots are allocated for vregs which have no real reference.
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.ImageParameters = type { i32, i32, [0 x [16 x i16]] }
+%struct.InputParameters = type { i32, i32 }
+
+@c = common global %struct.ImageParameters* null, align 8
+@a = common global i16** null, align 8
+@d = common global [6 x i32] zeroinitializer, align 16
+@b = common global %struct.InputParameters* null, align 8
+@e = common global [4 x i32] zeroinitializer, align 16
+
+; It is not easy to check that there are no unused holes in the stack allocated for spills,
+; so simply check that the allocated stack size cannot exceed 350.
+; (408 is used before the fix for PR26374. 344 is used after the fix).
+;
+; CHECK-LABEL: @fn
+; CHECK: subq {{\$3[0-4][0-9]}}, %rsp
+
+; Function Attrs: nounwind uwtable
+define i32 @fn() #0 {
+entry:
+ %n = alloca [8 x [8 x i32]], align 16
+ %tmp = bitcast [8 x [8 x i32]]* %n to i8*
+ call void @llvm.lifetime.start(i64 256, i8* %tmp) #3
+ %tmp1 = bitcast [8 x [8 x i32]]* %n to i8*
+ %arraydecay.1 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 1, i64 0
+ %tmp2 = bitcast i32* %arraydecay.1 to i8*
+ %arraydecay.2 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 2, i64 0
+ %tmp3 = bitcast i32* %arraydecay.2 to i8*
+ %arraydecay.3 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 3, i64 0
+ %tmp4 = bitcast i32* %arraydecay.3 to i8*
+ %arraydecay.4 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 4, i64 0
+ %tmp5 = bitcast i32* %arraydecay.4 to i8*
+ %arraydecay.5 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 5, i64 0
+ %tmp6 = bitcast i32* %arraydecay.5 to i8*
+ %arraydecay.6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 6, i64 0
+ %tmp7 = bitcast i32* %arraydecay.6 to i8*
+ %arraydecay.7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 7, i64 0
+ %tmp8 = bitcast i32* %arraydecay.7 to i8*
+ br label %for.body
+
+for.body: ; preds = %for.inc73, %entry
+ %q.0131 = phi i32 [ 0, %entry ], [ %inc74, %for.inc73 ]
+ %m.0130 = phi i32 [ 0, %entry ], [ %m.4, %for.inc73 ]
+ %div = sdiv i32 %q.0131, 2
+ %shl = shl i32 %div, 3
+ %rem = srem i32 %q.0131, 2
+ %shl1 = shl nsw i32 %rem, 3
+ %tmp9 = sext i32 %shl1 to i64
+ %tmp10 = sext i32 %shl to i64
+ %tmp11 = or i32 %shl1, 4
+ %tmp12 = sext i32 %tmp11 to i64
+ %tmp13 = or i32 %shl, 4
+ %tmp14 = sext i32 %tmp13 to i64
+ br label %for.body4
+
+for.body4: ; preds = %for.inc48, %for.body
+ %indvars.iv148 = phi i64 [ %tmp10, %for.body ], [ %indvars.iv.next149, %for.inc48 ]
+ %m.1126 = phi i32 [ %m.0130, %for.body ], [ %m.3.lcssa, %for.inc48 ]
+ %tmp15 = load %struct.ImageParameters*, %struct.ImageParameters** @c, align 8
+ %opix_y = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp15, i64 0, i32 1
+ %tmp16 = load i32, i32* %opix_y, align 4
+ %tmp17 = trunc i64 %indvars.iv148 to i32
+ %add5 = add nsw i32 %tmp16, %tmp17
+ %tmp18 = sub nuw nsw i64 %indvars.iv148, %tmp10
+ %tmp19 = sext i32 %add5 to i64
+ %tmp20 = add nsw i64 %tmp19, 1
+ %tmp21 = or i64 %indvars.iv148, 1
+ %tmp22 = or i64 %tmp18, 1
+ %tmp23 = add nsw i64 %tmp19, 2
+ %tmp24 = or i64 %indvars.iv148, 2
+ %tmp25 = or i64 %tmp18, 2
+ %tmp26 = add nsw i64 %tmp19, 3
+ %tmp27 = or i64 %indvars.iv148, 3
+ %tmp28 = or i64 %tmp18, 3
+ br label %for.body9
+
+for.body9: ; preds = %for.inc45.for.body9_crit_edge, %for.body4
+ %tmp29 = phi %struct.ImageParameters* [ %tmp15, %for.body4 ], [ %.pre, %for.inc45.for.body9_crit_edge ]
+ %indvars.iv145 = phi i64 [ %tmp9, %for.body4 ], [ %indvars.iv.next146, %for.inc45.for.body9_crit_edge ]
+ %m.2124 = phi i32 [ %m.1126, %for.body4 ], [ %m.3, %for.inc45.for.body9_crit_edge ]
+ %opix_x = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp29, i64 0, i32 0
+ %tmp30 = load i32, i32* %opix_x, align 4
+ %tmp31 = trunc i64 %indvars.iv145 to i32
+ %add10 = add nsw i32 %tmp30, %tmp31
+ tail call void @LumaPrediction4x4(i32 %tmp31, i32 %tmp17, i32 0, i32 0, i32 0, i16 signext 0, i16 signext 0) #3
+ %tmp32 = load i16**, i16*** @a, align 8
+ %tmp33 = load %struct.ImageParameters*, %struct.ImageParameters** @c, align 8
+ %tmp34 = sub nuw nsw i64 %indvars.iv145, %tmp9
+ %tmp35 = sext i32 %add10 to i64
+ br label %for.cond14.preheader
+
+for.cond14.preheader: ; preds = %for.body9
+ %arrayidx = getelementptr inbounds i16*, i16** %tmp32, i64 %tmp19
+ %tmp36 = load i16*, i16** %arrayidx, align 8
+ %arrayidx20 = getelementptr inbounds i16, i16* %tmp36, i64 %tmp35
+ %arrayidx26 = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp33, i64 0, i32 2, i64 %indvars.iv148, i64 %indvars.iv145
+ %arrayidx35 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 %tmp18, i64 %tmp34
+ %tmp37 = bitcast i16* %arrayidx20 to <4 x i16>*
+ %tmp38 = load <4 x i16>, <4 x i16>* %tmp37, align 2
+ %tmp39 = zext <4 x i16> %tmp38 to <4 x i32>
+ %tmp40 = bitcast i16* %arrayidx26 to <4 x i16>*
+ %tmp41 = load <4 x i16>, <4 x i16>* %tmp40, align 2
+ %tmp42 = zext <4 x i16> %tmp41 to <4 x i32>
+ %tmp43 = sub nsw <4 x i32> %tmp39, %tmp42
+ %tmp44 = bitcast i32* %arrayidx35 to <4 x i32>*
+ store <4 x i32> %tmp43, <4 x i32>* %tmp44, align 16
+ store <4 x i32> %tmp43, <4 x i32>* bitcast ([6 x i32]* @d to <4 x i32>*), align 16
+ %arrayidx.1 = getelementptr inbounds i16*, i16** %tmp32, i64 %tmp20
+ %tmp45 = load i16*, i16** %arrayidx.1, align 8
+ %arrayidx20.1 = getelementptr inbounds i16, i16* %tmp45, i64 %tmp35
+ %arrayidx26.1 = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp33, i64 0, i32 2, i64 %tmp21, i64 %indvars.iv145
+ %arrayidx35.1 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 %tmp22, i64 %tmp34
+ %tmp46 = bitcast i16* %arrayidx20.1 to <4 x i16>*
+ %tmp47 = load <4 x i16>, <4 x i16>* %tmp46, align 2
+ %tmp48 = zext <4 x i16> %tmp47 to <4 x i32>
+ %tmp49 = bitcast i16* %arrayidx26.1 to <4 x i16>*
+ %tmp50 = load <4 x i16>, <4 x i16>* %tmp49, align 2
+ %tmp51 = zext <4 x i16> %tmp50 to <4 x i32>
+ %tmp52 = sub nsw <4 x i32> %tmp48, %tmp51
+ %tmp53 = bitcast i32* %arrayidx35.1 to <4 x i32>*
+ store <4 x i32> %tmp52, <4 x i32>* %tmp53, align 16
+ store <4 x i32> %tmp52, <4 x i32>* bitcast (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @d, i64 0, i64 4) to <4 x i32>*), align 16
+ %arrayidx.2 = getelementptr inbounds i16*, i16** %tmp32, i64 %tmp23
+ %tmp54 = load i16*, i16** %arrayidx.2, align 8
+ %arrayidx20.2 = getelementptr inbounds i16, i16* %tmp54, i64 %tmp35
+ %arrayidx26.2 = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp33, i64 0, i32 2, i64 %tmp24, i64 %indvars.iv145
+ %arrayidx35.2 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 %tmp25, i64 %tmp34
+ %tmp55 = bitcast i16* %arrayidx20.2 to <4 x i16>*
+ %tmp56 = load <4 x i16>, <4 x i16>* %tmp55, align 2
+ %tmp57 = zext <4 x i16> %tmp56 to <4 x i32>
+ %tmp58 = bitcast i16* %arrayidx26.2 to <4 x i16>*
+ %tmp59 = load <4 x i16>, <4 x i16>* %tmp58, align 2
+ %tmp60 = zext <4 x i16> %tmp59 to <4 x i32>
+ %tmp61 = sub nsw <4 x i32> %tmp57, %tmp60
+ %tmp62 = bitcast i32* %arrayidx35.2 to <4 x i32>*
+ store <4 x i32> %tmp61, <4 x i32>* %tmp62, align 16
+ store <4 x i32> %tmp61, <4 x i32>* bitcast (i32* getelementptr ([6 x i32], [6 x i32]* @d, i64 1, i64 2) to <4 x i32>*), align 16
+ %arrayidx.3 = getelementptr inbounds i16*, i16** %tmp32, i64 %tmp26
+ %tmp63 = load i16*, i16** %arrayidx.3, align 8
+ %arrayidx20.3 = getelementptr inbounds i16, i16* %tmp63, i64 %tmp35
+ %arrayidx26.3 = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp33, i64 0, i32 2, i64 %tmp27, i64 %indvars.iv145
+ %arrayidx35.3 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 %tmp28, i64 %tmp34
+ %tmp64 = bitcast i16* %arrayidx20.3 to <4 x i16>*
+ %tmp65 = load <4 x i16>, <4 x i16>* %tmp64, align 2
+ %tmp66 = zext <4 x i16> %tmp65 to <4 x i32>
+ %tmp67 = bitcast i16* %arrayidx26.3 to <4 x i16>*
+ %tmp68 = load <4 x i16>, <4 x i16>* %tmp67, align 2
+ %tmp69 = zext <4 x i16> %tmp68 to <4 x i32>
+ %tmp70 = sub nsw <4 x i32> %tmp66, %tmp69
+ %tmp71 = bitcast i32* %arrayidx35.3 to <4 x i32>*
+ store <4 x i32> %tmp70, <4 x i32>* %tmp71, align 16
+ store <4 x i32> %tmp70, <4 x i32>* bitcast (i32* getelementptr ([6 x i32], [6 x i32]* @d, i64 2, i64 0) to <4 x i32>*), align 16
+ %tmp72 = load %struct.InputParameters*, %struct.InputParameters** @b, align 8
+ %rdopt = getelementptr inbounds %struct.InputParameters, %struct.InputParameters* %tmp72, i64 0, i32 0
+ %tmp73 = load i32, i32* %rdopt, align 4
+ %cmp42 = icmp eq i32 %tmp73, 0
+ br i1 %cmp42, label %land.lhs.true, label %if.then
+
+land.lhs.true: ; preds = %for.cond14.preheader
+ %Transform8x8Mode = getelementptr inbounds %struct.InputParameters, %struct.InputParameters* %tmp72, i64 0, i32 1
+ %tmp74 = load i32, i32* %Transform8x8Mode, align 4
+ %tobool = icmp eq i32 %tmp74, 0
+ br i1 %tobool, label %if.then, label %for.inc45
+
+if.then: ; preds = %land.lhs.true, %for.cond14.preheader
+ %call = tail call i32 @distortion4x4(i32* nonnull getelementptr inbounds ([6 x i32], [6 x i32]* @d, i64 0, i64 0)) #3
+ %add44 = add nsw i32 %call, %m.2124
+ br label %for.inc45
+
+for.inc45: ; preds = %if.then, %land.lhs.true
+ %m.3 = phi i32 [ %m.2124, %land.lhs.true ], [ %add44, %if.then ]
+ %cmp8 = icmp slt i64 %indvars.iv145, %tmp12
+ br i1 %cmp8, label %for.inc45.for.body9_crit_edge, label %for.inc48
+
+for.inc45.for.body9_crit_edge: ; preds = %for.inc45
+ %indvars.iv.next146 = add nsw i64 %indvars.iv145, 4
+ %.pre = load %struct.ImageParameters*, %struct.ImageParameters** @c, align 8
+ br label %for.body9
+
+for.inc48: ; preds = %for.inc45
+ %m.3.lcssa = phi i32 [ %m.3, %for.inc45 ]
+ %indvars.iv.next149 = add nsw i64 %indvars.iv148, 4
+ %cmp3 = icmp slt i64 %indvars.iv148, %tmp14
+ br i1 %cmp3, label %for.body4, label %for.end50
+
+for.end50: ; preds = %for.inc48
+ %m.3.lcssa.lcssa = phi i32 [ %m.3.lcssa, %for.inc48 ]
+ %tmp75 = load %struct.InputParameters*, %struct.InputParameters** @b, align 8
+ %rdopt51 = getelementptr inbounds %struct.InputParameters, %struct.InputParameters* %tmp75, i64 0, i32 0
+ %tmp76 = load i32, i32* %rdopt51, align 4
+ %cmp52 = icmp eq i32 %tmp76, 0
+ br i1 %cmp52, label %land.lhs.true54, label %for.inc73
+
+land.lhs.true54: ; preds = %for.end50
+ %Transform8x8Mode55 = getelementptr inbounds %struct.InputParameters, %struct.InputParameters* %tmp75, i64 0, i32 1
+ %tmp77 = load i32, i32* %Transform8x8Mode55, align 4
+ %tobool56 = icmp eq i32 %tmp77, 0
+ br i1 %tobool56, label %for.inc73, label %for.body61.preheader
+
+for.body61.preheader: ; preds = %land.lhs.true54
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 4, i64 0) to i8*), i8* %tmp1, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 6, i64 0) to i8*), i8* %tmp2, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 8, i64 0) to i8*), i8* %tmp3, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 10, i64 0) to i8*), i8* %tmp4, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 12, i64 0) to i8*), i8* %tmp5, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 14, i64 0) to i8*), i8* %tmp6, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 16, i64 0) to i8*), i8* %tmp7, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 18, i64 0) to i8*), i8* %tmp8, i64 32, i32 16, i1 false)
+ %call70 = tail call i32 @distortion4x4(i32* nonnull getelementptr inbounds ([4 x i32], [4 x i32]* @e, i64 0, i64 0)) #3
+ %add71 = add nsw i32 %call70, %m.3.lcssa.lcssa
+ br label %for.inc73
+
+for.inc73: ; preds = %for.body61.preheader, %land.lhs.true54, %for.end50
+ %m.4 = phi i32 [ %add71, %for.body61.preheader ], [ %m.3.lcssa.lcssa, %land.lhs.true54 ], [ %m.3.lcssa.lcssa, %for.end50 ]
+ %inc74 = add nuw nsw i32 %q.0131, 1
+ %exitcond156 = icmp eq i32 %inc74, 4
+ br i1 %exitcond156, label %for.end75, label %for.body
+
+for.end75: ; preds = %for.inc73
+ %m.4.lcssa = phi i32 [ %m.4, %for.inc73 ]
+ call void @llvm.lifetime.end(i64 256, i8* %tmp) #3
+ ret i32 %m.4.lcssa
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare void @LumaPrediction4x4(i32, i32, i32, i32, i32, i16 signext, i16 signext) #2
+
+declare i32 @distortion4x4(i32*) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
diff --git a/test/CodeGen/X86/update-terminator.mir b/test/CodeGen/X86/update-terminator.mir
new file mode 100644
index 000000000000..1e75c6af9eb9
--- /dev/null
+++ b/test/CodeGen/X86/update-terminator.mir
@@ -0,0 +1,57 @@
+# RUN: llc -march=x86-64 -verify-machineinstrs -run-pass block-placement -o - %s | FileCheck %s
+# Check that the conditional jump in bb.1 is changed to an unconditional jump after block placement swaps bb.2 and bb.3.
+
+--- |
+ @a = external global i16
+ @b = external global i32
+
+ ; Function Attrs: nounwind
+ define void @f2() {
+ br i1 undef, label %bb1, label %bb3
+
+ bb1:
+ br i1 undef, label %bb2, label %bb2
+
+ bb2:
+ br label %bb4
+
+ bb3:
+ br label %bb2
+
+ bb4:
+ ret void
+ }
+
+
+...
+---
+# CHECK-LABEL: name: f2
+# CHECK: bb.1:
+# CHECK: JMP_1 %bb.2
+# CHECK: bb.3:
+# CHECK: bb.2:
+name: f2
+body: |
+ bb.0 (%ir-block.0):
+ successors: %bb.1(50), %bb.3(50)
+
+ JNE_1 %bb.1, implicit %eflags
+ JMP_1 %bb.3
+ bb.1:
+ successors: %bb.2(100)
+
+ JNE_1 %bb.2, implicit %eflags
+
+ bb.2:
+ successors: %bb.4(100)
+
+ JMP_1 %bb.4
+
+ bb.3:
+ successors: %bb.2(100)
+ JMP_1 %bb.2
+
+ bb.4:
+ RETQ
+
+...
diff --git a/test/CodeGen/X86/urem-i8-constant.ll b/test/CodeGen/X86/urem-i8-constant.ll
index e3cb69ca591f..45717f985c23 100644
--- a/test/CodeGen/X86/urem-i8-constant.ll
+++ b/test/CodeGen/X86/urem-i8-constant.ll
@@ -1,6 +1,21 @@
-; RUN: llc < %s -march=x86 | grep 111
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
define i8 @foo(i8 %tmp325) {
- %t546 = urem i8 %tmp325, 37
- ret i8 %t546
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: imull $111, %ecx, %eax
+; CHECK-NEXT: andl $28672, %eax # imm = 0x7000
+; CHECK-NEXT: shrl $12, %eax
+; CHECK-NEXT: movb $37, %dl
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: mulb %dl
+; CHECK-NEXT: subb %al, %cl
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+;
+ %t546 = urem i8 %tmp325, 37
+ ret i8 %t546
}
+
diff --git a/test/CodeGen/X86/urem-power-of-two.ll b/test/CodeGen/X86/urem-power-of-two.ll
new file mode 100644
index 000000000000..9e27809c297d
--- /dev/null
+++ b/test/CodeGen/X86/urem-power-of-two.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; The easy case: a constant power-of-2 divisor.
+
+define i64 @const_pow_2(i64 %x) {
+; CHECK-LABEL: const_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andl $31, %edi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+;
+ %urem = urem i64 %x, 32
+ ret i64 %urem
+}
+
+; A left-shifted power-of-2 divisor. Use a weird type for wider coverage.
+
+define i25 @shift_left_pow_2(i25 %x, i25 %y) {
+; CHECK-LABEL: shift_left_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shll %cl, %eax
+; CHECK-NEXT: addl $33554431, %eax # imm = 0x1FFFFFF
+; CHECK-NEXT: andl %edi, %eax
+; CHECK-NEXT: retq
+;
+ %shl = shl i25 1, %y
+ %urem = urem i25 %x, %shl
+ ret i25 %urem
+}
+
+; FIXME: A logically right-shifted sign bit is a power-of-2 or UB.
+
+define i16 @shift_right_pow_2(i16 %x, i16 %y) {
+; CHECK-LABEL: shift_right_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $32768, %r8d # imm = 0x8000
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shrl %cl, %r8d
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: divw %r8w
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+;
+ %shr = lshr i16 -32768, %y
+ %urem = urem i16 %x, %shr
+ ret i16 %urem
+}
+
+; FIXME: A zero divisor would be UB, so this could be reduced to an 'and' with 3.
+
+define i8 @and_pow_2(i8 %x, i8 %y) {
+; CHECK-LABEL: and_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andb $4, %sil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %sil
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+;
+ %and = and i8 %y, 4
+ %urem = urem i8 %x, %and
+ ret i8 %urem
+}
+
+; A vector splat constant divisor should get the same treatment as a scalar.
+
+define <4 x i32> @vec_const_pow_2(<4 x i32> %x) {
+; CHECK-LABEL: vec_const_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %urem = urem <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %urem
+}
+
diff --git a/test/CodeGen/X86/utf16-cfstrings.ll b/test/CodeGen/X86/utf16-cfstrings.ll
index 5f0e78fccc65..773efbcdefaa 100644
--- a/test/CodeGen/X86/utf16-cfstrings.ll
+++ b/test/CodeGen/X86/utf16-cfstrings.ll
@@ -9,7 +9,7 @@
@_unnamed_cfstring_ = private constant %struct.NSConstantString { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([5 x i16]* @.str to i8*), i64 4 }, section "__DATA,__cfstring"
; CHECK: .section __TEXT,__ustring
-; CHECK-NEXT: .align 1
+; CHECK-NEXT: .p2align 1
; CHECK-NEXT: _.str:
; CHECK-NEXT: .short 252 ## 0xfc
; CHECK-NEXT: .short 98 ## 0x62
diff --git a/test/CodeGen/X86/v4f32-immediate.ll b/test/CodeGen/X86/v4f32-immediate.ll
index 68d20a04ecf0..7945b1093f8e 100644
--- a/test/CodeGen/X86/v4f32-immediate.ll
+++ b/test/CodeGen/X86/v4f32-immediate.ll
@@ -1,7 +1,16 @@
-; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
-
-; CHECK: movaps
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse | FileCheck %s --check-prefix=X64
define <4 x float> @foo() {
+; X32-LABEL: foo:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [3.223542e+00,2.300000e+00,1.200000e+00,1.000000e-01]
+; X32-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [3.223542e+00,2.300000e+00,1.200000e+00,1.000000e-01]
+; X64-NEXT: retq
ret <4 x float> <float 0x4009C9D0A0000000, float 0x4002666660000000, float 0x3FF3333340000000, float 0x3FB99999A0000000>
}
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index 21fe96321987..0135832ad929 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -1,15 +1,36 @@
-; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
-
-;CHECK-LABEL: and_masks:
-;CHECK: vmovaps
-;CHECK: vcmpltp
-;CHECK: vcmpltp
-;CHECK: vandps
-;CHECK: vandps
-;CHECK: vmovaps
-;CHECK: ret
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64
define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+; X32-LABEL: and_masks:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: vmovups (%edx), %ymm0
+; X32-NEXT: vmovups (%ecx), %ymm1
+; X32-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; X32-NEXT: vmovups (%eax), %ymm2
+; X32-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vandps LCPI0_0, %ymm1, %ymm1
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: and_masks:
+; X64: ## BB#0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: vmovups (%rsi), %ymm1
+; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; X64-NEXT: vmovups (%rdx), %ymm2
+; X64-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: vmovaps %ymm0, (%rax)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%v0 = load <8 x float>, <8 x float>* %a, align 16
%v1 = load <8 x float>, <8 x float>* %b, align 16
%m0 = fcmp olt <8 x float> %v1, %v0
@@ -21,13 +42,30 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
ret void
}
-;CHECK: neg_mask
-;CHECK: vcmpltps
-;CHECK: vxorps
-;CHECK: vandps
-;CHECK: vmovaps
-;CHECK: ret
define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+; X32-LABEL: neg_masks:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups (%ecx), %ymm0
+; X32-NEXT: vcmpltps (%eax), %ymm0, %ymm0
+; X32-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: neg_masks:
+; X64: ## BB#0:
+; X64-NEXT: vmovups (%rsi), %ymm0
+; X64-NEXT: vcmpltps (%rdi), %ymm0, %ymm0
+; X64-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: vmovaps %ymm0, (%rax)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%v0 = load <8 x float>, <8 x float>* %a, align 16
%v1 = load <8 x float>, <8 x float>* %b, align 16
%m0 = fcmp olt <8 x float> %v1, %v0
diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll
index bb1104d85d87..7d93c332f61c 100644
--- a/test/CodeGen/X86/vararg-callee-cleanup.ll
+++ b/test/CodeGen/X86/vararg-callee-cleanup.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-pc-windows < %s | FileCheck %s
+; RUN: llc -mtriple=i686-pc-windows -no-x86-call-frame-opt < %s | FileCheck %s
target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
diff --git a/test/CodeGen/X86/vec-sign.ll b/test/CodeGen/X86/vec-sign.ll
deleted file mode 100644
index b3d85fd6ec7b..000000000000
--- a/test/CodeGen/X86/vec-sign.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=nehalem | FileCheck %s
-
-define <4 x i32> @signd(<4 x i32> %a, <4 x i32> %b) nounwind {
-entry:
-; CHECK-LABEL: signd:
-; CHECK: psignd
-; CHECK-NOT: sub
-; CHECK: ret
- %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
- %sub = sub nsw <4 x i32> zeroinitializer, %a
- %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
- %1 = and <4 x i32> %a, %0
- %2 = and <4 x i32> %b.lobit, %sub
- %cond = or <4 x i32> %1, %2
- ret <4 x i32> %cond
-}
-
-define <4 x i32> @blendvb(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) nounwind {
-entry:
-; CHECK-LABEL: blendvb:
-; CHECK: pblendvb
-; CHECK: ret
- %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
- %sub = sub nsw <4 x i32> zeroinitializer, %a
- %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
- %1 = and <4 x i32> %c, %0
- %2 = and <4 x i32> %a, %b.lobit
- %cond = or <4 x i32> %1, %2
- ret <4 x i32> %cond
-}
diff --git a/test/CodeGen/X86/vec_compare-sse4.ll b/test/CodeGen/X86/vec_compare-sse4.ll
index 084d61134206..714701897918 100644
--- a/test/CodeGen/X86/vec_compare-sse4.ll
+++ b/test/CodeGen/X86/vec_compare-sse4.ll
@@ -1,35 +1,66 @@
-; RUN: llc < %s -march=x86 -mattr=-sse3,+sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86 -mattr=-sse4.2,+sse4.1 | FileCheck %s -check-prefix=SSE41
-; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -check-prefix=SSE42
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=-sse3,+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=-sse4.2,+sse4.1 | FileCheck %s --check-prefix=SSE41
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42
define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind {
-; SSE42-LABEL: test1:
-; SSE42: pcmpgtq
-; SSE42: ret
-; SSE41-LABEL: test1:
-; SSE41-NOT: pcmpgtq
-; SSE41: ret
; SSE2-LABEL: test1:
-; SSE2-NOT: pcmpgtq
-; SSE2: ret
-
- %C = icmp sgt <2 x i64> %A, %B
+; SSE2: ## BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retl
+;
+; SSE41-LABEL: test1:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; SSE42-LABEL: test1:
+; SSE42: ## BB#0:
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: retl
+ %C = icmp sgt <2 x i64> %A, %B
%D = sext <2 x i1> %C to <2 x i64>
- ret <2 x i64> %D
+ ret <2 x i64> %D
}
define <2 x i64> @test2(<2 x i64> %A, <2 x i64> %B) nounwind {
-; SSE42-LABEL: test2:
-; SSE42: pcmpeqq
-; SSE42: ret
-; SSE41-LABEL: test2:
-; SSE41: pcmpeqq
-; SSE41: ret
; SSE2-LABEL: test2:
-; SSE2-NOT: pcmpeqq
-; SSE2: ret
-
- %C = icmp eq <2 x i64> %A, %B
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: retl
+;
+; SSE41-LABEL: test2:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; SSE42-LABEL: test2:
+; SSE42: ## BB#0:
+; SSE42-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE42-NEXT: retl
+ %C = icmp eq <2 x i64> %A, %B
%D = sext <2 x i1> %C to <2 x i64>
- ret <2 x i64> %D
+ ret <2 x i64> %D
}
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 66114bc9c6bc..e151317c6585 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -121,22 +121,22 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind {
; CHECK-LABEL: prompop:
; CHECK: # BB#0:
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: psubq %xmm1, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: pand %xmm1, %xmm3
; CHECK-NEXT: psrlq $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: paddq %xmm3, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $4, %xmm1
; CHECK-NEXT: paddq %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: psadbw %xmm0, %xmm1
+; CHECK-NEXT: psadbw %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
diff --git a/test/CodeGen/X86/vec_ext_inreg.ll b/test/CodeGen/X86/vec_ext_inreg.ll
index 02b16a79f4a0..1ee4b24b62f2 100644
--- a/test/CodeGen/X86/vec_ext_inreg.ll
+++ b/test/CodeGen/X86/vec_ext_inreg.ll
@@ -1,36 +1,108 @@
-; RUN: llc < %s -march=x86-64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
define <8 x i32> @a(<8 x i32> %a) nounwind {
+; SSE-LABEL: a:
+; SSE: # BB#0:
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: a:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: a:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT: retq
%b = trunc <8 x i32> %a to <8 x i16>
%c = sext <8 x i16> %b to <8 x i32>
ret <8 x i32> %c
}
define <3 x i32> @b(<3 x i32> %a) nounwind {
+; SSE-LABEL: b:
+; SSE: # BB#0:
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: b:
+; AVX: # BB#0:
+; AVX-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX-NEXT: retq
%b = trunc <3 x i32> %a to <3 x i16>
%c = sext <3 x i16> %b to <3 x i32>
ret <3 x i32> %c
}
define <1 x i32> @c(<1 x i32> %a) nounwind {
+; ALL-LABEL: c:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: retq
%b = trunc <1 x i32> %a to <1 x i16>
%c = sext <1 x i16> %b to <1 x i32>
ret <1 x i32> %c
}
define <8 x i32> @d(<8 x i32> %a) nounwind {
+; SSE-LABEL: d:
+; SSE: # BB#0:
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: andps %xmm2, %xmm0
+; SSE-NEXT: andps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: d:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: d:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
%b = trunc <8 x i32> %a to <8 x i16>
%c = zext <8 x i16> %b to <8 x i32>
ret <8 x i32> %c
}
define <3 x i32> @e(<3 x i32> %a) nounwind {
+; SSE-LABEL: e:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: e:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
+; AVX-NEXT: retq
%b = trunc <3 x i32> %a to <3 x i16>
%c = zext <3 x i16> %b to <3 x i32>
ret <3 x i32> %c
}
define <1 x i32> @f(<1 x i32> %a) nounwind {
+; ALL-LABEL: f:
+; ALL: # BB#0:
+; ALL-NEXT: movzwl %di, %eax
+; ALL-NEXT: retq
%b = trunc <1 x i32> %a to <1 x i16>
%c = zext <1 x i16> %b to <1 x i32>
ret <1 x i32> %c
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
index abb07233d35e..7286b4c403b9 100644
--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; When extracting multiple consecutive elements from a larger
; vector into a smaller one, do it efficiently. We should use
@@ -8,11 +9,18 @@
; Extracting the low elements only requires using the right kind of store.
define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
-; CHECK-LABEL: low_v8f32_to_v4f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps %xmm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: low_v8f32_to_v4f32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: low_v8f32_to_v4f32:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %xmm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ext0 = extractelement <8 x float> %v, i32 0
%ext1 = extractelement <8 x float> %v, i32 1
%ext2 = extractelement <8 x float> %v, i32 2
@@ -27,11 +35,18 @@ define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
; Extracting the high elements requires just one AVX instruction.
define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
-; CHECK-LABEL: high_v8f32_to_v4f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: high_v8f32_to_v4f32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: high_v8f32_to_v4f32:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ext0 = extractelement <8 x float> %v, i32 4
%ext1 = extractelement <8 x float> %v, i32 5
%ext2 = extractelement <8 x float> %v, i32 6
@@ -48,11 +63,18 @@ define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
; if we were actually using the vector in this function and
; have AVX2, we should generate vextracti128 (the int version).
define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
-; CHECK-LABEL: high_v8i32_to_v4i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: high_v8i32_to_v4i32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: high_v8i32_to_v4i32:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ext0 = extractelement <8 x i32> %v, i32 4
%ext1 = extractelement <8 x i32> %v, i32 5
%ext2 = extractelement <8 x i32> %v, i32 6
@@ -67,11 +89,18 @@ define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
; Make sure that element size doesn't alter the codegen.
define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
-; CHECK-LABEL: high_v4f64_to_v2f64:
-; CHECK: # BB#0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: high_v4f64_to_v2f64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: high_v4f64_to_v2f64:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ext0 = extractelement <4 x double> %v, i32 2
%ext1 = extractelement <4 x double> %v, i32 3
%ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -84,14 +113,25 @@ define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
; FIXME - Ideally these should just call VMOVD/VMOVQ/VMOVSS/VMOVSD
define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) {
-; CHECK-LABEL: legal_vzmovl_2i32_8i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; CHECK-NEXT: vmovaps %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: legal_vzmovl_2i32_8i32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: legal_vzmovl_2i32_8i32:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X64-NEXT: vmovaps %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ld = load <2 x i32>, <2 x i32>* %in, align 8
%ext = extractelement <2 x i32> %ld, i64 0
%ins = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %ext, i64 0
@@ -100,14 +140,25 @@ define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) {
}
define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
-; CHECK-LABEL: legal_vzmovl_2i64_4i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovupd (%rdi), %xmm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; CHECK-NEXT: vmovapd %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: legal_vzmovl_2i64_4i64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovupd (%ecx), %xmm0
+; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X32-NEXT: vmovapd %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: legal_vzmovl_2i64_4i64:
+; X64: # BB#0:
+; X64-NEXT: vmovupd (%rdi), %xmm0
+; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X64-NEXT: vmovapd %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ld = load <2 x i64>, <2 x i64>* %in, align 8
%ext = extractelement <2 x i64> %ld, i64 0
%ins = insertelement <4 x i64> <i64 undef, i64 0, i64 0, i64 0>, i64 %ext, i64 0
@@ -116,14 +167,23 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
}
define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
-; CHECK-LABEL: legal_vzmovl_2f32_8f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; CHECK-NEXT: vmovaps %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: legal_vzmovl_2f32_8f32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: legal_vzmovl_2f32_8f32:
+; X64: # BB#0:
+; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X64-NEXT: vmovaps %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ld = load <2 x float>, <2 x float>* %in, align 8
%ext = extractelement <2 x float> %ld, i64 0
%ins = insertelement <8 x float> <float undef, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, float %ext, i64 0
@@ -132,14 +192,25 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
}
define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
-; CHECK-LABEL: legal_vzmovl_2f64_4f64:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovupd (%rdi), %xmm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; CHECK-NEXT: vmovapd %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: legal_vzmovl_2f64_4f64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovupd (%ecx), %xmm0
+; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X32-NEXT: vmovapd %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: legal_vzmovl_2f64_4f64:
+; X64: # BB#0:
+; X64-NEXT: vmovupd (%rdi), %xmm0
+; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X64-NEXT: vmovapd %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ld = load <2 x double>, <2 x double>* %in, align 8
%ext = extractelement <2 x double> %ld, i64 0
%ins = insertelement <4 x double> <double undef, double 0.0, double 0.0, double 0.0>, double %ext, i64 0
diff --git a/test/CodeGen/X86/vec_extract-mmx.ll b/test/CodeGen/X86/vec_extract-mmx.ll
index 780066d2da15..329437cfedab 100644
--- a/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/test/CodeGen/X86/vec_extract-mmx.ll
@@ -1,12 +1,35 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
-define i32 @test0(<1 x i64>* %v4) {
-; CHECK-LABEL: test0:
-; CHECK: # BB#0:{{.*}} %entry
-; CHECK: pshufw $238, (%[[REG:[a-z]+]]), %mm0
-; CHECK-NEXT: movd %mm0, %eax
-; CHECK-NEXT: addl $32, %eax
-; CHECK-NEXT: retq
+define i32 @test0(<1 x i64>* %v4) nounwind {
+; X32-LABEL: test0:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %ecx, (%esp)
+; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3]
+; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test0:
+; X64: # BB#0: # %entry
+; X64-NEXT: pshufw $238, (%rdi), %mm0 # mm0 = mem[2,3,2,3]
+; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: addl $32, %eax
+; X64-NEXT: retq
entry:
%v5 = load <1 x i64>, <1 x i64>* %v4, align 8
%v12 = bitcast <1 x i64> %v5 to <4 x i16>
@@ -21,14 +44,32 @@ entry:
ret i32 %v20
}
-define i32 @test1(i32* nocapture readonly %ptr) {
-; CHECK-LABEL: test1:
-; CHECK: # BB#0:{{.*}} %entry
-; CHECK: movd (%[[REG]]), %mm0
-; CHECK-NEXT: pshufw $232, %mm0, %mm0
-; CHECK-NEXT: movd %mm0, %eax
-; CHECK-NEXT: emms
-; CHECK-NEXT: retq
+define i32 @test1(i32* nocapture readonly %ptr) nounwind {
+; X32-LABEL: test1:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movd (%eax), %mm0
+; X32-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
+; X32-NEXT: movq %mm0, (%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: emms
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd (%rdi), %mm0
+; X64-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
+; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: emms
+; X64-NEXT: retq
entry:
%0 = load i32, i32* %ptr, align 4
%1 = insertelement <2 x i32> undef, i32 %0, i32 0
@@ -47,13 +88,30 @@ entry:
ret i32 %12
}
-define i32 @test2(i32* nocapture readonly %ptr) {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0:{{.*}} %entry
-; CHECK: pshufw $232, (%[[REG]]), %mm0
-; CHECK-NEXT: movd %mm0, %eax
-; CHECK-NEXT: emms
-; CHECK-NEXT: retq
+define i32 @test2(i32* nocapture readonly %ptr) nounwind {
+; X32-LABEL: test2:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3]
+; X32-NEXT: movq %mm0, (%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: emms
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0: # %entry
+; X64-NEXT: pshufw $232, (%rdi), %mm0 # mm0 = mem[0,2,2,3]
+; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: emms
+; X64-NEXT: retq
entry:
%0 = bitcast i32* %ptr to x86_mmx*
%1 = load x86_mmx, x86_mmx* %0, align 8
@@ -67,5 +125,48 @@ entry:
ret i32 %7
}
+define i32 @test3(x86_mmx %a) nounwind {
+; X32-LABEL: test3:
+; X32: # BB#0:
+; X32-NEXT: movd %mm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0:
+; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: retq
+ %tmp0 = bitcast x86_mmx %a to <2 x i32>
+ %tmp1 = extractelement <2 x i32> %tmp0, i32 0
+ ret i32 %tmp1
+}
+
+; Verify we don't muck with extractelts from the upper lane.
+define i32 @test4(x86_mmx %a) nounwind {
+; X32-LABEL: test4:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: movq %mm0, (%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # BB#0:
+; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: retq
+ %tmp0 = bitcast x86_mmx %a to <2 x i32>
+ %tmp1 = extractelement <2 x i32> %tmp0, i32 1
+ ret i32 %tmp1
+}
+
declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index 9f4210f7847e..f073f1538d2e 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -1,60 +1,79 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
define void @t1(float* %R, <4 x float>* %P1) nounwind {
-; CHECK-LABEL: t1:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movss 12(%ecx), %xmm0
-; CHECK-NEXT: movss %xmm0, (%eax)
-; CHECK-NEXT: retl
-
- %X = load <4 x float>, <4 x float>* %P1
- %tmp = extractelement <4 x float> %X, i32 3
- store float %tmp, float* %R
- ret void
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss %xmm0, (%rdi)
+; X64-NEXT: retq
+ %X = load <4 x float>, <4 x float>* %P1
+ %tmp = extractelement <4 x float> %X, i32 3
+ store float %tmp, float* %R
+ ret void
}
define float @t2(<4 x float>* %P1) nounwind {
-; CHECK-LABEL: t2:
-; CHECK: # BB#0:
-; CHECK-NEXT: pushl %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movapd (%eax), %xmm0
-; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: movss %xmm0, (%esp)
-; CHECK-NEXT: flds (%esp)
-; CHECK-NEXT: popl %eax
-; CHECK-NEXT: retl
-
- %X = load <4 x float>, <4 x float>* %P1
- %tmp = extractelement <4 x float> %X, i32 2
- ret float %tmp
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: movss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT: retq
+ %X = load <4 x float>, <4 x float>* %P1
+ %tmp = extractelement <4 x float> %X, i32 2
+ ret float %tmp
}
define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
-; CHECK-LABEL: t3:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl 12(%ecx), %ecx
-; CHECK-NEXT: movl %ecx, (%eax)
-; CHECK-NEXT: retl
-
- %X = load <4 x i32>, <4 x i32>* %P1
- %tmp = extractelement <4 x i32> %X, i32 3
- store i32 %tmp, i32* %R
- ret void
+; X32-LABEL: t3:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 12(%ecx), %ecx
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0:
+; X64-NEXT: movl 12(%rsi), %eax
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: retq
+ %X = load <4 x i32>, <4 x i32>* %P1
+ %tmp = extractelement <4 x i32> %X, i32 3
+ store i32 %tmp, i32* %R
+ ret void
}
define i32 @t4(<4 x i32>* %P1) nounwind {
-; CHECK-LABEL: t4:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl 12(%eax), %eax
-; CHECK-NEXT: retl
-
- %X = load <4 x i32>, <4 x i32>* %P1
- %tmp = extractelement <4 x i32> %X, i32 3
- ret i32 %tmp
+; X32-LABEL: t4:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 12(%eax), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: t4:
+; X64: # BB#0:
+; X64-NEXT: movl 12(%rdi), %eax
+; X64-NEXT: retq
+ %X = load <4 x i32>, <4 x i32>* %P1
+ %tmp = extractelement <4 x i32> %X, i32 3
+ ret i32 %tmp
}
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 3b478880590d..47f719d9e32e 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,74 +1,104 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 | FileCheck %s
-
-target triple = "x86_64-unknown-linux-gnu"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define void @test1(<4 x float>* %F, float* %f) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movaps (%ecx), %xmm0
-; CHECK-NEXT: addps %xmm0, %xmm0
-; CHECK-NEXT: movss %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X32-LABEL: test1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movaps (%ecx), %xmm0
+; X32-NEXT: addps %xmm0, %xmm0
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: movss %xmm0, (%rsi)
+; X64-NEXT: retq
entry:
- %tmp = load <4 x float>, <4 x float>* %F ; <<4 x float>> [#uses=2]
- %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1]
- %tmp2 = extractelement <4 x float> %tmp7, i32 0 ; <float> [#uses=1]
- store float %tmp2, float* %f
- ret void
+ %tmp = load <4 x float>, <4 x float>* %F
+ %tmp7 = fadd <4 x float> %tmp, %tmp
+ %tmp2 = extractelement <4 x float> %tmp7, i32 0
+ store float %tmp2, float* %f
+ ret void
}
define float @test2(<4 x float>* %F, float* %f) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: pushl %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movaps (%eax), %xmm0
-; CHECK-NEXT: addps %xmm0, %xmm0
-; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: movss %xmm0, (%esp)
-; CHECK-NEXT: flds (%esp)
-; CHECK-NEXT: popl %eax
-; CHECK-NEXT: retl
+; X32-LABEL: test2:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: addps %xmm0, %xmm0
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: retq
entry:
- %tmp = load <4 x float>, <4 x float>* %F ; <<4 x float>> [#uses=2]
- %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1]
- %tmp2 = extractelement <4 x float> %tmp7, i32 2 ; <float> [#uses=1]
- ret float %tmp2
+ %tmp = load <4 x float>, <4 x float>* %F
+ %tmp7 = fadd <4 x float> %tmp, %tmp
+ %tmp2 = extractelement <4 x float> %tmp7, i32 2
+ ret float %tmp2
}
define void @test3(float* %R, <4 x float>* %P1) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movss 12(%ecx), %xmm0
-; CHECK-NEXT: movss %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X32-LABEL: test3:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0: # %entry
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
- %X = load <4 x float>, <4 x float>* %P1 ; <<4 x float>> [#uses=1]
- %tmp = extractelement <4 x float> %X, i32 3 ; <float> [#uses=1]
- store float %tmp, float* %R
- ret void
+ %X = load <4 x float>, <4 x float>* %P1
+ %tmp = extractelement <4 x float> %X, i32 3
+ store float %tmp, float* %R
+ ret void
}
define double @test4(double %A) nounwind {
-; CHECK-LABEL: test4:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subl $12, %esp
-; CHECK-NEXT: calll foo
-; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT: movsd %xmm0, (%esp)
-; CHECK-NEXT: fldl (%esp)
-; CHECK-NEXT: addl $12, %esp
-; CHECK-NEXT: retl
+; X32-LABEL: test4:
+; X32: # BB#0: # %entry
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: calll foo
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: movsd %xmm0, (%esp)
+; X32-NEXT: fldl (%esp)
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # BB#0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
+; X64-NEXT: callq foo
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: addsd (%rsp), %xmm0 # 8-byte Folded Reload
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
entry:
- %tmp1 = call <2 x double> @foo( ) ; <<2 x double>> [#uses=1]
- %tmp2 = extractelement <2 x double> %tmp1, i32 1 ; <double> [#uses=1]
- %tmp3 = fadd double %tmp2, %A ; <double> [#uses=1]
- ret double %tmp3
+ %tmp1 = call <2 x double> @foo( )
+ %tmp2 = extractelement <2 x double> %tmp1, i32 1
+ %tmp3 = fadd double %tmp2, %A
+ ret double %tmp3
}
declare <2 x double> @foo()
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
index 54f33b2bd224..0f5e09914890 100644
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -1,37 +1,64 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64
-define <2 x double> @fabs_v2f64(<2 x double> %p)
-{
- ; CHECK-LABEL: fabs_v2f64
- ; CHECK: vandpd
+define <2 x double> @fabs_v2f64(<2 x double> %p) {
+; X32-LABEL: fabs_v2f64:
+; X32: # BB#0:
+; X32-NEXT: vandpd .LCPI0_0, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v2f64:
+; X64: # BB#0:
+; X64-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
%t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
-define <4 x float> @fabs_v4f32(<4 x float> %p)
-{
- ; CHECK-LABEL: fabs_v4f32
- ; CHECK: vandps
+define <4 x float> @fabs_v4f32(<4 x float> %p) {
+; X32-LABEL: fabs_v4f32:
+; X32: # BB#0:
+; X32-NEXT: vandps .LCPI1_0, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v4f32:
+; X64: # BB#0:
+; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
%t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
-define <4 x double> @fabs_v4f64(<4 x double> %p)
-{
- ; CHECK-LABEL: fabs_v4f64
- ; CHECK: vandpd
+define <4 x double> @fabs_v4f64(<4 x double> %p) {
+; X32-LABEL: fabs_v4f64:
+; X32: # BB#0:
+; X32-NEXT: vandpd .LCPI2_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v4f64:
+; X64: # BB#0:
+; X64-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
%t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
-define <8 x float> @fabs_v8f32(<8 x float> %p)
-{
- ; CHECK-LABEL: fabs_v8f32
- ; CHECK: vandps
+define <8 x float> @fabs_v8f32(<8 x float> %p) {
+; X32-LABEL: fabs_v8f32:
+; X32: # BB#0:
+; X32-NEXT: vandps .LCPI3_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v8f32:
+; X64: # BB#0:
+; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
%t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
ret <8 x float> %t
}
@@ -44,7 +71,7 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
; that has the sign bits turned off.
;
; So instead of something like this:
-; movabsq (constant pool load of mask for sign bits)
+; movabsq (constant pool load of mask for sign bits)
; vmovq (move from integer register to vector/fp register)
; vandps (mask off sign bits)
; vmovq (move vector/fp register back to integer return register)
@@ -53,9 +80,16 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
; mov (put constant value in return register)
define i64 @fabs_v2f32_1() {
-; CHECK-LABEL: fabs_v2f32_1:
-; CHECK: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
-; CHECK-NEXT: retq
+; X32-LABEL: fabs_v2f32_1:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v2f32_1:
+; X64: # BB#0:
+; X64-NEXT: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
+; X64-NEXT: retq
%bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
%ret = bitcast <2 x float> %fabs to i64
@@ -63,9 +97,16 @@ define i64 @fabs_v2f32_1() {
}
define i64 @fabs_v2f32_2() {
-; CHECK-LABEL: fabs_v2f32_2:
-; CHECK: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-NEXT: retq
+; X32-LABEL: fabs_v2f32_2:
+; X32: # BB#0:
+; X32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v2f32_2:
+; X64: # BB#0:
+; X64-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: retq
%bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
%ret = bitcast <2 x float> %fabs to i64
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
index f35c4ab4a76e..4fa79bc7fa8b 100644
--- a/test/CodeGen/X86/vec_floor.ll
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -1,181 +1,312 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
-
-define <2 x double> @floor_v2f64(<2 x double> %p)
-{
- ; CHECK: floor_v2f64
- ; CHECK: vroundpd
+define <2 x double> @floor_v2f64(<2 x double> %p) {
+; SSE41-LABEL: floor_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $9, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
-define <4 x float> @floor_v4f32(<4 x float> %p)
-{
- ; CHECK: floor_v4f32
- ; CHECK: vroundps
+define <4 x float> @floor_v4f32(<4 x float> %p) {
+; SSE41-LABEL: floor_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $9, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $9, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
-define <4 x double> @floor_v4f64(<4 x double> %p)
-{
- ; CHECK: floor_v4f64
- ; CHECK: vroundpd
+define <4 x double> @floor_v4f64(<4 x double> %p){
+; SSE41-LABEL: floor_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
-define <8 x float> @floor_v8f32(<8 x float> %p)
-{
- ; CHECK: floor_v8f32
- ; CHECK: vroundps
+define <8 x float> @floor_v8f32(<8 x float> %p) {
+; SSE41-LABEL: floor_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $9, %xmm0, %xmm0
+; SSE41-NEXT: roundps $9, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $9, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
-define <2 x double> @ceil_v2f64(<2 x double> %p)
-{
- ; CHECK: ceil_v2f64
- ; CHECK: vroundpd
+define <2 x double> @ceil_v2f64(<2 x double> %p) {
+; SSE41-LABEL: ceil_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
-define <4 x float> @ceil_v4f32(<4 x float> %p)
-{
- ; CHECK: ceil_v4f32
- ; CHECK: vroundps
+define <4 x float> @ceil_v4f32(<4 x float> %p) {
+; SSE41-LABEL: ceil_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $10, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $10, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
-define <4 x double> @ceil_v4f64(<4 x double> %p)
-{
- ; CHECK: ceil_v4f64
- ; CHECK: vroundpd
+define <4 x double> @ceil_v4f64(<4 x double> %p) {
+; SSE41-LABEL: ceil_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
-define <8 x float> @ceil_v8f32(<8 x float> %p)
-{
- ; CHECK: ceil_v8f32
- ; CHECK: vroundps
+define <8 x float> @ceil_v8f32(<8 x float> %p) {
+; SSE41-LABEL: ceil_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $10, %xmm0, %xmm0
+; SSE41-NEXT: roundps $10, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $10, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
-define <2 x double> @trunc_v2f64(<2 x double> %p)
-{
- ; CHECK: trunc_v2f64
- ; CHECK: vroundpd
+define <2 x double> @trunc_v2f64(<2 x double> %p) {
+; SSE41-LABEL: trunc_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $11, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
-define <4 x float> @trunc_v4f32(<4 x float> %p)
-{
- ; CHECK: trunc_v4f32
- ; CHECK: vroundps
+define <4 x float> @trunc_v4f32(<4 x float> %p) {
+; SSE41-LABEL: trunc_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $11, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $11, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
-define <4 x double> @trunc_v4f64(<4 x double> %p)
-{
- ; CHECK: trunc_v4f64
- ; CHECK: vroundpd
+define <4 x double> @trunc_v4f64(<4 x double> %p) {
+; SSE41-LABEL: trunc_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
-define <8 x float> @trunc_v8f32(<8 x float> %p)
-{
- ; CHECK: trunc_v8f32
- ; CHECK: vroundps
+define <8 x float> @trunc_v8f32(<8 x float> %p) {
+; SSE41-LABEL: trunc_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $11, %xmm0, %xmm0
+; SSE41-NEXT: roundps $11, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
-define <2 x double> @rint_v2f64(<2 x double> %p)
-{
- ; CHECK: rint_v2f64
- ; CHECK: vroundpd
+define <2 x double> @rint_v2f64(<2 x double> %p) {
+; SSE41-LABEL: rint_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $4, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.rint.v2f64(<2 x double> %p)
-define <4 x float> @rint_v4f32(<4 x float> %p)
-{
- ; CHECK: rint_v4f32
- ; CHECK: vroundps
+define <4 x float> @rint_v4f32(<4 x float> %p) {
+; SSE41-LABEL: rint_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $4, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $4, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.rint.v4f32(<4 x float> %p)
-define <4 x double> @rint_v4f64(<4 x double> %p)
-{
- ; CHECK: rint_v4f64
- ; CHECK: vroundpd
+define <4 x double> @rint_v4f64(<4 x double> %p) {
+; SSE41-LABEL: rint_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $4, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.rint.v4f64(<4 x double> %p)
-define <8 x float> @rint_v8f32(<8 x float> %p)
-{
- ; CHECK: rint_v8f32
- ; CHECK: vroundps
+define <8 x float> @rint_v8f32(<8 x float> %p) {
+; SSE41-LABEL: rint_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $4, %xmm0, %xmm0
+; SSE41-NEXT: roundps $4, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.rint.v8f32(<8 x float> %p)
-define <2 x double> @nearbyint_v2f64(<2 x double> %p)
-{
- ; CHECK: nearbyint_v2f64
- ; CHECK: vroundpd
+define <2 x double> @nearbyint_v2f64(<2 x double> %p) {
+; SSE41-LABEL: nearbyint_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $12, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
-define <4 x float> @nearbyint_v4f32(<4 x float> %p)
-{
- ; CHECK: nearbyint_v4f32
- ; CHECK: vroundps
+define <4 x float> @nearbyint_v4f32(<4 x float> %p) {
+; SSE41-LABEL: nearbyint_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $12, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $12, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
-define <4 x double> @nearbyint_v4f64(<4 x double> %p)
-{
- ; CHECK: nearbyint_v4f64
- ; CHECK: vroundpd
+define <4 x double> @nearbyint_v4f64(<4 x double> %p) {
+; SSE41-LABEL: nearbyint_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $12, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $12, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
-define <8 x float> @nearbyint_v8f32(<8 x float> %p)
-{
- ; CHECK: nearbyint_v8f32
- ; CHECK: vroundps
+define <8 x float> @nearbyint_v8f32(<8 x float> %p) {
+; SSE41-LABEL: nearbyint_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $12, %xmm0, %xmm0
+; SSE41-NEXT: roundps $12, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $12, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
ret <8 x float> %t
}
@@ -186,43 +317,85 @@ declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
;
define <2 x double> @const_floor_v2f64() {
- ; CHECK: const_floor_v2f64
- ; CHECK: movaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+; SSE41-LABEL: const_floor_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_floor_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_floor_v4f32() {
- ; CHECK: const_floor_v4f32
- ; CHECK: movaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; SSE41-LABEL: const_floor_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_floor_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
define <2 x double> @const_ceil_v2f64() {
- ; CHECK: const_ceil_v2f64
- ; CHECK: movaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+; SSE41-LABEL: const_ceil_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_ceil_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_ceil_v4f32() {
- ; CHECK: const_ceil_v4f32
- ; CHECK: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+; SSE41-LABEL: const_ceil_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_ceil_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
define <2 x double> @const_trunc_v2f64() {
- ; CHECK: const_trunc_v2f64
- ; CHECK: movaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+; SSE41-LABEL: const_trunc_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_trunc_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_trunc_v4f32() {
- ; CHECK: const_trunc_v4f32
- ; CHECK: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; SSE41-LABEL: const_trunc_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_trunc_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index a85ae984d8e6..78799ff04fe1 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll
@@ -1,25 +1,43 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE2
; FNEG is defined as subtraction from -0.0.
; This test verifies that we use an xor with a constant to flip the sign bits; no subtraction needed.
-define <4 x float> @t1(<4 x float> %Q) {
-; CHECK-LABEL: t1:
-; CHECK: xorps {{.*}}LCPI0_0{{.*}}, %xmm0
-; CHECK-NEXT: retq
- %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
- ret <4 x float> %tmp
+define <4 x float> @t1(<4 x float> %Q) nounwind {
+; X32-SSE-LABEL: t1:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps .LCPI0_0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: t1:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: retq
+ %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
+ ret <4 x float> %tmp
}
; This test verifies that we generate an FP subtraction because "0.0 - x" is not an fneg.
-define <4 x float> @t2(<4 x float> %Q) {
-; CHECK-LABEL: t2:
-; CHECK: xorps %[[X:xmm[0-9]+]], %[[X]]
-; CHECK-NEXT: subps %xmm0, %[[X]]
-; CHECK-NEXT: movaps %[[X]], %xmm0
-; CHECK-NEXT: retq
- %tmp = fsub <4 x float> zeroinitializer, %Q
- ret <4 x float> %tmp
+define <4 x float> @t2(<4 x float> %Q) nounwind {
+; X32-SSE-LABEL: t2:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps %xmm1, %xmm1
+; X32-SSE-NEXT: subps %xmm0, %xmm1
+; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: t2:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps %xmm1, %xmm1
+; X64-SSE-NEXT: subps %xmm0, %xmm1
+; X64-SSE-NEXT: movaps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %tmp = fsub <4 x float> zeroinitializer, %Q
+ ret <4 x float> %tmp
}
; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely.
@@ -31,14 +49,51 @@ define <4 x float> @t2(<4 x float> %Q) {
; We should generate:
; movabsq (put sign bit mask in integer register))
; xorq (flip sign bits)
-; movd (move to xmm return register)
+; movd (move to xmm return register)
-define <2 x float> @fneg_bitcast(i64 %i) {
-; CHECK-LABEL: fneg_bitcast:
-; CHECK: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
-; CHECK-NEXT: xorq %rdi, %rax
-; CHECK-NEXT: movd %rax, %xmm0
-; CHECK-NEXT: retq
+define <2 x float> @fneg_bitcast(i64 %i) nounwind {
+; X32-SSE1-LABEL: fneg_bitcast:
+; X32-SSE1: # BB#0:
+; X32-SSE1-NEXT: pushl %ebp
+; X32-SSE1-NEXT: movl %esp, %ebp
+; X32-SSE1-NEXT: andl $-16, %esp
+; X32-SSE1-NEXT: subl $32, %esp
+; X32-SSE1-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X32-SSE1-NEXT: movl 12(%ebp), %ecx
+; X32-SSE1-NEXT: xorl %eax, %ecx
+; X32-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE1-NEXT: xorl 8(%ebp), %eax
+; X32-SSE1-NEXT: movl %eax, (%esp)
+; X32-SSE1-NEXT: movaps (%esp), %xmm0
+; X32-SSE1-NEXT: movl %ebp, %esp
+; X32-SSE1-NEXT: popl %ebp
+; X32-SSE1-NEXT: retl
+;
+; X32-SSE2-LABEL: fneg_bitcast:
+; X32-SSE2: # BB#0:
+; X32-SSE2-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT: xorl %eax, %ecx
+; X32-SSE2-NEXT: movd %ecx, %xmm1
+; X32-SSE2-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT: movd %eax, %xmm0
+; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT: retl
+;
+; X64-SSE1-LABEL: fneg_bitcast:
+; X64-SSE1: # BB#0:
+; X64-SSE1-NEXT: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
+; X64-SSE1-NEXT: xorq %rdi, %rax
+; X64-SSE1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE1-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE1-NEXT: retq
+;
+; X64-SSE2-LABEL: fneg_bitcast:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
+; X64-SSE2-NEXT: xorq %rdi, %rax
+; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: retq
%bitcast = bitcast i64 %i to <2 x float>
%fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
ret <2 x float> %fneg
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 7834b2804247..0ad5ef7ee8f5 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -81,6 +81,7 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
;
; AVX-LABEL: fptosi_4f64_to_2i32:
; AVX: # BB#0:
+; AVX-NEXT: # kill
; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index bb5409b91ee4..5f14324958a2 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,13 +1,39 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1,-avx | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck --check-prefix=AVX %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
; PR11674
define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
-; CHECK-LABEL: fpext_frommem:
-; AVX-LABEL: fpext_frommem:
+; X32-SSE-LABEL: fpext_frommem:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0
+; X32-SSE-NEXT: movups %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fpext_frommem:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtps2pd (%ecx), %xmm0
+; X32-AVX-NEXT: vmovups %xmm0, (%eax)
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fpext_frommem:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0
+; X64-SSE-NEXT: movups %xmm0, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fpext_frommem:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtps2pd (%rdi), %xmm0
+; X64-AVX-NEXT: vmovups %xmm0, (%rsi)
+; X64-AVX-NEXT: retq
entry:
-; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
%0 = load <2 x float>, <2 x float>* %in, align 8
%1 = fpext <2 x float> %0 to <2 x double>
store <2 x double> %1, <2 x double>* %out, align 1
@@ -15,12 +41,40 @@ entry:
}
define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
-; CHECK-LABEL: fpext_frommem4:
-; AVX-LABEL: fpext_frommem4:
+; X32-SSE-LABEL: fpext_frommem4:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0
+; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1
+; X32-SSE-NEXT: movups %xmm1, 16(%eax)
+; X32-SSE-NEXT: movups %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fpext_frommem4:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtps2pd (%ecx), %ymm0
+; X32-AVX-NEXT: vmovups %ymm0, (%eax)
+; X32-AVX-NEXT: vzeroupper
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fpext_frommem4:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0
+; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1
+; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT: movups %xmm0, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fpext_frommem4:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtps2pd (%rdi), %ymm0
+; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
entry:
-; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
-; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
%0 = load <4 x float>, <4 x float>* %in
%1 = fpext <4 x float> %0 to <4 x double>
store <4 x double> %1, <4 x double>* %out, align 1
@@ -28,15 +82,52 @@ entry:
}
define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
-; CHECK-LABEL: fpext_frommem8:
-; AVX-LABEL: fpext_frommem8:
+; X32-SSE-LABEL: fpext_frommem8:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0
+; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1
+; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm2
+; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm3
+; X32-SSE-NEXT: movups %xmm3, 48(%eax)
+; X32-SSE-NEXT: movups %xmm2, 32(%eax)
+; X32-SSE-NEXT: movups %xmm1, 16(%eax)
+; X32-SSE-NEXT: movups %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fpext_frommem8:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtps2pd (%ecx), %ymm0
+; X32-AVX-NEXT: vcvtps2pd 16(%ecx), %ymm1
+; X32-AVX-NEXT: vmovups %ymm1, 32(%eax)
+; X32-AVX-NEXT: vmovups %ymm0, (%eax)
+; X32-AVX-NEXT: vzeroupper
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fpext_frommem8:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0
+; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1
+; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm2
+; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm3
+; X64-SSE-NEXT: movups %xmm3, 48(%rsi)
+; X64-SSE-NEXT: movups %xmm2, 32(%rsi)
+; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT: movups %xmm0, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fpext_frommem8:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtps2pd (%rdi), %ymm0
+; X64-AVX-NEXT: vcvtps2pd 16(%rdi), %ymm1
+; X64-AVX-NEXT: vmovups %ymm1, 32(%rsi)
+; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
entry:
-; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
-; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
-; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
-; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
%0 = load <8 x float>, <8 x float>* %in
%1 = fpext <8 x float> %0 to <8 x double>
store <8 x double> %1, <8 x double>* %out, align 1
@@ -44,11 +135,26 @@ entry:
}
define <2 x double> @fpext_fromconst() {
-; CHECK-LABEL: fpext_fromconst:
-; AVX-LABEL: fpext_fromconst:
+; X32-SSE-LABEL: fpext_fromconst:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fpext_fromconst:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fpext_fromconst:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fpext_fromconst:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
+; X64-AVX-NEXT: retq
entry:
-; CHECK: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
-; AVX: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
%0 = insertelement <2 x float> undef, float 1.0, i32 0
%1 = insertelement <2 x float> %0, float -2.0, i32 1
%2 = fpext <2 x float> %1 to <2 x double>
diff --git a/test/CodeGen/X86/vec_fptrunc.ll b/test/CodeGen/X86/vec_fptrunc.ll
new file mode 100644
index 000000000000..fa22a4af1755
--- /dev/null
+++ b/test/CodeGen/X86/vec_fptrunc.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
+
+define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
+; X32-SSE-LABEL: fptrunc_frommem2:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm0
+; X32-SSE-NEXT: extractps $1, %xmm0, 4(%eax)
+; X32-SSE-NEXT: movss %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fptrunc_frommem2:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtpd2psx (%ecx), %xmm0
+; X32-AVX-NEXT: vextractps $1, %xmm0, 4(%eax)
+; X32-AVX-NEXT: vmovss %xmm0, (%eax)
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fptrunc_frommem2:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm0
+; X64-SSE-NEXT: movlpd %xmm0, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fptrunc_frommem2:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtpd2psx (%rdi), %xmm0
+; X64-AVX-NEXT: vmovlpd %xmm0, (%rsi)
+; X64-AVX-NEXT: retq
+entry:
+ %0 = load <2 x double>, <2 x double>* %in
+ %1 = fptrunc <2 x double> %0 to <2 x float>
+ store <2 x float> %1, <2 x float>* %out, align 1
+ ret void
+}
+
+define void @fptrunc_frommem4(<4 x double>* %in, <4 x float>* %out) {
+; X32-SSE-LABEL: fptrunc_frommem4:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtpd2ps 16(%ecx), %xmm0
+; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm1
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X32-SSE-NEXT: movupd %xmm1, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fptrunc_frommem4:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtpd2psy (%ecx), %xmm0
+; X32-AVX-NEXT: vmovupd %xmm0, (%eax)
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fptrunc_frommem4:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtpd2ps 16(%rdi), %xmm0
+; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm1
+; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-SSE-NEXT: movupd %xmm1, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fptrunc_frommem4:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtpd2psy (%rdi), %xmm0
+; X64-AVX-NEXT: vmovupd %xmm0, (%rsi)
+; X64-AVX-NEXT: retq
+entry:
+ %0 = load <4 x double>, <4 x double>* %in
+ %1 = fptrunc <4 x double> %0 to <4 x float>
+ store <4 x float> %1, <4 x float>* %out, align 1
+ ret void
+}
+
+define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
+; X32-SSE-LABEL: fptrunc_frommem8:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtpd2ps 16(%ecx), %xmm0
+; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm1
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X32-SSE-NEXT: cvtpd2ps 48(%ecx), %xmm0
+; X32-SSE-NEXT: cvtpd2ps 32(%ecx), %xmm2
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; X32-SSE-NEXT: movupd %xmm2, 16(%eax)
+; X32-SSE-NEXT: movupd %xmm1, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fptrunc_frommem8:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtpd2psy (%ecx), %xmm0
+; X32-AVX-NEXT: vcvtpd2psy 32(%ecx), %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovupd %ymm0, (%eax)
+; X32-AVX-NEXT: vzeroupper
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fptrunc_frommem8:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtpd2ps 16(%rdi), %xmm0
+; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm1
+; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-SSE-NEXT: cvtpd2ps 48(%rdi), %xmm0
+; X64-SSE-NEXT: cvtpd2ps 32(%rdi), %xmm2
+; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; X64-SSE-NEXT: movupd %xmm2, 16(%rsi)
+; X64-SSE-NEXT: movupd %xmm1, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fptrunc_frommem8:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtpd2psy (%rdi), %xmm0
+; X64-AVX-NEXT: vcvtpd2psy 32(%rdi), %xmm1
+; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX-NEXT: vmovupd %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+entry:
+ %0 = load <8 x double>, <8 x double>* %in
+ %1 = fptrunc <8 x double> %0 to <8 x float>
+ store <8 x float> %1, <8 x float>* %out, align 1
+ ret void
+}
+
+; FIXME: For exact truncations we should be able to fold this.
+define <4 x float> @fptrunc_fromconst() {
+; X32-SSE-LABEL: fptrunc_fromconst:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: cvtpd2ps .LCPI3_0, %xmm1
+; X32-SSE-NEXT: cvtpd2ps .LCPI3_1, %xmm0
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fptrunc_fromconst:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: vcvtpd2psy .LCPI3_0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fptrunc_fromconst:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtpd2ps {{.*}}(%rip), %xmm1
+; X64-SSE-NEXT: cvtpd2ps {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fptrunc_fromconst:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtpd2psy {{.*}}(%rip), %xmm0
+; X64-AVX-NEXT: retq
+entry:
+ %0 = insertelement <4 x double> undef, double 1.0, i32 0
+ %1 = insertelement <4 x double> %0, double -2.0, i32 1
+ %2 = insertelement <4 x double> %1, double +4.0, i32 2
+ %3 = insertelement <4 x double> %2, double -0.0, i32 3
+ %4 = fptrunc <4 x double> %3 to <4 x float>
+ ret <4 x float> %4
+}
diff --git a/test/CodeGen/X86/vec_i64.ll b/test/CodeGen/X86/vec_i64.ll
index 48ca1ff021d9..e468839ddc23 100644
--- a/test/CodeGen/X86/vec_i64.ll
+++ b/test/CodeGen/X86/vec_i64.ll
@@ -1,22 +1,43 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
-; RUN: grep movq %t | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; Used movq to load i64 into a v2i64 when the top i64 is 0.
define <2 x i64> @foo1(i64* %y) nounwind {
+; X32-LABEL: foo1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: foo1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
entry:
- %tmp1 = load i64, i64* %y, align 8 ; <i64> [#uses=1]
- %s2v = insertelement <2 x i64> undef, i64 %tmp1, i32 0
- %loadl = shufflevector <2 x i64> zeroinitializer, <2 x i64> %s2v, <2 x i32> <i32 2, i32 1>
- ret <2 x i64> %loadl
+ %tmp1 = load i64, i64* %y, align 8
+ %s2v = insertelement <2 x i64> undef, i64 %tmp1, i32 0
+ %loadl = shufflevector <2 x i64> zeroinitializer, <2 x i64> %s2v, <2 x i32> <i32 2, i32 1>
+ ret <2 x i64> %loadl
}
define <4 x float> @foo2(i64* %p) nounwind {
+; X32-LABEL: foo2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: foo2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
entry:
- %load = load i64, i64* %p
- %s2v = insertelement <2 x i64> undef, i64 %load, i32 0
- %loadl = shufflevector <2 x i64> zeroinitializer, <2 x i64> %s2v, <2 x i32> <i32 2, i32 1>
- %0 = bitcast <2 x i64> %loadl to <4 x float>
- ret <4 x float> %0
+ %load = load i64, i64* %p
+ %s2v = insertelement <2 x i64> undef, i64 %load, i32 0
+ %loadl = shufflevector <2 x i64> zeroinitializer, <2 x i64> %s2v, <2 x i32> <i32 2, i32 1>
+ %0 = bitcast <2 x i64> %loadl to <4 x float>
+ ret <4 x float> %0
}
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 565be7a6cc70..8019e11ad4c0 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -1,24 +1,109 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep "(%esp,%eax,4)" | count 4
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
; Inserts and extracts with variable indices must be lowered
; to memory accesses.
define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
+; X32-LABEL: t0:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl $76, (%esp,%eax,4)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: t0:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movl $76, -24(%rsp,%rax,4)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: retq
%t13 = insertelement <4 x i32> %t8, i32 76, i32 %t7
%t9 = extractelement <4 x i32> %t13, i32 0
ret i32 %t9
}
+
define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl $76, %ecx
+; X32-NEXT: pinsrd $0, %ecx, %xmm0
+; X32-NEXT: movdqa %xmm0, (%esp)
+; X32-NEXT: movl (%esp,%eax,4), %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movl $76, %eax
+; X64-NEXT: pinsrd $0, %eax, %xmm0
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movl -24(%rsp,%rax,4), %eax
+; X64-NEXT: retq
%t13 = insertelement <4 x i32> %t8, i32 76, i32 0
%t9 = extractelement <4 x i32> %t13, i32 %t7
ret i32 %t9
}
+
define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movdqa %xmm0, (%esp)
+; X32-NEXT: pinsrd $0, (%esp,%eax,4), %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: pinsrd $0, -24(%rsp,%rax,4), %xmm0
+; X64-NEXT: retq
%t9 = extractelement <4 x i32> %t8, i32 %t7
%t13 = insertelement <4 x i32> %t8, i32 %t9, i32 0
ret <4 x i32> %t13
}
+
define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
+; X32-LABEL: t3:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movd %xmm0, (%esp,%eax,4)
+; X32-NEXT: movaps (%esp), %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movd %xmm0, -24(%rsp,%rax,4)
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: retq
%t9 = extractelement <4 x i32> %t8, i32 0
%t13 = insertelement <4 x i32> %t8, i32 %t9, i32 %t7
ret <4 x i32> %t13
diff --git a/test/CodeGen/X86/vec_ins_extract.ll b/test/CodeGen/X86/vec_ins_extract.ll
index e92f46dbabb5..5ff49eff6df3 100644
--- a/test/CodeGen/X86/vec_ins_extract.ll
+++ b/test/CodeGen/X86/vec_ins_extract.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -scalarrepl -instcombine | \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: opt < %s -sroa -instcombine | \
; RUN: llc -march=x86 -mcpu=yonah | not grep sub.*esp
; This checks that various insert/extract idiom work without going to the
diff --git a/test/CodeGen/X86/vec_insert-2.ll b/test/CodeGen/X86/vec_insert-2.ll
index fe20a474f59a..2e6654185de8 100644
--- a/test/CodeGen/X86/vec_insert-2.ll
+++ b/test/CodeGen/X86/vec_insert-2.ll
@@ -1,42 +1,68 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X64 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
; X32-LABEL: t1:
-; X32: shufps $36
-; X32: ret
-
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
%tmp1 = insertelement <4 x float> %tmp, float %s, i32 3
ret <4 x float> %tmp1
}
define <4 x i32> @t2(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: t2:
-; X32: shufps $36
-; X32: ret
-
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X64-NEXT: retq
%tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 3
ret <4 x i32> %tmp1
}
define <2 x double> @t3(double %s, <2 x double> %tmp) nounwind {
; X32-LABEL: t3:
-; X32: movhpd
-; X32: ret
-
+; X32: # BB#0:
+; X32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-NEXT: retl
+;
; X64-LABEL: t3:
-; X64: unpcklpd
-; X64: ret
-
+; X64: # BB#0:
+; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
%tmp1 = insertelement <2 x double> %tmp, double %s, i32 1
ret <2 x double> %tmp1
}
define <8 x i16> @t4(i16 %s, <8 x i16> %tmp) nounwind {
; X32-LABEL: t4:
-; X32: pinsrw
-; X32: ret
-
+; X32: # BB#0:
+; X32-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t4:
+; X64: # BB#0:
+; X64-NEXT: pinsrw $5, %edi, %xmm0
+; X64-NEXT: retq
%tmp1 = insertelement <8 x i16> %tmp, i16 %s, i32 5
ret <8 x i16> %tmp1
}
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll
index 75244ae0b71a..57a265a0ce30 100644
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -1,10 +1,23 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
-; CHECK-LABEL: t1:
-; CHECK: punpcklqdq
-; CHECK-NEXT: retq
-
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm1
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
%tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
ret <2 x i64> %tmp1
}
diff --git a/test/CodeGen/X86/vec_insert-4.ll b/test/CodeGen/X86/vec_insert-4.ll
index 2c31e56b4af6..c847ac983003 100644
--- a/test/CodeGen/X86/vec_insert-4.ll
+++ b/test/CodeGen/X86/vec_insert-4.ll
@@ -1,11 +1,40 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep 1084227584 | count 1
-
-; ModuleID = '<stdin>'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i686-apple-darwin9.2.2"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin9.2.2 -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9.2.2 -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
+; X32-LABEL: f:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-32, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000
+; X32-NEXT: movaps (%esp), %xmm0
+; X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: f:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $64, %rsp
+; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, (%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movl $1084227584, (%rsp,%rax,4) ## imm = 0x40A00000
+; X64-NEXT: movaps (%rsp), %xmm0
+; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
entry:
- %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b ; <<4 x float>> [#uses=1]
- ret <8 x float> %vecins
+ %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
+ ret <8 x float> %vecins
}
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 14b57e76dc8f..67875b3ef23e 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -1,17 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86 -mattr=+sse2,+ssse3 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+ssse3 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3 | FileCheck %s --check-prefix=X64
+
; There are no MMX operations in @t1
define void @t1(i32 %a, x86_mmx* %P) nounwind {
-; CHECK-LABEL: t1:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: shll $12, %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; CHECK-NEXT: movq %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: shll $12, %ecx
+; X32-NEXT: movd %ecx, %xmm0
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: shll $12, %edi
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rsi)
+; X64-NEXT: retq
%tmp12 = shl i32 %a, 12
%tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
%tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
@@ -21,87 +33,135 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
}
define <4 x float> @t2(<4 x float>* %P) nounwind {
-; CHECK-LABEL: t2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movaps (%eax), %xmm1
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
-; CHECK-NEXT: retl
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm1
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm1
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
ret <4 x float> %tmp2
}
define <4 x float> @t3(<4 x float>* %P) nounwind {
-; CHECK-LABEL: t3:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movapd (%eax), %xmm0
-; CHECK-NEXT: xorpd %xmm1, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT: retl
+; X32-LABEL: t3:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movapd (%eax), %xmm0
+; X32-NEXT: xorpd %xmm1, %xmm1
+; X32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0:
+; X64-NEXT: movapd (%rdi), %xmm0
+; X64-NEXT: xorpd %xmm1, %xmm1
+; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
ret <4 x float> %tmp2
}
define <4 x float> @t4(<4 x float>* %P) nounwind {
-; CHECK-LABEL: t4:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movaps (%eax), %xmm0
-; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; CHECK-NEXT: retl
+; X32-LABEL: t4:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: t4:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
ret <4 x float> %tmp2
}
define <16 x i8> @t5(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t5:
-; CHECK: # BB#0:
-; CHECK-NEXT: psrlw $8, %xmm0
-; CHECK-NEXT: retl
+; X32-LABEL: t5:
+; X32: # BB#0:
+; X32-NEXT: psrlw $8, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t5:
+; X64: # BB#0:
+; X64-NEXT: psrlw $8, %xmm0
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
ret <16 x i8> %s
}
define <16 x i8> @t6(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t6:
-; CHECK: # BB#0:
-; CHECK-NEXT: psrlw $8, %xmm0
-; CHECK-NEXT: retl
+; X32-LABEL: t6:
+; X32: # BB#0:
+; X32-NEXT: psrlw $8, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t6:
+; X64: # BB#0:
+; X64-NEXT: psrlw $8, %xmm0
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %s
}
define <16 x i8> @t7(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t7:
-; CHECK: # BB#0:
-; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; CHECK-NEXT: retl
+; X32-LABEL: t7:
+; X32: # BB#0:
+; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: t7:
+; X64: # BB#0:
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
ret <16 x i8> %s
}
define <16 x i8> @t8(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t8:
-; CHECK: # BB#0:
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
-; CHECK-NEXT: retl
+; X32-LABEL: t8:
+; X32: # BB#0:
+; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: t8:
+; X64: # BB#0:
+; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
ret <16 x i8> %s
}
define <16 x i8> @t9(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t9:
-; CHECK: # BB#0:
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
-; CHECK-NEXT: retl
+; X32-LABEL: t9:
+; X32: # BB#0:
+; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: t9:
+; X64: # BB#0:
+; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 undef, i32 undef>
ret <16 x i8> %s
}
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 4f72c66ecba2..02db6e6d8751 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -1,29 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse4.2 -mtriple=i686-apple-darwin9 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+mmx,+sse4.2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+mmx,+sse4.2 | FileCheck %s --check-prefix=X64
; MMX insertelement is not available; these are promoted to XMM.
; (Without SSE they are split to two ints, and the code is much better.)
define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
-; CHECK-LABEL: mmx_movzl:
-; CHECK: ## BB#0:
-; CHECK-NEXT: subl $20, %esp
-; CHECK-NEXT: movq %mm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: pinsrd $0, %eax, %xmm0
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: pinsrd $1, %eax, %xmm0
-; CHECK-NEXT: pinsrd $2, %eax, %xmm0
-; CHECK-NEXT: pinsrd $3, %eax, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: movq %xmm0, (%esp)
-; CHECK-NEXT: movq (%esp), %mm0
-; CHECK-NEXT: addl $20, %esp
-; CHECK-NEXT: retl
+; X32-LABEL: mmx_movzl:
+; X32: ## BB#0:
+; X32-NEXT: subl $20, %esp
+; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
+; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: pinsrd $0, %eax, %xmm0
+; X32-NEXT: pxor %xmm1, %xmm1
+; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; X32-NEXT: movq %xmm1, (%esp)
+; X32-NEXT: movq (%esp), %mm0
+; X32-NEXT: addl $20, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: mmx_movzl:
+; X64: ## BB#0:
+; X64-NEXT: movdq2q %xmm0, %mm0
+; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: pinsrq $0, %rax, %xmm1
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; X64-NEXT: retq
%tmp = bitcast x86_mmx %x to <2 x i32>
- %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 ; <<2 x i32>> [#uses=1]
- %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 ; <<2 x i32>> [#uses=1]
+ %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0
+ %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1
%tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
ret x86_mmx %tmp9
}
diff --git a/test/CodeGen/X86/vec_insert-8.ll b/test/CodeGen/X86/vec_insert-8.ll
index 917832c40adb..d612e7eb10d3 100644
--- a/test/CodeGen/X86/vec_insert-8.ll
+++ b/test/CodeGen/X86/vec_insert-8.ll
@@ -1,15 +1,58 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1 -o %t
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
; tests variable insert and extract of a 4 x i32
-define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
+define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
+; X32-LABEL: var_insert:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 12(%ebp), %ecx
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl %eax, (%esp,%ecx,4)
+; X32-NEXT: movaps (%esp), %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: var_insert:
+; X64: # BB#0: # %entry
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %esi, %rax
+; X64-NEXT: movl %edi, -24(%rsp,%rax,4)
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: retq
entry:
- %tmp3 = insertelement <4 x i32> %x, i32 %val, i32 %idx ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %tmp3
+ %tmp3 = insertelement <4 x i32> %x, i32 %val, i32 %idx
+ ret <4 x i32> %tmp3
}
-define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
+define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
+; X32-LABEL: var_extract:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl (%esp,%eax,4), %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: var_extract:
+; X64: # BB#0: # %entry
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movl -24(%rsp,%rax,4), %eax
+; X64-NEXT: retq
entry:
- %tmp3 = extractelement <4 x i32> %x, i32 %idx ; <<i32>> [#uses=1]
- ret i32 %tmp3
+ %tmp3 = extractelement <4 x i32> %x, i32 %idx
+ ret i32 %tmp3
}
diff --git a/test/CodeGen/X86/vec_insert-9.ll b/test/CodeGen/X86/vec_insert-9.ll
index 5f2e676ef1ae..ec4a0288e107 100644
--- a/test/CodeGen/X86/vec_insert-9.ll
+++ b/test/CodeGen/X86/vec_insert-9.ll
@@ -1,9 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1 > %t
-; RUN: grep pinsrd %t | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
+; X32-LABEL: var_insert2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: var_insert2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: pinsrd $3, %esi, %xmm0
+; X64-NEXT: retq
entry:
- %tmp3 = insertelement <4 x i32> undef, i32 %val, i32 0 ; <<4 x i32>> [#uses=1]
- %tmp4 = insertelement <4 x i32> %tmp3, i32 %idx, i32 3 ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %tmp4
+ %tmp3 = insertelement <4 x i32> undef, i32 %val, i32 0
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %idx, i32 3
+ ret <4 x i32> %tmp4
}
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
index cbd420885ac1..2aae35591ab2 100644
--- a/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -1,37 +1,56 @@
-; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
-; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s -check-prefix=X86-64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64
; This is not an MMX operation; promoted to XMM.
define x86_mmx @t0(i32 %A) nounwind {
-; X86-32-LABEL: t0:
-; X86-32: ## BB#0:
-; X86-32: movd {{[0-9]+}}(%esp), %xmm0
-; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; X86-32-NEXT: movq %xmm0, (%esp)
-; X86-32-NEXT: movq (%esp), %mm0
-; X86-32-NEXT: addl $12, %esp
-; X86-32-NEXT: retl
+; X32-LABEL: t0:
+; X32: ## BB#0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X32-NEXT: movq %xmm0, (%esp)
+; X32-NEXT: movq (%esp), %mm0
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: t0:
+; X64: ## BB#0:
+; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: retq
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
%tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
ret x86_mmx %tmp4
}
define <8 x i8> @t1(i8 zeroext %x) nounwind {
-; X86-32-LABEL: t1:
-; X86-32: ## BB#0:
-; X86-32-NOT: movl
-; X86-32-NEXT: movd {{[0-9]+}}(%esp), %xmm0
-; X86-32-NEXT: retl
+; X32-LABEL: t1:
+; X32: ## BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: ## BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: retq
%r = insertelement <8 x i8> undef, i8 %x, i32 0
ret <8 x i8> %r
}
; PR2574
define <2 x float> @t2(<2 x float> %a0) {
-; X86-32-LABEL: t2:
-; X86-32: ## BB#0:
-; X86-32-NEXT: xorps %xmm0, %xmm0
-; X86-32-NEXT: retl
+; X32-LABEL: t2:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: retq
%v1 = insertelement <2 x float> %a0, float 0.000000e+00, i32 0
%v2 = insertelement <2 x float> %v1, float 0.000000e+00, i32 1
ret <2 x float> %v2
@@ -42,14 +61,31 @@ define <2 x float> @t2(<2 x float> %a0) {
; PR2562
define void @t3() {
-; X86-64-LABEL: t3:
-; X86-64: ## BB#0:
-; X86-64: pmovzxwd (%rcx)
-; X86-64-NEXT: movzwl
-; X86-64-NEXT: pinsrd $0
-; X86-64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X86-64-NEXT: movq %xmm0
-; X86-64-NEXT: retq
+; X32-LABEL: t3:
+; X32: ## BB#0:
+; X32-NEXT: movl L_g0$non_lazy_ptr, %eax
+; X32-NEXT: movl L_g1$non_lazy_ptr, %ecx
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-NEXT: movzwl (%eax), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT: movq %xmm0, (%ecx)
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: ## BB#0:
+; X64-NEXT: movq _g0@{{.*}}(%rip), %rax
+; X64-NEXT: movq _g1@{{.*}}(%rip), %rcx
+; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-NEXT: movzwl (%rax), %eax
+; X64-NEXT: pinsrd $0, %eax, %xmm0
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X64-NEXT: movq %xmm0, (%rcx)
+; X64-NEXT: retq
load i16, i16* @g0
load <4 x i16>, <4 x i16>* @g1
insertelement <4 x i16> %2, i16 %1, i32 0
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index fd98791815e7..43f5318a6070 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -61,6 +61,7 @@ define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; AVX-LABEL: sitofp_4i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: # kill
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x double>
@@ -98,6 +99,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -105,6 +107,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x double>
@@ -144,6 +147,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -152,6 +156,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
@@ -432,6 +437,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -445,6 +451,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x double>
@@ -482,6 +489,7 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -489,6 +497,7 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x double>
@@ -528,6 +537,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -536,6 +546,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
@@ -890,6 +901,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -897,6 +909,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x float>
@@ -939,6 +952,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -947,6 +961,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x float>
@@ -1085,9 +1100,7 @@ define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1386,6 +1399,7 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1393,6 +1407,7 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x float>
@@ -1430,12 +1445,12 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
;
; AVX1-LABEL: uitofp_16i8_to_4f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1444,6 +1459,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x float>
@@ -1583,6 +1599,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: .LBB45_10:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -1650,6 +1667,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: .LBB45_10:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -1754,20 +1772,16 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
;
; AVX1-LABEL: uitofp_8i8_to_8f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1790,11 +1804,10 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
;
; AVX1-LABEL: uitofp_16i8_to_8f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1810,6 +1823,1654 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
}
;
+; Load Signed Integer to Double
+;
+
+define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
+; SSE-LABEL: sitofp_load_2i64_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i64_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %ld = load <2 x i64>, <2 x i64> *%a
+ %cvt = sitofp <2 x i64> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
+; SSE-LABEL: sitofp_load_2i32_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i32>, <2 x i32> *%a
+ %cvt = sitofp <2 x i32> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
+; SSE-LABEL: sitofp_load_2i16_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i16_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i16>, <2 x i16> *%a
+ %cvt = sitofp <2 x i16> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
+; SSE-LABEL: sitofp_load_2i8_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl (%rdi), %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i8_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i8>, <2 x i8> *%a
+ %cvt = sitofp <2 x i8> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
+; SSE-LABEL: sitofp_load_4i64_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2sdq %rax, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_4i64_to_4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_4i64_to_4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <4 x i64>, <4 x i64> *%a
+ %cvt = sitofp <4 x i64> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
+; SSE-LABEL: sitofp_load_4i32_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i32_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i32>, <4 x i32> *%a
+ %cvt = sitofp <4 x i32> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
+; SSE-LABEL: sitofp_load_4i16_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i16_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i16>, <4 x i16> *%a
+ %cvt = sitofp <4 x i16> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
+; SSE-LABEL: sitofp_load_4i8_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: psrad $24, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i8_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i8>, <4 x i8> *%a
+ %cvt = sitofp <4 x i8> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+;
+; Load Unsigned Integer to Double
+;
+
+define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
+; SSE-LABEL: uitofp_load_2i64_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; SSE-NEXT: subpd %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: subpd %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_2i64_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
+; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: retq
+ %ld = load <2 x i64>, <2 x i64> *%a
+ %cvt = uitofp <2 x i64> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
+; SSE-LABEL: uitofp_load_2i32_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; SSE-NEXT: subpd %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: subpd %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_2i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
+; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: retq
+ %ld = load <2 x i32>, <2 x i32> *%a
+ %cvt = uitofp <2 x i32> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
+; SSE-LABEL: uitofp_load_2i16_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_2i16_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i16>, <2 x i16> *%a
+ %cvt = uitofp <2 x i16> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
+; SSE-LABEL: uitofp_load_2i8_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl (%rdi), %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_2i8_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i8>, <2 x i8> *%a
+ %cvt = uitofp <2 x i8> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
+; SSE-LABEL: uitofp_load_4i64_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
+; SSE-NEXT: subpd %xmm5, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: subpd %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
+; SSE-NEXT: addpd %xmm4, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: subpd %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE-NEXT: addpd %xmm2, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: subpd %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; SSE-NEXT: addpd %xmm4, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_4i64_to_4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i64_to_4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <4 x i64>, <4 x i64> *%a
+ %cvt = uitofp <4 x i64> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
+; SSE-LABEL: uitofp_load_4i32_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25]
+; SSE-NEXT: subpd %xmm6, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE-NEXT: addpd %xmm3, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: subpd %xmm6, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
+; SSE-NEXT: addpd %xmm5, %xmm3
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT: subpd %xmm6, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE-NEXT: addpd %xmm2, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: subpd %xmm6, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE-NEXT: addpd %xmm3, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_4i32_to_4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i32_to_4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
+; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <4 x i32>, <4 x i32> *%a
+ %cvt = uitofp <4 x i32> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
+; SSE-LABEL: uitofp_load_4i16_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_4i16_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i16>, <4 x i16> *%a
+ %cvt = uitofp <4 x i16> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
+; SSE-LABEL: uitofp_load_4i8_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_4i8_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i8>, <4 x i8> *%a
+ %cvt = uitofp <4 x i8> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+;
+; Load Signed Integer to Float
+;
+
+define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
+; SSE-LABEL: sitofp_load_4i64_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_4i64_to_4f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_4i64_to_4f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %ld = load <4 x i64>, <4 x i64> *%a
+ %cvt = sitofp <4 x i64> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
+; SSE-LABEL: sitofp_load_4i32_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i32_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i32>, <4 x i32> *%a
+ %cvt = sitofp <4 x i32> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
+; SSE-LABEL: sitofp_load_4i16_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i16_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i16>, <4 x i16> *%a
+ %cvt = sitofp <4 x i16> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
+; SSE-LABEL: sitofp_load_4i8_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i8_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i8>, <4 x i8> *%a
+ %cvt = sitofp <4 x i8> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
+; SSE-LABEL: sitofp_load_8i64_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa 32(%rdi), %xmm3
+; SSE-NEXT: movdqa 48(%rdi), %xmm4
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm5
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movd %xmm4, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_8i64_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_8i64_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i64>, <8 x i64> *%a
+ %cvt = sitofp <8 x i64> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
+; SSE-LABEL: sitofp_load_8i32_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
+; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_8i32_to_8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0
+; AVX-NEXT: retq
+ %ld = load <8 x i32>, <8 x i32> *%a
+ %cvt = sitofp <8 x i32> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
+; SSE-LABEL: sitofp_load_8i16_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_8i16_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_8i16_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i16>, <8 x i16> *%a
+ %cvt = sitofp <8 x i16> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
+; SSE-LABEL: sitofp_load_8i8_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $24, %xmm1
+; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_8i8_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_8i8_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i8>, <8 x i8> *%a
+ %cvt = sitofp <8 x i8> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+;
+; Load Unsigned Integer to Float
+;
+
+define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
+; SSE-LABEL: uitofp_load_4i64_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm3
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB74_1
+; SSE-NEXT: # BB#2:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: jmp .LBB74_3
+; SSE-NEXT: .LBB74_1:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm2
+; SSE-NEXT: .LBB74_3:
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB74_4
+; SSE-NEXT: # BB#5:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: jmp .LBB74_6
+; SSE-NEXT: .LBB74_4:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: addss %xmm0, %xmm0
+; SSE-NEXT: .LBB74_6:
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB74_7
+; SSE-NEXT: # BB#8:
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: jmp .LBB74_9
+; SSE-NEXT: .LBB74_7:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
+; SSE-NEXT: addss %xmm3, %xmm3
+; SSE-NEXT: .LBB74_9:
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB74_10
+; SSE-NEXT: # BB#11:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: jmp .LBB74_12
+; SSE-NEXT: .LBB74_10:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: addss %xmm1, %xmm1
+; SSE-NEXT: .LBB74_12:
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_4i64_to_4f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB74_1
+; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX1-NEXT: jmp .LBB74_3
+; AVX1-NEXT: .LBB74_1:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
+; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: .LBB74_3:
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB74_4
+; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB74_6
+; AVX1-NEXT: .LBB74_4:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB74_6:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB74_7
+; AVX1-NEXT: # BB#8:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB74_9
+; AVX1-NEXT: .LBB74_7:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB74_9:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB74_10
+; AVX1-NEXT: # BB#11:
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB74_10:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
+; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i64_to_4f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB74_1
+; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX2-NEXT: jmp .LBB74_3
+; AVX2-NEXT: .LBB74_1:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
+; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: .LBB74_3:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB74_4
+; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: jmp .LBB74_6
+; AVX2-NEXT: .LBB74_4:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: .LBB74_6:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB74_7
+; AVX2-NEXT: # BB#8:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: jmp .LBB74_9
+; AVX2-NEXT: .LBB74_7:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: .LBB74_9:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB74_10
+; AVX2-NEXT: # BB#11:
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB74_10:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %ld = load <4 x i64>, <4 x i64> *%a
+ %cvt = uitofp <4 x i64> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
+; SSE-LABEL: uitofp_load_4i32_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE-NEXT: addps {{.*}}(%rip), %xmm0
+; SSE-NEXT: addps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_4i32_to_4f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i32_to_4f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+ %ld = load <4 x i32>, <4 x i32> *%a
+ %cvt = uitofp <4 x i32> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
+; SSE-LABEL: uitofp_load_4i16_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_4i16_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i16>, <4 x i16> *%a
+ %cvt = uitofp <4 x i16> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
+; SSE-LABEL: uitofp_load_4i8_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_4i8_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i8>, <4 x i8> *%a
+ %cvt = uitofp <4 x i8> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
+; SSE-LABEL: uitofp_load_8i64_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm5
+; SSE-NEXT: movdqa 32(%rdi), %xmm2
+; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movd %xmm5, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_1
+; SSE-NEXT: # BB#2:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm4
+; SSE-NEXT: jmp .LBB78_3
+; SSE-NEXT: .LBB78_1:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm4
+; SSE-NEXT: addss %xmm4, %xmm4
+; SSE-NEXT: .LBB78_3:
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_4
+; SSE-NEXT: # BB#5:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: jmp .LBB78_6
+; SSE-NEXT: .LBB78_4:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: addss %xmm0, %xmm0
+; SSE-NEXT: .LBB78_6:
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE-NEXT: movd %xmm5, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_7
+; SSE-NEXT: # BB#8:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm6
+; SSE-NEXT: jmp .LBB78_9
+; SSE-NEXT: .LBB78_7:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm6
+; SSE-NEXT: addss %xmm6, %xmm6
+; SSE-NEXT: .LBB78_9:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_10
+; SSE-NEXT: # BB#11:
+; SSE-NEXT: xorps %xmm5, %xmm5
+; SSE-NEXT: cvtsi2ssq %rax, %xmm5
+; SSE-NEXT: jmp .LBB78_12
+; SSE-NEXT: .LBB78_10:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm5, %xmm5
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm5
+; SSE-NEXT: addss %xmm5, %xmm5
+; SSE-NEXT: .LBB78_12:
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_13
+; SSE-NEXT: # BB#14:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm7
+; SSE-NEXT: jmp .LBB78_15
+; SSE-NEXT: .LBB78_13:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm7
+; SSE-NEXT: addss %xmm7, %xmm7
+; SSE-NEXT: .LBB78_15:
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_16
+; SSE-NEXT: # BB#17:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: jmp .LBB78_18
+; SSE-NEXT: .LBB78_16:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: addss %xmm1, %xmm1
+; SSE-NEXT: .LBB78_18:
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_19
+; SSE-NEXT: # BB#20:
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: jmp .LBB78_21
+; SSE-NEXT: .LBB78_19:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
+; SSE-NEXT: addss %xmm3, %xmm3
+; SSE-NEXT: .LBB78_21:
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_22
+; SSE-NEXT: # BB#23:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: jmp .LBB78_24
+; SSE-NEXT: .LBB78_22:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm2
+; SSE-NEXT: .LBB78_24:
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_8i64_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_1
+; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX1-NEXT: jmp .LBB78_3
+; AVX1-NEXT: .LBB78_1:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
+; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: .LBB78_3:
+; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_4
+; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: jmp .LBB78_6
+; AVX1-NEXT: .LBB78_4:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: .LBB78_6:
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_7
+; AVX1-NEXT: # BB#8:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
+; AVX1-NEXT: jmp .LBB78_9
+; AVX1-NEXT: .LBB78_7:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
+; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: .LBB78_9:
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_10
+; AVX1-NEXT: # BB#11:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB78_12
+; AVX1-NEXT: .LBB78_10:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB78_12:
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_13
+; AVX1-NEXT: # BB#14:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
+; AVX1-NEXT: jmp .LBB78_15
+; AVX1-NEXT: .LBB78_13:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
+; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: .LBB78_15:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_16
+; AVX1-NEXT: # BB#17:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: jmp .LBB78_18
+; AVX1-NEXT: .LBB78_16:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: .LBB78_18:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovq %xmm4, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_19
+; AVX1-NEXT: # BB#20:
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
+; AVX1-NEXT: jmp .LBB78_21
+; AVX1-NEXT: .LBB78_19:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
+; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX1-NEXT: .LBB78_21:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
+; AVX1-NEXT: vpextrq $1, %xmm4, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_22
+; AVX1-NEXT: # BB#23:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB78_24
+; AVX1-NEXT: .LBB78_22:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB78_24:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_8i64_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_1
+; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX2-NEXT: jmp .LBB78_3
+; AVX2-NEXT: .LBB78_1:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
+; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: .LBB78_3:
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_4
+; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: jmp .LBB78_6
+; AVX2-NEXT: .LBB78_4:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: .LBB78_6:
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_7
+; AVX2-NEXT: # BB#8:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
+; AVX2-NEXT: jmp .LBB78_9
+; AVX2-NEXT: .LBB78_7:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
+; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: .LBB78_9:
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_10
+; AVX2-NEXT: # BB#11:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: jmp .LBB78_12
+; AVX2-NEXT: .LBB78_10:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: .LBB78_12:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_13
+; AVX2-NEXT: # BB#14:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
+; AVX2-NEXT: jmp .LBB78_15
+; AVX2-NEXT: .LBB78_13:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
+; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: .LBB78_15:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_16
+; AVX2-NEXT: # BB#17:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: jmp .LBB78_18
+; AVX2-NEXT: .LBB78_16:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: .LBB78_18:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_19
+; AVX2-NEXT: # BB#20:
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
+; AVX2-NEXT: jmp .LBB78_21
+; AVX2-NEXT: .LBB78_19:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX2-NEXT: .LBB78_21:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
+; AVX2-NEXT: vpextrq $1, %xmm4, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_22
+; AVX2-NEXT: # BB#23:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: jmp .LBB78_24
+; AVX2-NEXT: .LBB78_22:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: .LBB78_24:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i64>, <8 x i64> *%a
+ %cvt = uitofp <8 x i64> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
+; SSE-LABEL: uitofp_load_8i32_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
+; SSE-NEXT: por %xmm5, %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE-NEXT: addps %xmm6, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: addps %xmm6, %xmm1
+; SSE-NEXT: addps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_8i32_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_8i32_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i32>, <8 x i32> *%a
+ %cvt = uitofp <8 x i32> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
+; SSE-LABEL: uitofp_load_8i16_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_8i16_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_8i16_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i16>, <8 x i16> *%a
+ %cvt = uitofp <8 x i16> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
+; SSE-LABEL: uitofp_load_8i8_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_8i8_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_8i8_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i8>, <8 x i8> *%a
+ %cvt = uitofp <8 x i8> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+;
; Aggregates
;
diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll
index ecae5d962826..b0d95c5d00da 100644
--- a/test/CodeGen/X86/vec_loadsingles.ll
+++ b/test/CodeGen/X86/vec_loadsingles.ll
@@ -1,22 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=FAST32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=SLOW32
define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly {
+; ALL-LABEL: merge_2_floats:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
%tmp1 = load float, float* %p
%vecins = insertelement <4 x float> undef, float %tmp1, i32 0
%add.ptr = getelementptr float, float* %p, i32 1
%tmp5 = load float, float* %add.ptr
%vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
ret <4 x float> %vecins7
-
-; ALL-LABEL: merge_2_floats
-; ALL: vmovq
-; ALL-NEXT: retq
}
; Test-case generated due to a crash when trying to treat loading the first
; two i64s of a <4 x i64> as a load of two i32s.
define <4 x i64> @merge_2_floats_into_4() {
+; ALL-LABEL: merge_2_floats_into_4:
+; ALL: # BB#0:
+; ALL-NEXT: movq (%rax), %rax
+; ALL-NEXT: vmovups (%rax), %xmm0
+; ALL-NEXT: retq
%1 = load i64*, i64** undef, align 8
%2 = getelementptr inbounds i64, i64* %1, i64 0
%3 = load i64, i64* %2
@@ -27,13 +33,13 @@ define <4 x i64> @merge_2_floats_into_4() {
%8 = insertelement <4 x i64> %4, i64 %7, i32 1
%9 = shufflevector <4 x i64> %8, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i64> %9
-
-; ALL-LABEL: merge_2_floats_into_4
-; ALL: vmovups
-; ALL-NEXT: retq
}
define <4 x float> @merge_4_floats(float* %ptr) {
+; ALL-LABEL: merge_4_floats:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups (%rdi), %xmm0
+; ALL-NEXT: retq
%a = load float, float* %ptr, align 8
%vec = insertelement <4 x float> undef, float %a, i32 0
%idx1 = getelementptr inbounds float, float* %ptr, i64 1
@@ -46,18 +52,24 @@ define <4 x float> @merge_4_floats(float* %ptr) {
%d = load float, float* %idx5, align 8
%vec6 = insertelement <4 x float> %vec4, float %d, i32 3
ret <4 x float> %vec6
-
-; ALL-LABEL: merge_4_floats
-; ALL: vmovups
-; ALL-NEXT: retq
}
-; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 )
+; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 )
; Make sure that 32-byte vectors are handled efficiently.
; If the target has slow 32-byte accesses, we should still generate
; 16-byte loads.
define <8 x float> @merge_8_floats(float* %ptr) {
+; FAST32-LABEL: merge_8_floats:
+; FAST32: # BB#0:
+; FAST32-NEXT: vmovups (%rdi), %ymm0
+; FAST32-NEXT: retq
+;
+; SLOW32-LABEL: merge_8_floats:
+; SLOW32: # BB#0:
+; SLOW32-NEXT: vmovups (%rdi), %xmm0
+; SLOW32-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT: retq
%a = load float, float* %ptr, align 4
%vec = insertelement <8 x float> undef, float %a, i32 0
%idx1 = getelementptr inbounds float, float* %ptr, i64 1
@@ -82,18 +94,19 @@ define <8 x float> @merge_8_floats(float* %ptr) {
%h = load float, float* %idx13, align 4
%vec14 = insertelement <8 x float> %vec12, float %h, i32 7
ret <8 x float> %vec14
-
-; ALL-LABEL: merge_8_floats
-
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
}
define <4 x double> @merge_4_doubles(double* %ptr) {
+; FAST32-LABEL: merge_4_doubles:
+; FAST32: # BB#0:
+; FAST32-NEXT: vmovups (%rdi), %ymm0
+; FAST32-NEXT: retq
+;
+; SLOW32-LABEL: merge_4_doubles:
+; SLOW32: # BB#0:
+; SLOW32-NEXT: vmovups (%rdi), %xmm0
+; SLOW32-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT: retq
%a = load double, double* %ptr, align 8
%vec = insertelement <4 x double> undef, double %a, i32 0
%idx1 = getelementptr inbounds double, double* %ptr, i64 1
@@ -106,20 +119,22 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
%d = load double, double* %idx5, align 8
%vec6 = insertelement <4 x double> %vec4, double %d, i32 3
ret <4 x double> %vec6
-
-; ALL-LABEL: merge_4_doubles
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
}
-; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
; Recognize and combine consecutive loads even when the
; first of the combined loads is offset from the base address.
define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+; FAST32-LABEL: merge_4_doubles_offset:
+; FAST32: # BB#0:
+; FAST32-NEXT: vmovups 32(%rdi), %ymm0
+; FAST32-NEXT: retq
+;
+; SLOW32-LABEL: merge_4_doubles_offset:
+; SLOW32: # BB#0:
+; SLOW32-NEXT: vmovups 32(%rdi), %xmm0
+; SLOW32-NEXT: vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT: retq
%arrayidx4 = getelementptr inbounds double, double* %ptr, i64 4
%arrayidx5 = getelementptr inbounds double, double* %ptr, i64 5
%arrayidx6 = getelementptr inbounds double, double* %ptr, i64 6
@@ -133,13 +148,5 @@ define <4 x double> @merge_4_doubles_offset(double* %ptr) {
%vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
%vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
ret <4 x double> %vecinit7
-
-; ALL-LABEL: merge_4_doubles_offset
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
}
diff --git a/test/CodeGen/X86/vec_logical.ll b/test/CodeGen/X86/vec_logical.ll
index 6ab2d8963abd..b632616cde88 100644
--- a/test/CodeGen/X86/vec_logical.ll
+++ b/test/CodeGen/X86/vec_logical.ll
@@ -1,42 +1,87 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep xorps %t | count 2
-; RUN: grep andnps %t
-; RUN: grep movaps %t | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
define void @t(<4 x float> %A) {
- %tmp1277 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %A
- store <4 x float> %tmp1277, <4 x float>* null
- ret void
+; SSE-LABEL: t:
+; SSE: # BB#0:
+; SSE-NEXT: xorps .LCPI0_0, %xmm0
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: retl
+;
+; AVX-LABEL: t:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps .LCPI0_0, %xmm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, 0
+; AVX-NEXT: retl
+ %tmp1277 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %A
+ store <4 x float> %tmp1277, <4 x float>* null
+ ret void
}
define <4 x float> @t1(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: t1:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: xorps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; AVX-LABEL: t1:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
entry:
- %tmp9 = bitcast <4 x float> %a to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp10 = bitcast <4 x float> %b to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp11 = xor <4 x i32> %tmp9, %tmp10 ; <<4 x i32>> [#uses=1]
- %tmp13 = bitcast <4 x i32> %tmp11 to <4 x float> ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp13
+ %tmp9 = bitcast <4 x float> %a to <4 x i32>
+ %tmp10 = bitcast <4 x float> %b to <4 x i32>
+ %tmp11 = xor <4 x i32> %tmp9, %tmp10
+ %tmp13 = bitcast <4 x i32> %tmp11 to <4 x float>
+ ret <4 x float> %tmp13
}
define <2 x double> @t2(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: t2:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: andps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; AVX-LABEL: t2:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
entry:
- %tmp9 = bitcast <2 x double> %a to <2 x i64> ; <<2 x i64>> [#uses=1]
- %tmp10 = bitcast <2 x double> %b to <2 x i64> ; <<2 x i64>> [#uses=1]
- %tmp11 = and <2 x i64> %tmp9, %tmp10 ; <<2 x i64>> [#uses=1]
- %tmp13 = bitcast <2 x i64> %tmp11 to <2 x double> ; <<2 x double>> [#uses=1]
- ret <2 x double> %tmp13
+ %tmp9 = bitcast <2 x double> %a to <2 x i64>
+ %tmp10 = bitcast <2 x double> %b to <2 x i64>
+ %tmp11 = and <2 x i64> %tmp9, %tmp10
+ %tmp13 = bitcast <2 x i64> %tmp11 to <2 x double>
+ ret <2 x double> %tmp13
}
define void @t3(<4 x float> %a, <4 x float> %b, <4 x float>* %c, <4 x float>* %d) {
+; SSE-LABEL: t3:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT: andnps %xmm1, %xmm0
+; SSE-NEXT: orps (%ecx), %xmm0
+; SSE-NEXT: movaps %xmm0, (%eax)
+; SSE-NEXT: retl
+;
+; AVX-LABEL: t3:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: vandnps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vorps (%ecx), %xmm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%eax)
+; AVX-NEXT: retl
entry:
- %tmp3 = load <4 x float>, <4 x float>* %c ; <<4 x float>> [#uses=1]
- %tmp11 = bitcast <4 x float> %a to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp12 = bitcast <4 x float> %b to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp13 = xor <4 x i32> %tmp11, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
- %tmp14 = and <4 x i32> %tmp12, %tmp13 ; <<4 x i32>> [#uses=1]
- %tmp27 = bitcast <4 x float> %tmp3 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp28 = or <4 x i32> %tmp14, %tmp27 ; <<4 x i32>> [#uses=1]
- %tmp30 = bitcast <4 x i32> %tmp28 to <4 x float> ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp30, <4 x float>* %d
- ret void
+ %tmp3 = load <4 x float>, <4 x float>* %c
+ %tmp11 = bitcast <4 x float> %a to <4 x i32>
+ %tmp12 = bitcast <4 x float> %b to <4 x i32>
+ %tmp13 = xor <4 x i32> %tmp11, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp14 = and <4 x i32> %tmp12, %tmp13
+ %tmp27 = bitcast <4 x float> %tmp3 to <4 x i32>
+ %tmp28 = or <4 x i32> %tmp14, %tmp27
+ %tmp30 = bitcast <4 x i32> %tmp28 to <4 x float>
+ store <4 x float> %tmp30, <4 x float>* %d
+ ret void
}
diff --git a/test/CodeGen/X86/vec_partial.ll b/test/CodeGen/X86/vec_partial.ll
index 469667a28a76..e5ac81add7f6 100644
--- a/test/CodeGen/X86/vec_partial.ll
+++ b/test/CodeGen/X86/vec_partial.ll
@@ -1,11 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; PR11580
define <3 x float> @addf3(<3 x float> %x) {
-; CHECK-LABEL: addf3
-; CHECK: # BB#0:
-; CHECK-NEXT: addps .LCPI0_0(%rip), %xmm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: addf3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
entry:
%add = fadd <3 x float> %x, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
ret <3 x float> %add
@@ -13,9 +14,9 @@ entry:
; PR11580
define <4 x float> @cvtf3_f4(<3 x float> %x) {
-; CHECK-LABEL: cvtf3_f4
-; CHECK: # BB#0:
-; CHECK-NEXT: retq
+; CHECK-LABEL: cvtf3_f4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%extractVec = shufflevector <3 x float> %x, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
ret <4 x float> %extractVec
@@ -23,9 +24,9 @@ entry:
; PR11580
define <3 x float> @cvtf4_f3(<4 x float> %x) {
-; CHECK-LABEL: cvtf4_f3
-; CHECK: # BB#0:
-; CHECK-NEXT: retq
+; CHECK-LABEL: cvtf4_f3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%extractVec = shufflevector <4 x float> %x, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
ret <3 x float> %extractVec
diff --git a/test/CodeGen/X86/vec_sdiv_to_shift.ll b/test/CodeGen/X86/vec_sdiv_to_shift.ll
index 7f71a0c2ea5b..f7151af528b5 100644
--- a/test/CodeGen/X86/vec_sdiv_to_shift.ll
+++ b/test/CodeGen/X86/vec_sdiv_to_shift.ll
@@ -1,93 +1,286 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=+avx2 | FileCheck %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
define <8 x i16> @sdiv_vec8x16(<8 x i16> %var) {
+; SSE-LABEL: sdiv_vec8x16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psraw $15, %xmm1
+; SSE-NEXT: psrlw $11, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: psraw $5, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_vec8x16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsraw $15, %xmm0, %xmm1
+; AVX-NEXT: vpsrlw $11, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $5, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_vec8x16
-; CHECK: psraw $15
-; CHECK: vpsrlw $11
-; CHECK: vpaddw
-; CHECK: vpsraw $5
-; CHECK: ret
%0 = sdiv <8 x i16> %var, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
ret <8 x i16> %0
}
define <8 x i16> @sdiv_vec8x16_minsize(<8 x i16> %var) minsize {
+; SSE-LABEL: sdiv_vec8x16_minsize:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psraw $15, %xmm1
+; SSE-NEXT: psrlw $11, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: psraw $5, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_vec8x16_minsize:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsraw $15, %xmm0, %xmm1
+; AVX-NEXT: vpsrlw $11, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $5, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_vec8x16_minsize
-; CHECK: psraw $15
-; CHECK: vpsrlw $11
-; CHECK: vpaddw
-; CHECK: vpsraw $5
-; CHECK: ret
%0 = sdiv <8 x i16> %var, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
ret <8 x i16> %0
}
-
define <4 x i32> @sdiv_zero(<4 x i32> %var) {
+; SSE-LABEL: sdiv_zero:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pextrd $1, %xmm0, %eax
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %esi
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %esi
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: pinsrd $1, %ecx, %xmm1
+; SSE-NEXT: pextrd $2, %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %esi
+; SSE-NEXT: pinsrd $2, %eax, %xmm1
+; SSE-NEXT: pextrd $3, %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %esi
+; SSE-NEXT: pinsrd $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_zero:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %esi
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %esi
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %esi
+; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %esi
+; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_zero
-; CHECK-NOT: sra
-; CHECK: ret
%0 = sdiv <4 x i32> %var, <i32 0, i32 0, i32 0, i32 0>
ret <4 x i32> %0
}
define <4 x i32> @sdiv_vec4x32(<4 x i32> %var) {
+; SSE-LABEL: sdiv_vec4x32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: psrld $28, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: psrad $4, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_vec4x32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $28, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $4, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_vec4x32
-; CHECK: vpsrad $31
-; CHECK: vpsrld $28
-; CHECK: vpaddd
-; CHECK: vpsrad $4
-; CHECK: ret
%0 = sdiv <4 x i32> %var, <i32 16, i32 16, i32 16, i32 16>
ret <4 x i32> %0
}
define <4 x i32> @sdiv_negative(<4 x i32> %var) {
+; SSE-LABEL: sdiv_negative:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: psrld $28, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: psrad $4, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_negative:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $28, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $4, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_negative
-; CHECK: vpsrad $31
-; CHECK: vpsrld $28
-; CHECK: vpaddd
-; CHECK: vpsrad $4
-; CHECK: vpsubd
-; CHECK: ret
%0 = sdiv <4 x i32> %var, <i32 -16, i32 -16, i32 -16, i32 -16>
ret <4 x i32> %0
}
define <8 x i32> @sdiv8x32(<8 x i32> %var) {
+; SSE-LABEL: sdiv8x32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: psrld $26, %xmm2
+; SSE-NEXT: paddd %xmm0, %xmm2
+; SSE-NEXT: psrad $6, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: psrld $26, %xmm3
+; SSE-NEXT: paddd %xmm1, %xmm3
+; SSE-NEXT: psrad $6, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sdiv8x32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $26, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $6, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $26, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $6, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sdiv8x32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
+; AVX2-NEXT: vpsrld $26, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $6, %ymm0, %ymm0
+; AVX2-NEXT: retq
entry:
-; CHECK: sdiv8x32
-; CHECK: vpsrad $31
-; CHECK: vpsrld $26
-; CHECK: vpaddd
-; CHECK: vpsrad $6
-; CHECK: ret
%0 = sdiv <8 x i32> %var, <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
ret <8 x i32> %0
}
define <16 x i16> @sdiv16x16(<16 x i16> %var) {
+; SSE-LABEL: sdiv16x16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psraw $15, %xmm2
+; SSE-NEXT: psrlw $14, %xmm2
+; SSE-NEXT: paddw %xmm0, %xmm2
+; SSE-NEXT: psraw $2, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psraw $15, %xmm3
+; SSE-NEXT: psrlw $14, %xmm3
+; SSE-NEXT: paddw %xmm1, %xmm3
+; SSE-NEXT: psraw $2, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sdiv16x16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlw $14, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsraw $2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlw $14, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sdiv16x16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
+; AVX2-NEXT: vpsrlw $14, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $2, %ymm0, %ymm0
+; AVX2-NEXT: retq
entry:
-; CHECK: sdiv16x16
-; CHECK: vpsraw $15
-; CHECK: vpsrlw $14
-; CHECK: vpaddw
-; CHECK: vpsraw $2
-; CHECK: ret
%a0 = sdiv <16 x i16> %var, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
ret <16 x i16> %a0
}
-; CHECK: sdiv_non_splat
-; CHECK: idivl
-; CHECK: ret
define <4 x i32> @sdiv_non_splat(<4 x i32> %x) {
+; SSE-LABEL: sdiv_non_splat:
+; SSE: # BB#0:
+; SSE-NEXT: pextrd $1, %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %ecx
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: movl %edx, %esi
+; SSE-NEXT: shrl $31, %esi
+; SSE-NEXT: addl %edx, %esi
+; SSE-NEXT: sarl %esi
+; SSE-NEXT: movd %esi, %xmm1
+; SSE-NEXT: pinsrd $1, %eax, %xmm1
+; SSE-NEXT: pextrd $2, %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %ecx
+; SSE-NEXT: pinsrd $2, %eax, %xmm1
+; SSE-NEXT: pextrd $3, %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %ecx
+; SSE-NEXT: pinsrd $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_non_splat:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %ecx
+; AVX-NEXT: vmovd %xmm0, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: sarl %esi
+; AVX-NEXT: vmovd %esi, %xmm1
+; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %ecx
+; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %ecx
+; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: retq
%y = sdiv <4 x i32> %x, <i32 2, i32 0, i32 0, i32 0>
ret <4 x i32> %y
}
diff --git a/test/CodeGen/X86/vec_set-2.ll b/test/CodeGen/X86/vec_set-2.ll
index a8f1187084d6..02f25d8e35a2 100644
--- a/test/CodeGen/X86/vec_set-2.ll
+++ b/test/CodeGen/X86/vec_set-2.ll
@@ -1,19 +1,27 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss | count 1
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movd | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s
define <4 x float> @test1(float %a) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1]
- %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
- %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
- %tmp7 = insertelement <4 x float> %tmp6, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp7
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retl
+ %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0
+ %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1
+ %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 2
+ %tmp7 = insertelement <4 x float> %tmp6, float 0.000000e+00, i32 3
+ ret <4 x float> %tmp7
}
define <2 x i64> @test(i32 %a) nounwind {
- %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 ; <<8 x i16>> [#uses=1]
- %tmp6 = insertelement <4 x i32> %tmp, i32 0, i32 1 ; <<8 x i32>> [#uses=1]
- %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 ; <<8 x i32>> [#uses=1]
- %tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3 ; <<8 x i32>> [#uses=1]
- %tmp19 = bitcast <4 x i32> %tmp10 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp19
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retl
+ %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0
+ %tmp6 = insertelement <4 x i32> %tmp, i32 0, i32 1
+ %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2
+ %tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3
+ %tmp19 = bitcast <4 x i32> %tmp10 to <2 x i64>
+ ret <2 x i64> %tmp19
}
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index a13c813ea7b0..ee4a08599968 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s
define <4 x float> @test(float %a) {
; CHECK-LABEL: test:
-; CHECK: insertps $29, {{.*}}, %xmm0
+; CHECK: # BB#0:
+; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero
; CHECK-NEXT: retl
-
-entry:
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
%tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2
%tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3
@@ -14,11 +14,10 @@ entry:
define <2 x i64> @test2(i32 %a) {
; CHECK-LABEL: test2:
-; CHECK: movd {{.*}}, %xmm0
+; CHECK: # BB#0:
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; CHECK-NEXT: retl
-
-entry:
%tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
%tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3
%tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
@@ -27,9 +26,9 @@ entry:
define <4 x float> @test3(<4 x float> %A) {
; CHECK-LABEL: test3:
-; CHECK: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; CHECK: # BB#0:
+; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; CHECK-NEXT: retl
-
%tmp0 = extractelement <4 x float> %A, i32 0
%tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
%tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2
diff --git a/test/CodeGen/X86/vec_set-4.ll b/test/CodeGen/X86/vec_set-4.ll
index 332c8b70760f..8f35529d61b4 100644
--- a/test/CodeGen/X86/vec_set-4.ll
+++ b/test/CodeGen/X86/vec_set-4.ll
@@ -1,24 +1,34 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pinsrw | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
define <2 x i64> @test(i16 %a) nounwind {
-entry:
- %tmp10 = insertelement <8 x i16> zeroinitializer, i16 %a, i32 3 ; <<8 x i16>> [#uses=1]
- %tmp12 = insertelement <8 x i16> %tmp10, i16 0, i32 4 ; <<8 x i16>> [#uses=1]
- %tmp14 = insertelement <8 x i16> %tmp12, i16 0, i32 5 ; <<8 x i16>> [#uses=1]
- %tmp16 = insertelement <8 x i16> %tmp14, i16 0, i32 6 ; <<8 x i16>> [#uses=1]
- %tmp18 = insertelement <8 x i16> %tmp16, i16 0, i32 7 ; <<8 x i16>> [#uses=1]
- %tmp19 = bitcast <8 x i16> %tmp18 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp19
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: retl
+ %tmp10 = insertelement <8 x i16> zeroinitializer, i16 %a, i32 3
+ %tmp12 = insertelement <8 x i16> %tmp10, i16 0, i32 4
+ %tmp14 = insertelement <8 x i16> %tmp12, i16 0, i32 5
+ %tmp16 = insertelement <8 x i16> %tmp14, i16 0, i32 6
+ %tmp18 = insertelement <8 x i16> %tmp16, i16 0, i32 7
+ %tmp19 = bitcast <8 x i16> %tmp18 to <2 x i64>
+ ret <2 x i64> %tmp19
}
define <2 x i64> @test2(i8 %a) nounwind {
-entry:
- %tmp24 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 10 ; <<16 x i8>> [#uses=1]
- %tmp26 = insertelement <16 x i8> %tmp24, i8 0, i32 11 ; <<16 x i8>> [#uses=1]
- %tmp28 = insertelement <16 x i8> %tmp26, i8 0, i32 12 ; <<16 x i8>> [#uses=1]
- %tmp30 = insertelement <16 x i8> %tmp28, i8 0, i32 13 ; <<16 x i8>> [#uses=1]
- %tmp32 = insertelement <16 x i8> %tmp30, i8 0, i32 14 ; <<16 x i8>> [#uses=1]
- %tmp34 = insertelement <16 x i8> %tmp32, i8 0, i32 15 ; <<16 x i8>> [#uses=1]
- %tmp35 = bitcast <16 x i8> %tmp34 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp35
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: pinsrw $5, %eax, %xmm0
+; CHECK-NEXT: retl
+ %tmp24 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 10
+ %tmp26 = insertelement <16 x i8> %tmp24, i8 0, i32 11
+ %tmp28 = insertelement <16 x i8> %tmp26, i8 0, i32 12
+ %tmp30 = insertelement <16 x i8> %tmp28, i8 0, i32 13
+ %tmp32 = insertelement <16 x i8> %tmp30, i8 0, i32 14
+ %tmp34 = insertelement <16 x i8> %tmp32, i8 0, i32 15
+ %tmp35 = bitcast <16 x i8> %tmp34 to <2 x i64>
+ ret <2 x i64> %tmp35
}
diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll
index 0713d956ee44..4429834b8ef0 100644
--- a/test/CodeGen/X86/vec_set-6.ll
+++ b/test/CodeGen/X86/vec_set-6.ll
@@ -1,12 +1,16 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
-; RUN: grep movss %t | count 1
-; RUN: grep movq %t | count 1
-; RUN: grep shufps %t | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s
define <4 x float> @test(float %a, float %b, float %c) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 ; <<4 x float>> [#uses=1]
- %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2 ; <<4 x float>> [#uses=1]
- %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3 ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp10
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
+; CHECK-NEXT: retl
+ %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
+ %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2
+ %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3
+ ret <4 x float> %tmp10
}
diff --git a/test/CodeGen/X86/vec_set-7.ll b/test/CodeGen/X86/vec_set-7.ll
index 1701e491da66..e8fe6debb140 100644
--- a/test/CodeGen/X86/vec_set-7.ll
+++ b/test/CodeGen/X86/vec_set-7.ll
@@ -1,11 +1,17 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movsd | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
define <2 x i64> @test(<2 x i64>* %p) nounwind {
- %tmp = bitcast <2 x i64>* %p to double*
- %tmp.upgrd.1 = load double, double* %tmp
- %tmp.upgrd.2 = insertelement <2 x double> undef, double %tmp.upgrd.1, i32 0
- %tmp5 = insertelement <2 x double> %tmp.upgrd.2, double 0.0, i32 1
- %tmp.upgrd.3 = bitcast <2 x double> %tmp5 to <2 x i64>
- ret <2 x i64> %tmp.upgrd.3
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retl
+ %tmp = bitcast <2 x i64>* %p to double*
+ %tmp.upgrd.1 = load double, double* %tmp
+ %tmp.upgrd.2 = insertelement <2 x double> undef, double %tmp.upgrd.1, i32 0
+ %tmp5 = insertelement <2 x double> %tmp.upgrd.2, double 0.0, i32 1
+ %tmp.upgrd.3 = bitcast <2 x double> %tmp5 to <2 x i64>
+ ret <2 x i64> %tmp.upgrd.3
}
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
index 41061ae7ac23..560e5c568faf 100644
--- a/test/CodeGen/X86/vec_set-8.ll
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -1,13 +1,12 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
-; CHECK-NOT: movsd
-; CHECK: movd {{%rdi|%rcx}}, %xmm0
-; CHECK-NOT: movsd
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s
define <2 x i64> @test(i64 %i) nounwind {
-entry:
- %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
- %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
- ret <2 x i64> %tmp11
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movd %rdi, %xmm0
+; CHECK-NEXT: retq
+ %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
+ %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
+ ret <2 x i64> %tmp11
}
-
diff --git a/test/CodeGen/X86/vec_set-A.ll b/test/CodeGen/X86/vec_set-A.ll
index 92dda4c11b88..cae39a3d775b 100644
--- a/test/CodeGen/X86/vec_set-A.ll
+++ b/test/CodeGen/X86/vec_set-A.ll
@@ -1,7 +1,12 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-; CHECK: movl $1, %{{.*}}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
+
define <2 x i64> @test1() nounwind {
-entry:
- ret <2 x i64> < i64 1, i64 0 >
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: retl
+ ret <2 x i64> < i64 1, i64 0 >
}
diff --git a/test/CodeGen/X86/vec_set-B.ll b/test/CodeGen/X86/vec_set-B.ll
index 5578ecaf0007..0580a3376656 100644
--- a/test/CodeGen/X86/vec_set-B.ll
+++ b/test/CodeGen/X86/vec_set-B.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep esp | count 2
-
-; CHECK-NOT: movaps
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
; These should both generate something like this:
;_test3:
@@ -11,16 +9,26 @@
; ret
define <2 x i64> @test3(i64 %arg) nounwind {
-entry:
- %A = and i64 %arg, 1234567
- %B = insertelement <2 x i64> zeroinitializer, i64 %A, i32 0
- ret <2 x i64> %B
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687
+; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: retl
+ %A = and i64 %arg, 1234567
+ %B = insertelement <2 x i64> zeroinitializer, i64 %A, i32 0
+ ret <2 x i64> %B
}
define <2 x i64> @test2(i64 %arg) nounwind {
-entry:
- %A = and i64 %arg, 1234567
- %B = insertelement <2 x i64> undef, i64 %A, i32 0
- ret <2 x i64> %B
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687
+; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: retl
+ %A = and i64 %arg, 1234567
+ %B = insertelement <2 x i64> undef, i64 %A, i32 0
+ ret <2 x i64> %B
}
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
index 052da30a6bb8..cbcac34ce4a5 100644
--- a/test/CodeGen/X86/vec_set-C.ll
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2,-avx | grep movq
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2,-avx | grep mov | count 1
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | grep movd
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(i64 %x) nounwind {
- %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
- ret <2 x i64> %tmp8
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: retq
+ %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
+ ret <2 x i64> %tmp8
}
diff --git a/test/CodeGen/X86/vec_set-D.ll b/test/CodeGen/X86/vec_set-D.ll
index 9c1e1acf0bab..f736a4ab45be 100644
--- a/test/CodeGen/X86/vec_set-D.ll
+++ b/test/CodeGen/X86/vec_set-D.ll
@@ -1,9 +1,12 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-
-; CHECK: movq
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
define <4 x i32> @t(i32 %x, i32 %y) nounwind {
- %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
- %tmp2 = insertelement <4 x i32> %tmp1, i32 %y, i32 1
- ret <4 x i32> %tmp2
+; CHECK-LABEL: t:
+; CHECK: # BB#0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retl
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
+ %tmp2 = insertelement <4 x i32> %tmp1, i32 %y, i32 1
+ ret <4 x i32> %tmp2
}
diff --git a/test/CodeGen/X86/vec_set-F.ll b/test/CodeGen/X86/vec_set-F.ll
index aa17f9bfbf5c..e69d8f4fc4da 100644
--- a/test/CodeGen/X86/vec_set-F.ll
+++ b/test/CodeGen/X86/vec_set-F.ll
@@ -1,19 +1,27 @@
-; RUN: llc < %s -mtriple=i686-linux -mattr=+sse2 | grep movq
-; RUN: llc < %s -mtriple=i686-linux -mattr=+sse2 | grep movsd
-; RUN: llc < %s -mtriple=i686-linux -mattr=+sse2 | grep mov | count 3
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux -mattr=+sse2 | FileCheck %s
define <2 x i64> @t1(<2 x i64>* %ptr) nounwind {
- %tmp45 = bitcast <2 x i64>* %ptr to <2 x i32>*
- %tmp615 = load <2 x i32>, <2 x i32>* %tmp45
- %tmp7 = bitcast <2 x i32> %tmp615 to i64
- %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %tmp7, i32 0
- ret <2 x i64> %tmp8
+; CHECK-LABEL: t1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retl
+ %tmp45 = bitcast <2 x i64>* %ptr to <2 x i32>*
+ %tmp615 = load <2 x i32>, <2 x i32>* %tmp45
+ %tmp7 = bitcast <2 x i32> %tmp615 to i64
+ %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %tmp7, i32 0
+ ret <2 x i64> %tmp8
}
define <2 x i64> @t2(i64 %x) nounwind {
- %tmp717 = bitcast i64 %x to double
- %tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0
- %tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1
- %tmp11 = bitcast <2 x double> %tmp9 to <2 x i64>
- ret <2 x i64> %tmp11
+; CHECK-LABEL: t2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retl
+ %tmp717 = bitcast i64 %x to double
+ %tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0
+ %tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1
+ %tmp11 = bitcast <2 x double> %tmp9 to <2 x i64>
+ ret <2 x i64> %tmp11
}
diff --git a/test/CodeGen/X86/vec_set-H.ll b/test/CodeGen/X86/vec_set-H.ll
index 5037e36d3fd5..af8ac70c5b3d 100644
--- a/test/CodeGen/X86/vec_set-H.ll
+++ b/test/CodeGen/X86/vec_set-H.ll
@@ -1,15 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep movz
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
define <2 x i64> @doload64(i16 signext %x) nounwind {
-entry:
- %tmp36 = insertelement <8 x i16> undef, i16 %x, i32 0 ; <<8 x i16>> [#uses=1]
- %tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1 ; <<8 x i16>> [#uses=1]
- %tmp38 = insertelement <8 x i16> %tmp37, i16 %x, i32 2 ; <<8 x i16>> [#uses=1]
- %tmp39 = insertelement <8 x i16> %tmp38, i16 %x, i32 3 ; <<8 x i16>> [#uses=1]
- %tmp40 = insertelement <8 x i16> %tmp39, i16 %x, i32 4 ; <<8 x i16>> [#uses=1]
- %tmp41 = insertelement <8 x i16> %tmp40, i16 %x, i32 5 ; <<8 x i16>> [#uses=1]
- %tmp42 = insertelement <8 x i16> %tmp41, i16 %x, i32 6 ; <<8 x i16>> [#uses=1]
- %tmp43 = insertelement <8 x i16> %tmp42, i16 %x, i32 7 ; <<8 x i16>> [#uses=1]
- %tmp46 = bitcast <8 x i16> %tmp43 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp46
+; CHECK-LABEL: doload64:
+; CHECK: # BB#0:
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: retl
+ %tmp36 = insertelement <8 x i16> undef, i16 %x, i32 0
+ %tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1
+ %tmp38 = insertelement <8 x i16> %tmp37, i16 %x, i32 2
+ %tmp39 = insertelement <8 x i16> %tmp38, i16 %x, i32 3
+ %tmp40 = insertelement <8 x i16> %tmp39, i16 %x, i32 4
+ %tmp41 = insertelement <8 x i16> %tmp40, i16 %x, i32 5
+ %tmp42 = insertelement <8 x i16> %tmp41, i16 %x, i32 6
+ %tmp43 = insertelement <8 x i16> %tmp42, i16 %x, i32 7
+ %tmp46 = bitcast <8 x i16> %tmp43 to <2 x i64>
+ ret <2 x i64> %tmp46
}
diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll
index 53d880b4bbdd..49bd3beef75a 100644
--- a/test/CodeGen/X86/vec_set.ll
+++ b/test/CodeGen/X86/vec_set.ll
@@ -1,15 +1,36 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep punpckl | count 7
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s
define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
- %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0 ; <<8 x i16>> [#uses=1]
- %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1 ; <<8 x i16>> [#uses=1]
- %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2 ; <<8 x i16>> [#uses=1]
- %tmp6 = insertelement <8 x i16> %tmp4, i16 %a3, i32 3 ; <<8 x i16>> [#uses=1]
- %tmp8 = insertelement <8 x i16> %tmp6, i16 %a4, i32 4 ; <<8 x i16>> [#uses=1]
- %tmp10 = insertelement <8 x i16> %tmp8, i16 %a5, i32 5 ; <<8 x i16>> [#uses=1]
- %tmp12 = insertelement <8 x i16> %tmp10, i16 %a6, i32 6 ; <<8 x i16>> [#uses=1]
- %tmp14 = insertelement <8 x i16> %tmp12, i16 %a7, i32 7 ; <<8 x i16>> [#uses=1]
- store <8 x i16> %tmp14, <8 x i16>* %b
- ret void
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; CHECK-NEXT: movdqa %xmm3, (%eax)
+; CHECK-NEXT: retl
+ %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0
+ %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1
+ %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2
+ %tmp6 = insertelement <8 x i16> %tmp4, i16 %a3, i32 3
+ %tmp8 = insertelement <8 x i16> %tmp6, i16 %a4, i32 4
+ %tmp10 = insertelement <8 x i16> %tmp8, i16 %a5, i32 5
+ %tmp12 = insertelement <8 x i16> %tmp10, i16 %a6, i32 6
+ %tmp14 = insertelement <8 x i16> %tmp12, i16 %a7, i32 7
+ store <8 x i16> %tmp14, <8 x i16>* %b
+ ret void
}
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index b69f90cd6e2f..1eef0be2dbbb 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -1,179 +1,199 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse4.1 | FileCheck %s -check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx | FileCheck %s -check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
define <16 x i8> @v16i8_icmp_uge(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
+; SSE-LABEL: v16i8_icmp_uge:
+; SSE: # BB#0:
+; SSE-NEXT: pmaxub %xmm0, %xmm1
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: v16i8_icmp_uge:
+; AVX: # BB#0:
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = icmp uge <16 x i8> %a, %b
%2 = sext <16 x i1> %1 to <16 x i8>
ret <16 x i8> %2
-; SSE2-LABEL: v16i8_icmp_uge:
-; SSE2: pmaxub %xmm0, %xmm1
-; SSE2: pcmpeqb %xmm1, %xmm0
-
-; SSE41-LABEL: v16i8_icmp_uge:
-; SSE41: pmaxub %xmm0, %xmm1
-; SSE41: pcmpeqb %xmm1, %xmm0
-
-; AVX-LABEL: v16i8_icmp_uge:
-; AVX: vpmaxub %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqb %xmm1, %xmm0, %xmm0
}
define <16 x i8> @v16i8_icmp_ule(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
+; SSE-LABEL: v16i8_icmp_ule:
+; SSE: # BB#0:
+; SSE-NEXT: pminub %xmm0, %xmm1
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: v16i8_icmp_ule:
+; AVX: # BB#0:
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = icmp ule <16 x i8> %a, %b
%2 = sext <16 x i1> %1 to <16 x i8>
ret <16 x i8> %2
-; SSE2-LABEL: v16i8_icmp_ule:
-; SSE2: pminub %xmm0, %xmm1
-; SSE2: pcmpeqb %xmm1, %xmm0
-
-; SSE41-LABEL: v16i8_icmp_ule:
-; SSE41: pminub %xmm0, %xmm1
-; SSE41: pcmpeqb %xmm1, %xmm0
-
-; AVX-LABEL: v16i8_icmp_ule:
-; AVX: vpminub %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqb %xmm1, %xmm0, %xmm0
}
-
define <8 x i16> @v8i16_icmp_uge(<8 x i16> %a, <8 x i16> %b) nounwind readnone ssp uwtable {
- %1 = icmp uge <8 x i16> %a, %b
- %2 = sext <8 x i1> %1 to <8 x i16>
- ret <8 x i16> %2
; SSE2-LABEL: v8i16_icmp_uge:
-; SSE2: psubusw %xmm0, %xmm1
-; SEE2: pxor %xmm0, %xmm0
-; SSE2: pcmpeqw %xmm1, %xmm0
-
+; SSE2: # BB#0:
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: v8i16_icmp_uge:
-; SSE41: pmaxuw %xmm0, %xmm1
-; SSE41: pcmpeqw %xmm1, %xmm0
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pmaxuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: v8i16_icmp_uge:
-; AVX: vpmaxuw %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX: # BB#0:
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp uge <8 x i16> %a, %b
+ %2 = sext <8 x i1> %1 to <8 x i16>
+ ret <8 x i16> %2
}
define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone ssp uwtable {
- %1 = icmp ule <8 x i16> %a, %b
- %2 = sext <8 x i1> %1 to <8 x i16>
- ret <8 x i16> %2
; SSE2-LABEL: v8i16_icmp_ule:
-; SSE2: psubusw %xmm1, %xmm0
-; SSE2: pxor %xmm1, %xmm1
-; SSE2: pcmpeqw %xmm1, %xmm0
-
+; SSE2: # BB#0:
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: v8i16_icmp_ule:
-; SSE41: pminuw %xmm0, %xmm1
-; SSE41: pcmpeqw %xmm1, %xmm0
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pminuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: v8i16_icmp_ule:
-; AVX: vpminuw %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX: # BB#0:
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp ule <8 x i16> %a, %b
+ %2 = sext <8 x i1> %1 to <8 x i16>
+ ret <8 x i16> %2
}
-
define <4 x i32> @v4i32_icmp_uge(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp uwtable {
- %1 = icmp uge <4 x i32> %a, %b
- %2 = sext <4 x i1> %1 to <4 x i32>
- ret <4 x i32> %2
; SSE2-LABEL: v4i32_icmp_uge:
-; SSE2: movdqa {{.*}}(%rip), %xmm2
-; SSE2: pxor %xmm2, %xmm0
-; SSE2: pxor %xmm1, %xmm2
-; SSE2: pcmpgtd %xmm0, %xmm2
-; SSE2: pcmpeqd %xmm0, %xmm0
-; SSE2: pxor %xmm2, %xmm0
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: v4i32_icmp_uge:
-; SSE41: pmaxud %xmm0, %xmm1
-; SSE41: pcmpeqd %xmm1, %xmm0
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pmaxud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: v4i32_icmp_uge:
-; AVX: vpmaxud %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX: # BB#0:
+; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp uge <4 x i32> %a, %b
+ %2 = sext <4 x i1> %1 to <4 x i32>
+ ret <4 x i32> %2
}
define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp uwtable {
- %1 = icmp ule <4 x i32> %a, %b
- %2 = sext <4 x i1> %1 to <4 x i32>
- ret <4 x i32> %2
; SSE2-LABEL: v4i32_icmp_ule:
-; SSE2: movdqa {{.*}}(%rip), %xmm2
-; SSE2: pxor %xmm2, %xmm1
-; SSE2: pxor %xmm2, %xmm0
-; SSE2: pcmpgtd %xmm1, %xmm0
-; SSE2: pcmpeqd %xmm1, %xmm1
-; SSE2: pxor %xmm1, %xmm0
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: v4i32_icmp_ule:
-; SSE41: pminud %xmm0, %xmm1
-; SSE41: pcmpeqd %xmm1, %xmm0
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pminud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: v4i32_icmp_ule:
-; AVX: pminud %xmm1, %xmm0, %xmm1
-; AVX: pcmpeqd %xmm1, %xmm0, %xmm0
+; AVX: # BB#0:
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp ule <4 x i32> %a, %b
+ %2 = sext <4 x i1> %1 to <4 x i32>
+ ret <4 x i32> %2
}
; At one point we were incorrectly constant-folding a setcc to 0x1 instead of
; 0xff, leading to a constpool load. The instruction doesn't matter here, but it
; should set all bits to 1.
define <16 x i8> @test_setcc_constfold_vi8(<16 x i8> %l, <16 x i8> %r) {
+; SSE-LABEL: test_setcc_constfold_vi8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_setcc_constfold_vi8:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%test1 = icmp eq <16 x i8> %l, %r
%mask1 = sext <16 x i1> %test1 to <16 x i8>
-
%test2 = icmp ne <16 x i8> %l, %r
%mask2 = sext <16 x i1> %test2 to <16 x i8>
-
%res = or <16 x i8> %mask1, %mask2
ret <16 x i8> %res
-; SSE2-LABEL: test_setcc_constfold_vi8:
-; SSE2: pcmpeqd %xmm0, %xmm0
-
-; SSE41-LABEL: test_setcc_constfold_vi8:
-; SSE41: pcmpeqd %xmm0, %xmm0
-
-; AVX-LABEL: test_setcc_constfold_vi8:
-; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
}
; Make sure sensible results come from doing extension afterwards
define <16 x i8> @test_setcc_constfold_vi1(<16 x i8> %l, <16 x i8> %r) {
+; SSE-LABEL: test_setcc_constfold_vi1:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_setcc_constfold_vi1:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%test1 = icmp eq <16 x i8> %l, %r
%test2 = icmp ne <16 x i8> %l, %r
-
%res = or <16 x i1> %test1, %test2
%mask = sext <16 x i1> %res to <16 x i8>
ret <16 x i8> %mask
-; SSE2-LABEL: test_setcc_constfold_vi1:
-; SSE2: pcmpeqd %xmm0, %xmm0
-
-; SSE41-LABEL: test_setcc_constfold_vi1:
-; SSE41: pcmpeqd %xmm0, %xmm0
-
-; AVX-LABEL: test_setcc_constfold_vi1:
-; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
}
-
; 64-bit case is also particularly important, as the constant "-1" is probably
; just 32-bits wide.
define <2 x i64> @test_setcc_constfold_vi64(<2 x i64> %l, <2 x i64> %r) {
+; SSE-LABEL: test_setcc_constfold_vi64:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_setcc_constfold_vi64:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%test1 = icmp eq <2 x i64> %l, %r
%mask1 = sext <2 x i1> %test1 to <2 x i64>
-
%test2 = icmp ne <2 x i64> %l, %r
%mask2 = sext <2 x i1> %test2 to <2 x i64>
-
%res = or <2 x i64> %mask1, %mask2
ret <2 x i64> %res
-; SSE2-LABEL: test_setcc_constfold_vi64:
-; SSE2: pcmpeqd %xmm0, %xmm0
-
-; SSE41-LABEL: test_setcc_constfold_vi64:
-; SSE41: pcmpeqd %xmm0, %xmm0
-
-; AVX-LABEL: test_setcc_constfold_vi64:
-; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
}
diff --git a/test/CodeGen/X86/vec_shift.ll b/test/CodeGen/X86/vec_shift.ll
index ddf0469b72a7..55b55936634d 100644
--- a/test/CodeGen/X86/vec_shift.ll
+++ b/test/CodeGen/X86/vec_shift.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psllw
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psrlq
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psraw
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0: # %entry
+; X32-NEXT: psllw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllw %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp6 = bitcast <2 x i64> %c to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp8 = bitcast <2 x i64> %b1 to <8 x i16> ; <<8 x i16>> [#uses=1]
@@ -12,6 +21,17 @@ entry:
}
define <2 x i64> @t3(<2 x i64> %b1, i32 %c) nounwind {
+; X32-LABEL: t3:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psraw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psraw %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp2 = bitcast <2 x i64> %b1 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp4 = insertelement <4 x i32> undef, i32 %c, i32 0 ; <<4 x i32>> [#uses=1]
@@ -21,14 +41,23 @@ entry:
ret <2 x i64> %tmp11
}
-declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0: # %entry
+; X32-NEXT: psrlq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0: # %entry
+; X64-NEXT: psrlq %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp9 = tail call <2 x i64> @llvm.x86.sse2.psrl.q( <2 x i64> %b1, <2 x i64> %c ) nounwind readnone ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp9
}
-declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
-declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/vec_shift2.ll b/test/CodeGen/X86/vec_shift2.ll
index c5f9dc4ace32..21d599fead08 100644
--- a/test/CodeGen/X86/vec_shift2.ll
+++ b/test/CodeGen/X86/vec_shift2.ll
@@ -1,6 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep CPI
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movl $14, %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: psrlw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movl $14, %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: psrlw %xmm1, %xmm0
+; X64-NEXT: retq
%tmp1 = bitcast <2 x i64> %b1 to <8 x i16>
%tmp2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w( <8 x i16> %tmp1, <8 x i16> bitcast (<4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > to <8 x i16>) ) nounwind readnone
%tmp3 = bitcast <8 x i16> %tmp2 to <2 x i64>
@@ -8,10 +23,23 @@ define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
}
define <4 x i32> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: movl $14, %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movl $14, %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
+; X64-NEXT: retq
%tmp1 = bitcast <2 x i64> %b1 to <4 x i32>
%tmp2 = tail call <4 x i32> @llvm.x86.sse2.psll.d( <4 x i32> %tmp1, <4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > ) nounwind readnone
ret <4 x i32> %tmp2
}
-declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/vec_shift3.ll b/test/CodeGen/X86/vec_shift3.ll
index 1ebf455c0555..071f0d38b96d 100644
--- a/test/CodeGen/X86/vec_shift3.ll
+++ b/test/CodeGen/X86/vec_shift3.ll
@@ -1,20 +1,51 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psllq
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psraw
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movd | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(<2 x i64> %x1, i32 %bits) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psllq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psllq %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp3 = tail call <2 x i64> @llvm.x86.sse2.pslli.q( <2 x i64> %x1, i32 %bits ) nounwind readnone ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp3
}
define <2 x i64> @t2(<2 x i64> %x1) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0: # %entry
+; X32-NEXT: psllq $10, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllq $10, %xmm0
+; X64-NEXT: retq
entry:
%tmp3 = tail call <2 x i64> @llvm.x86.sse2.pslli.q( <2 x i64> %x1, i32 10 ) nounwind readnone ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp3
}
define <2 x i64> @t3(<2 x i64> %x1, i32 %bits) nounwind {
+; X32-LABEL: t3:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psraw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psraw %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp2 = bitcast <2 x i64> %x1 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp4 = tail call <8 x i16> @llvm.x86.sse2.psrai.w( <8 x i16> %tmp2, i32 %bits ) nounwind readnone ; <<8 x i16>> [#uses=1]
@@ -22,5 +53,5 @@ entry:
ret <2 x i64> %tmp5
}
-declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
-declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
index b266a6987557..66229361990f 100644
--- a/test/CodeGen/X86/vec_shift4.ll
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -1,6 +1,23 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
+; X32-LABEL: shl1:
+; X32: # BB#0: # %entry
+; X32-NEXT: pslld $23, %xmm1
+; X32-NEXT: paddd {{\.LCPI.*}}, %xmm1
+; X32-NEXT: cvttps2dq %xmm1, %xmm1
+; X32-NEXT: pmulld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shl1:
+; X64: # BB#0: # %entry
+; X64-NEXT: pslld $23, %xmm1
+; X64-NEXT: paddd {{.*}}(%rip), %xmm1
+; X64-NEXT: cvttps2dq %xmm1, %xmm1
+; X64-NEXT: pmulld %xmm1, %xmm0
+; X64-NEXT: retq
entry:
; CHECK-NOT: shll
; CHECK: pslld
@@ -14,6 +31,51 @@ entry:
}
define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
+; X32-LABEL: shl2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movdqa %xmm0, %xmm2
+; X32-NEXT: psllw $5, %xmm1
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: psllw $4, %xmm3
+; X32-NEXT: pand {{\.LCPI.*}}, %xmm3
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: psllw $2, %xmm3
+; X32-NEXT: pand {{\.LCPI.*}}, %xmm3
+; X32-NEXT: paddb %xmm1, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: paddb %xmm3, %xmm3
+; X32-NEXT: paddb %xmm1, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shl2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: psllw $5, %xmm1
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: psllw $4, %xmm3
+; X64-NEXT: pand {{.*}}(%rip), %xmm3
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: psllw $2, %xmm3
+; X64-NEXT: pand {{.*}}(%rip), %xmm3
+; X64-NEXT: paddb %xmm1, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: paddb %xmm3, %xmm3
+; X64-NEXT: paddb %xmm1, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: retq
entry:
; CHECK-NOT: shlb
; CHECK: pblendvb
diff --git a/test/CodeGen/X86/vec_shift5.ll b/test/CodeGen/X86/vec_shift5.ll
index 499aa22de52d..cba2b5d05041 100644
--- a/test/CodeGen/X86/vec_shift5.ll
+++ b/test/CodeGen/X86/vec_shift5.ll
@@ -1,153 +1,238 @@
-; RUN: llc -march=x86-64 -mattr=+sse2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; Verify that we correctly fold target specific packed vector shifts by
; immediate count into a simple build_vector when the elements of the input
; vector to the packed shift are all constants or undef.
define <8 x i16> @test1() {
+; X32-LABEL: test1:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test1
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
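; Illustrative note, not part of the original test: with an all-constant operand the
; shift-by-immediate folds away at compile time; each lane of <1,2,4,8,1,2,4,8>
; shifted left by 3 gives <8,16,32,64,8,16,32,64>, the exact constant loaded by the
; movaps in the X32/X64 checks above.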
define <8 x i16> @test2() {
+; X32-LABEL: test2:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test2
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test3() {
+; X32-LABEL: test3:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test3
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test4() {
+; X32-LABEL: test4:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64]
+; X32-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64]
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test4
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test5() {
+; X32-LABEL: test5:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test5:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test5
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test6() {
+; X32-LABEL: test6:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test6:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test6
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <2 x i64> @test7() {
+; X32-LABEL: test7:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = [1,0,2,0]
+; X32-NEXT: psllq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test7:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16]
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
ret <2 x i64> %1
}
-; CHECK-LABEL: test7
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
define <2 x i64> @test8() {
+; X32-LABEL: test8:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = [8,0,16,0]
+; X32-NEXT: psrlq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test8:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [1,2]
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
ret <2 x i64> %1
}
-; CHECK-LABEL: test8
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test9() {
+; X32-LABEL: test9:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test9:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test9
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test10() {
+; X32-LABEL: test10:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X32-NEXT: retl
+;
+; X64-LABEL: test10:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test10
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <2 x i64> @test11() {
+; X32-LABEL: test11:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = <u,u,31,0>
+; X32-NEXT: psrlq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test11:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,3>
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
ret <2 x i64> %1
}
-; CHECK-LABEL: test11
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test12() {
+; X32-LABEL: test12:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test12:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test12
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test13() {
+; X32-LABEL: test13:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X32-NEXT: retl
+;
+; X64-LABEL: test13:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test13
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test14() {
+; X32-LABEL: test14:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test14:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test14
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test15() {
+; X32-LABEL: test15:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <u,64,u,256>
+; X32-NEXT: retl
+;
+; X64-LABEL: test15:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,64,u,256>
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test15
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
define <2 x i64> @test16() {
+; X32-LABEL: test16:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = <u,u,31,0>
+; X32-NEXT: psllq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test16:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,248>
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
ret <2 x i64> %1
}
-; CHECK-LABEL: test16
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
-
declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
index b71f9893a9db..c4b7f204be69 100644
--- a/test/CodeGen/X86/vec_shift6.ll
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -1,134 +1,229 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefix=AVX512
; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce a SSE2 packed integer multiply (pmullw) instead.
define <8 x i16> @test1(<8 x i16> %a) {
+; SSE-LABEL: test1:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test1:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
ret <8 x i16> %shl
}
-; CHECK-LABEL: test1
-; CHECK: pmullw
-; CHECK-NEXT: ret
-
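; Illustrative sketch, not part of the original test: a shift left by a constant
; vector is equivalent to a multiply because x << n == x * 2^n, which is why a
; single pmullw/vpmullw is expected rather than scalarized shifts. For test1's
; counts <1,1,2,3,7,0,9,11> the multiplier vector is <2,2,4,8,128,1,512,2048>, the
; same constant that shows up in the test5 SSE checks below. A standalone function
; (with a new, hypothetical name) spelling out that equivalence:
define <8 x i16> @test1_as_mul(<8 x i16> %a) {
  %mul = mul <8 x i16> %a, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>
  ret <8 x i16> %mul
}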
define <8 x i16> @test2(<8 x i16> %a) {
+; SSE-LABEL: test2:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test2:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test2:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
ret <8 x i16> %shl
}
-; CHECK-LABEL: test2
-; CHECK: pmullw
-; CHECK-NEXT: ret
-
; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into a SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.
define <4 x i32> @test3(<4 x i32> %a) {
+; SSE-LABEL: test3:
+; SSE: # BB#0:
+; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test3:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
ret <4 x i32> %shl
}
-; CHECK-LABEL: test3
-; CHECK-NOT: cvttps2dq
-; SSE: pmulld
-; AVX2: vpsllvd
-; CHECK-NEXT: ret
-
define <4 x i32> @test4(<4 x i32> %a) {
+; SSE-LABEL: test4:
+; SSE: # BB#0:
+; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test4:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
ret <4 x i32> %shl
}
-; CHECK-LABEL: test4
-; CHECK-NOT: cvttps2dq
-; SSE: pmulld
-; AVX2: vpsllvd
-; CHECK-NEXT: ret
-
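; Illustrative note, not part of the original test: because the shift counts are a
; constant build_vector, the 2^n multipliers can be materialized directly from the
; constant pool, so a single pmulld is enough and the pslld/paddd/cvttps2dq sequence
; needed for variable counts is avoided; test4's counts <0,0,1,1>, for example,
; correspond to a multiplier vector of <1,1,2,2>.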
; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
define <16 x i16> @test5(<16 x i16> %a) {
+; SSE-LABEL: test5:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test5:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test5:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test5
-; SSE: pmullw
-; SSE-NEXT: pmullw
-; AVX2: vpmullw
-; AVX2-NOT: vpmullw
-; CHECK: ret
-
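; Illustrative note, not part of the original test: without AVX2 a <16 x i16> value
; is legalized as two <8 x i16> halves, so the single IR shl becomes two pmullw
; instructions sharing one 128-bit multiplier constant ([2,2,4,8,128,1,512,2048] in
; the SSE checks above), while with AVX2/AVX512 the whole vector fits in a ymm
; register and one vpmullw suffices.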
; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.
define <8 x i32> @test6(<8 x i32> %a) {
+; SSE-LABEL: test6:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
+; SSE-NEXT: pmulld %xmm2, %xmm0
+; SSE-NEXT: pmulld %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test6:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test6:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test6
-; SSE: pmulld
-; SSE-NEXT: pmulld
-; AVX2: vpsllvd
-; CHECK: ret
-
; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE2 instead, we split the shift in four
; parts and then we convert each part into a pmullw.
define <32 x i16> @test7(<32 x i16> %a) {
+; SSE-LABEL: test7:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
+; SSE-NEXT: pmullw %xmm4, %xmm0
+; SSE-NEXT: pmullw %xmm4, %xmm1
+; SSE-NEXT: pmullw %xmm4, %xmm2
+; SSE-NEXT: pmullw %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test7:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test7:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: retq
%shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
ret <32 x i16> %shl
}
-; CHECK-LABEL: test7
-; SSE: pmullw
-; SSE-NEXT: pmullw
-; SSE-NEXT: pmullw
-; SSE-NEXT: pmullw
-; AVX2: vpmullw
-; AVX2-NEXT: vpmullw
-; CHECK: ret
-
; Similar to test7; the difference is that with AVX512 support
; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
define <16 x i32> @test8(<16 x i32> %a) {
+; SSE-LABEL: test8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
+; SSE-NEXT: pmulld %xmm4, %xmm0
+; SSE-NEXT: pmulld %xmm4, %xmm1
+; SSE-NEXT: pmulld %xmm4, %xmm2
+; SSE-NEXT: pmulld %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: retq
%shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
ret <16 x i32> %shl
}
-; CHECK-LABEL: test8
-; SSE: pmulld
-; SSE-NEXT: pmulld
-; SSE-NEXT: pmulld
-; SSE-NEXT: pmulld
-; AVX2ONLY: vpsllvd
-; AVX2ONLY-NEXT: vpsllvd
-; AVX512: vpsllvd
-; AVX512-NOT: vpsllvd
-; CHECK: ret
-
-; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512f support.
+; The shift from 'test9' is performed separately on each subvector and blended if we don't have AVX2/AVX512f support.
define <8 x i64> @test9(<8 x i64> %a) {
+; SSE-LABEL: test9:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psllq $3, %xmm4
+; SSE-NEXT: psllq $2, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psllq $3, %xmm4
+; SSE-NEXT: psllq $2, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; SSE-NEXT: paddq %xmm0, %xmm0
+; SSE-NEXT: paddq %xmm2, %xmm2
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test9:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
+; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test9:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: retq
%shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
ret <8 x i64> %shl
}
-; CHECK-LABEL: test9
-; AVX2ONLY: vpsllvq
-; AVX2ONLY-NEXT: vpsllvq
-; AVX512: vpsllvq
-; AVX512-NOT: vpsllvq
-; CHECK: ret
-
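; Illustrative note, not part of the original test: without AVX2 the <8 x i64> is
; split into four <2 x i64> registers; the registers whose counts are <1,1> each
; become a single paddq (x << 1 is x + x), while those with counts <2,3> are shifted
; twice (psllq $2 and psllq $3) and the lanes merged with pblendw, which is the
; pattern in the SSE checks above.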
diff --git a/test/CodeGen/X86/vec_shift7.ll b/test/CodeGen/X86/vec_shift7.ll
index cdf828976be4..80d72a4a986f 100644
--- a/test/CodeGen/X86/vec_shift7.ll
+++ b/test/CodeGen/X86/vec_shift7.ll
@@ -1,12 +1,29 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; Verify that we don't fail when shift by zero is encountered.
define i64 @test1(<2 x i64> %a) {
+; X32-LABEL: test1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movdqa %xmm0, %xmm1
+; X32-NEXT: psllq $2, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X32-NEXT: movd %xmm1, %eax
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X32-NEXT: movd %xmm0, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psllq $2, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X64-NEXT: movd %xmm1, %rax
+; X64-NEXT: retq
entry:
%c = shl <2 x i64> %a, <i64 0, i64 2>
%d = extractelement <2 x i64> %c, i32 0
ret i64 %d
}
-; CHECK-LABEL: test1
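; Illustrative note, not part of the original test: the count vector <0, 2> mixes a
; zero and a non-zero amount, so the lowering shifts a copy of the input by 2 and
; uses movsd to merge the untouched low element back in; the point of the test is
; that a zero shift count is handled without failing during lowering.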
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index ab5031e267dc..076f748009b3 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -1,39 +1,55 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s
target datalayout = "e-p:32:32"
-target triple = "i686-apple-darwin8.7.2"
define i16 @test1(float %f) nounwind {
- %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
- %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
- %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
- %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
- %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
- %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
- %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
- %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
- %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
- %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
- ret i16 %tmp69
; CHECK-LABEL: test1:
-; CHECK: subss LCPI0_
-; CHECK: mulss LCPI0_
-; CHECK: minss LCPI0_
+; CHECK: ## BB#0:
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: subss LCPI0_0, %xmm0
+; CHECK-NEXT: mulss LCPI0_1, %xmm0
+; CHECK-NEXT: minss LCPI0_2, %xmm0
+; CHECK-NEXT: maxss %xmm1, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retl
+;
+ %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
+ %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
+ %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
+ %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
+ %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
+ %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
+ %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
+ %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
+ %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
+ %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
+ ret i16 %tmp69
}
define i16 @test2(float %f) nounwind {
- %tmp28 = fsub float %f, 1.000000e+00 ; <float> [#uses=1]
- %tmp37 = fmul float %tmp28, 5.000000e-01 ; <float> [#uses=1]
- %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0 ; <<4 x float>> [#uses=1]
- %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
- %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
- %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
- %tmp69 = trunc i32 %tmp to i16 ; <i16> [#uses=1]
- ret i16 %tmp69
; CHECK-LABEL: test2:
-; CHECK: addss LCPI1_
-; CHECK: mulss LCPI1_
-; CHECK: minss LCPI1_
+; CHECK: ## BB#0:
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: addss LCPI1_0, %xmm0
+; CHECK-NEXT: mulss LCPI1_1, %xmm0
+; CHECK-NEXT: minss LCPI1_2, %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: maxss %xmm1, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retl
+;
+ %tmp28 = fsub float %f, 1.000000e+00 ; <float> [#uses=1]
+ %tmp37 = fmul float %tmp28, 5.000000e-01 ; <float> [#uses=1]
+ %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0 ; <<4 x float>> [#uses=1]
+ %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
+ %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
+ %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
+ %tmp69 = trunc i32 %tmp to i16 ; <i16> [#uses=1]
+ ret i16 %tmp69
}
declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
@@ -46,41 +62,56 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
-
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32)
+
declare <4 x float> @f()
define <4 x float> @test3(<4 x float> %A, float *%b, i32 %C) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: roundss $4, (%eax), %xmm0
+; CHECK-NEXT: retl
+;
%a = load float , float *%b
%B = insertelement <4 x float> undef, float %a, i32 0
%X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4)
ret <4 x float> %X
-; CHECK-LABEL: test3:
-; CHECK: roundss $4, (%eax), %xmm0
}
define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: subl $28, %esp
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, (%esp) ## 16-byte Spill
+; CHECK-NEXT: calll _f
+; CHECK-NEXT: movaps (%esp), %xmm1 ## 16-byte Reload
+; CHECK-NEXT: roundss $4, %xmm1, %xmm0
+; CHECK-NEXT: addl $28, %esp
+; CHECK-NEXT: retl
+;
%a = load float , float *%b
%B = insertelement <4 x float> undef, float %a, i32 0
%q = call <4 x float> @f()
%X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %q, <4 x float> %B, i32 4)
ret <4 x float> %X
-; CHECK-LABEL: test4:
-; CHECK: movss (%eax), %xmm
-; CHECK: call
-; CHECK: roundss $4, %xmm{{.*}}, %xmm0
}
-; PR13576
+; PR13576
define <2 x double> @test5() nounwind uwtable readnone noinline {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
+; CHECK-NEXT: movl $128, %eax
+; CHECK-NEXT: cvtsi2sdl %eax, %xmm0
+; CHECK-NEXT: retl
+;
entry:
%0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double
4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
ret <2 x double> %0
-; CHECK-LABEL: test5:
-; CHECK: mov
-; CHECK: mov
-; CHECK: cvtsi2sd
}
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index dfc186bef052..c0e02bd15996 100644
--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -65,7 +65,9 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
;
; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32:
; AVX512F: # BB#0:
+; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT: # kill
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
@@ -142,7 +144,9 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
;
; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32:
; AVX512F: # BB#0:
+; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT: # kill
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32:
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
new file mode 100644
index 000000000000..5a443991c53f
--- /dev/null
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -0,0 +1,3772 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+
+define i8 @test_bitreverse_i8(i8 %a) nounwind {
+; SSE-LABEL: test_bitreverse_i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: shlb $7, %al
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shlb $5, %cl
+; SSE-NEXT: andb $64, %cl
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shlb $3, %dl
+; SSE-NEXT: andb $32, %dl
+; SSE-NEXT: orb %cl, %dl
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: addb %cl, %cl
+; SSE-NEXT: andb $16, %cl
+; SSE-NEXT: orb %dl, %cl
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrb %dl
+; SSE-NEXT: andb $8, %dl
+; SSE-NEXT: orb %cl, %dl
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrb $3, %cl
+; SSE-NEXT: andb $4, %cl
+; SSE-NEXT: orb %dl, %cl
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrb $5, %dl
+; SSE-NEXT: andb $2, %dl
+; SSE-NEXT: orb %cl, %dl
+; SSE-NEXT: shrb $7, %dil
+; SSE-NEXT: orb %dl, %dil
+; SSE-NEXT: orb %al, %dil
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_i8:
+; AVX: # BB#0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: shlb $7, %al
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shlb $5, %cl
+; AVX-NEXT: andb $64, %cl
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shlb $3, %dl
+; AVX-NEXT: andb $32, %dl
+; AVX-NEXT: orb %cl, %dl
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: addb %cl, %cl
+; AVX-NEXT: andb $16, %cl
+; AVX-NEXT: orb %dl, %cl
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrb %dl
+; AVX-NEXT: andb $8, %dl
+; AVX-NEXT: orb %cl, %dl
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrb $3, %cl
+; AVX-NEXT: andb $4, %cl
+; AVX-NEXT: orb %dl, %cl
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrb $5, %dl
+; AVX-NEXT: andb $2, %dl
+; AVX-NEXT: orb %cl, %dl
+; AVX-NEXT: shrb $7, %dil
+; AVX-NEXT: orb %dl, %dil
+; AVX-NEXT: orb %al, %dil
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_i8:
+; XOP: # BB#0:
+; XOP-NEXT: vmovd %edi, %xmm0
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vpextrb $0, %xmm0, %eax
+; XOP-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; XOP-NEXT: retq
+ %b = call i8 @llvm.bitreverse.i8(i8 %a)
+ ret i8 %b
+}
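; Illustrative note, not part of the original test: without XOP the scalar expansion
; isolates each bit i of the i8 with an and/shift pair, moves it to bit 7-i and ORs
; the pieces together, which is why the SSE/AVX checks above are a long chain of
; shl/shr/and/or on %dil. The XOP path instead uses vpperm, whose per-byte selector
; (loaded from the constant pool here) can deliver a source byte with its bits
; reversed, so a single vpperm plus a vpextrb is enough.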
+
+define i16 @test_bitreverse_i16(i16 %a) nounwind {
+; SSE-LABEL: test_bitreverse_i16:
+; SSE: # BB#0:
+; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: shll $15, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $2, %edx
+; SSE-NEXT: shll $13, %edx
+; SSE-NEXT: leal (%rdx,%rax), %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $4, %edx
+; SSE-NEXT: shll $11, %edx
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $8, %edx
+; SSE-NEXT: shll $9, %edx
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $16, %edx
+; SSE-NEXT: shll $7, %edx
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $32, %edx
+; SSE-NEXT: shll $5, %edx
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $64, %edx
+; SSE-NEXT: shll $3, %edx
+; SSE-NEXT: leal (%rdi,%rdi), %esi
+; SSE-NEXT: andl $256, %esi # imm = 0x100
+; SSE-NEXT: orl %edx, %esi
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl %edx
+; SSE-NEXT: andl $128, %edx
+; SSE-NEXT: orl %esi, %edx
+; SSE-NEXT: movl %edi, %esi
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $64, %esi
+; SSE-NEXT: orl %edx, %esi
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $5, %edx
+; SSE-NEXT: andl $32, %edx
+; SSE-NEXT: orl %esi, %edx
+; SSE-NEXT: movl %edi, %esi
+; SSE-NEXT: shrl $7, %esi
+; SSE-NEXT: andl $16, %esi
+; SSE-NEXT: orl %edx, %esi
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $9, %edx
+; SSE-NEXT: andl $8, %edx
+; SSE-NEXT: orl %esi, %edx
+; SSE-NEXT: movl %edi, %esi
+; SSE-NEXT: shrl $11, %esi
+; SSE-NEXT: andl $4, %esi
+; SSE-NEXT: orl %edx, %esi
+; SSE-NEXT: shrl $13, %edi
+; SSE-NEXT: andl $2, %edi
+; SSE-NEXT: orl %esi, %edi
+; SSE-NEXT: shrl $15, %ecx
+; SSE-NEXT: orl %edi, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_i16:
+; AVX: # BB#0:
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: shll $15, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $2, %edx
+; AVX-NEXT: shll $13, %edx
+; AVX-NEXT: leal (%rdx,%rax), %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $4, %edx
+; AVX-NEXT: shll $11, %edx
+; AVX-NEXT: orl %edx, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $8, %edx
+; AVX-NEXT: shll $9, %edx
+; AVX-NEXT: orl %edx, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $16, %edx
+; AVX-NEXT: shll $7, %edx
+; AVX-NEXT: orl %edx, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $32, %edx
+; AVX-NEXT: shll $5, %edx
+; AVX-NEXT: orl %edx, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $64, %edx
+; AVX-NEXT: shll $3, %edx
+; AVX-NEXT: leal (%rdi,%rdi), %esi
+; AVX-NEXT: andl $256, %esi # imm = 0x100
+; AVX-NEXT: orl %edx, %esi
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: andl $128, %edx
+; AVX-NEXT: orl %esi, %edx
+; AVX-NEXT: movl %edi, %esi
+; AVX-NEXT: shrl $3, %esi
+; AVX-NEXT: andl $64, %esi
+; AVX-NEXT: orl %edx, %esi
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $5, %edx
+; AVX-NEXT: andl $32, %edx
+; AVX-NEXT: orl %esi, %edx
+; AVX-NEXT: movl %edi, %esi
+; AVX-NEXT: shrl $7, %esi
+; AVX-NEXT: andl $16, %esi
+; AVX-NEXT: orl %edx, %esi
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $9, %edx
+; AVX-NEXT: andl $8, %edx
+; AVX-NEXT: orl %esi, %edx
+; AVX-NEXT: movl %edi, %esi
+; AVX-NEXT: shrl $11, %esi
+; AVX-NEXT: andl $4, %esi
+; AVX-NEXT: orl %edx, %esi
+; AVX-NEXT: shrl $13, %edi
+; AVX-NEXT: andl $2, %edi
+; AVX-NEXT: orl %esi, %edi
+; AVX-NEXT: shrl $15, %ecx
+; AVX-NEXT: orl %edi, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_i16:
+; XOP: # BB#0:
+; XOP-NEXT: vmovd %edi, %xmm0
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vmovd %xmm0, %eax
+; XOP-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; XOP-NEXT: retq
+ %b = call i16 @llvm.bitreverse.i16(i16 %a)
+ ret i16 %b
+}
+
+define i32 @test_bitreverse_i32(i32 %a) nounwind {
+; SSE-LABEL: test_bitreverse_i32:
+; SSE: # BB#0:
+; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: shll $31, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $2, %ecx
+; SSE-NEXT: shll $29, %ecx
+; SSE-NEXT: leal (%rcx,%rax), %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $4, %ecx
+; SSE-NEXT: shll $27, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $8, %ecx
+; SSE-NEXT: shll $25, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $16, %ecx
+; SSE-NEXT: shll $23, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $32, %ecx
+; SSE-NEXT: shll $21, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $64, %ecx
+; SSE-NEXT: shll $19, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shll $17, %edx
+; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shll $15, %ecx
+; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shll $13, %edx
+; SSE-NEXT: andl $4194304, %edx # imm = 0x400000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shll $11, %ecx
+; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shll $9, %edx
+; SSE-NEXT: andl $1048576, %edx # imm = 0x100000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shll $7, %ecx
+; SSE-NEXT: andl $524288, %ecx # imm = 0x80000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shll $5, %edx
+; SSE-NEXT: andl $262144, %edx # imm = 0x40000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: leal (,%rdi,8), %ecx
+; SSE-NEXT: andl $131072, %ecx # imm = 0x20000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: leal (%rdi,%rdi), %edx
+; SSE-NEXT: andl $65536, %edx # imm = 0x10000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl %ecx
+; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $3, %edx
+; SSE-NEXT: andl $16384, %edx # imm = 0x4000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $5, %ecx
+; SSE-NEXT: andl $8192, %ecx # imm = 0x2000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $7, %edx
+; SSE-NEXT: andl $4096, %edx # imm = 0x1000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $9, %ecx
+; SSE-NEXT: andl $2048, %ecx # imm = 0x800
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $11, %edx
+; SSE-NEXT: andl $1024, %edx # imm = 0x400
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $13, %ecx
+; SSE-NEXT: andl $512, %ecx # imm = 0x200
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $15, %edx
+; SSE-NEXT: andl $256, %edx # imm = 0x100
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $17, %ecx
+; SSE-NEXT: andl $128, %ecx
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $19, %edx
+; SSE-NEXT: andl $64, %edx
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $21, %ecx
+; SSE-NEXT: andl $32, %ecx
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $23, %edx
+; SSE-NEXT: andl $16, %edx
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $25, %ecx
+; SSE-NEXT: andl $8, %ecx
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $27, %edx
+; SSE-NEXT: andl $4, %edx
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $29, %ecx
+; SSE-NEXT: andl $2, %ecx
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: shrl $31, %edi
+; SSE-NEXT: orl %ecx, %edi
+; SSE-NEXT: orl %edi, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_i32:
+; AVX: # BB#0:
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: shll $31, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $2, %ecx
+; AVX-NEXT: shll $29, %ecx
+; AVX-NEXT: leal (%rcx,%rax), %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $4, %ecx
+; AVX-NEXT: shll $27, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $8, %ecx
+; AVX-NEXT: shll $25, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $16, %ecx
+; AVX-NEXT: shll $23, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $32, %ecx
+; AVX-NEXT: shll $21, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $64, %ecx
+; AVX-NEXT: shll $19, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shll $17, %edx
+; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shll $15, %ecx
+; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shll $13, %edx
+; AVX-NEXT: andl $4194304, %edx # imm = 0x400000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shll $11, %ecx
+; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shll $9, %edx
+; AVX-NEXT: andl $1048576, %edx # imm = 0x100000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shll $7, %ecx
+; AVX-NEXT: andl $524288, %ecx # imm = 0x80000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shll $5, %edx
+; AVX-NEXT: andl $262144, %edx # imm = 0x40000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: leal (,%rdi,8), %ecx
+; AVX-NEXT: andl $131072, %ecx # imm = 0x20000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: leal (%rdi,%rdi), %edx
+; AVX-NEXT: andl $65536, %edx # imm = 0x10000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $3, %edx
+; AVX-NEXT: andl $16384, %edx # imm = 0x4000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $5, %ecx
+; AVX-NEXT: andl $8192, %ecx # imm = 0x2000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $7, %edx
+; AVX-NEXT: andl $4096, %edx # imm = 0x1000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $9, %ecx
+; AVX-NEXT: andl $2048, %ecx # imm = 0x800
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $11, %edx
+; AVX-NEXT: andl $1024, %edx # imm = 0x400
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $13, %ecx
+; AVX-NEXT: andl $512, %ecx # imm = 0x200
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $15, %edx
+; AVX-NEXT: andl $256, %edx # imm = 0x100
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $17, %ecx
+; AVX-NEXT: andl $128, %ecx
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $19, %edx
+; AVX-NEXT: andl $64, %edx
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $21, %ecx
+; AVX-NEXT: andl $32, %ecx
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $23, %edx
+; AVX-NEXT: andl $16, %edx
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $25, %ecx
+; AVX-NEXT: andl $8, %ecx
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $27, %edx
+; AVX-NEXT: andl $4, %edx
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $29, %ecx
+; AVX-NEXT: andl $2, %ecx
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: shrl $31, %edi
+; AVX-NEXT: orl %ecx, %edi
+; AVX-NEXT: orl %edi, %eax
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_i32:
+; XOP: # BB#0:
+; XOP-NEXT: vmovd %edi, %xmm0
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vmovd %xmm0, %eax
+; XOP-NEXT: retq
+ %b = call i32 @llvm.bitreverse.i32(i32 %a)
+ ret i32 %b
+}
+
+define i64 @test_bitreverse_i64(i64 %a) nounwind {
+; SSE-LABEL: test_bitreverse_i64:
+; SSE: # BB#0:
+; SSE-NEXT: leaq (%rdi,%rdi), %rax
+; SSE-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
+; SSE-NEXT: andq %rax, %rcx
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: shlq $63, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $2, %rdx
+; SSE-NEXT: shlq $61, %rdx
+; SSE-NEXT: leaq (%rdx,%rax), %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $4, %rdx
+; SSE-NEXT: shlq $59, %rdx
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $8, %rdx
+; SSE-NEXT: shlq $57, %rdx
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $16, %rdx
+; SSE-NEXT: shlq $55, %rdx
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $32, %rdx
+; SSE-NEXT: shlq $53, %rdx
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $64, %rdx
+; SSE-NEXT: shlq $51, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $128, %rsi
+; SSE-NEXT: shlq $49, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $256, %rdx # imm = 0x100
+; SSE-NEXT: shlq $47, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $512, %rsi # imm = 0x200
+; SSE-NEXT: shlq $45, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $1024, %rdx # imm = 0x400
+; SSE-NEXT: shlq $43, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $2048, %rsi # imm = 0x800
+; SSE-NEXT: shlq $41, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $4096, %rdx # imm = 0x1000
+; SSE-NEXT: shlq $39, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $8192, %rsi # imm = 0x2000
+; SSE-NEXT: shlq $37, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $16384, %rdx # imm = 0x4000
+; SSE-NEXT: shlq $35, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $32768, %rsi # imm = 0x8000
+; SSE-NEXT: shlq $33, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $65536, %rdx # imm = 0x10000
+; SSE-NEXT: shlq $31, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $131072, %rsi # imm = 0x20000
+; SSE-NEXT: shlq $29, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $262144, %rdx # imm = 0x40000
+; SSE-NEXT: shlq $27, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $524288, %rsi # imm = 0x80000
+; SSE-NEXT: shlq $25, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $1048576, %rdx # imm = 0x100000
+; SSE-NEXT: shlq $23, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $2097152, %rsi # imm = 0x200000
+; SSE-NEXT: shlq $21, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $4194304, %rdx # imm = 0x400000
+; SSE-NEXT: shlq $19, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $8388608, %rsi # imm = 0x800000
+; SSE-NEXT: shlq $17, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $16777216, %rdx # imm = 0x1000000
+; SSE-NEXT: shlq $15, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $33554432, %rsi # imm = 0x2000000
+; SSE-NEXT: shlq $13, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $67108864, %rdx # imm = 0x4000000
+; SSE-NEXT: shlq $11, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $134217728, %rsi # imm = 0x8000000
+; SSE-NEXT: shlq $9, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $268435456, %rdx # imm = 0x10000000
+; SSE-NEXT: shlq $7, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $536870912, %rsi # imm = 0x20000000
+; SSE-NEXT: shlq $5, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $1073741824, %rdx # imm = 0x40000000
+; SSE-NEXT: shlq $3, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $3, %rdx
+; SSE-NEXT: andl $1073741824, %edx # imm = 0x40000000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $5, %rcx
+; SSE-NEXT: andl $536870912, %ecx # imm = 0x20000000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $7, %rdx
+; SSE-NEXT: andl $268435456, %edx # imm = 0x10000000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $9, %rcx
+; SSE-NEXT: andl $134217728, %ecx # imm = 0x8000000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $11, %rdx
+; SSE-NEXT: andl $67108864, %edx # imm = 0x4000000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $13, %rcx
+; SSE-NEXT: andl $33554432, %ecx # imm = 0x2000000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $15, %rdx
+; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $17, %rcx
+; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $19, %rdx
+; SSE-NEXT: andl $4194304, %edx # imm = 0x400000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $21, %rcx
+; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $23, %rdx
+; SSE-NEXT: andl $1048576, %edx # imm = 0x100000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $25, %rcx
+; SSE-NEXT: andl $524288, %ecx # imm = 0x80000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $27, %rdx
+; SSE-NEXT: andl $262144, %edx # imm = 0x40000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $29, %rcx
+; SSE-NEXT: andl $131072, %ecx # imm = 0x20000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $31, %rdx
+; SSE-NEXT: andl $65536, %edx # imm = 0x10000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $33, %rcx
+; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $35, %rdx
+; SSE-NEXT: andl $16384, %edx # imm = 0x4000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $37, %rcx
+; SSE-NEXT: andl $8192, %ecx # imm = 0x2000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $39, %rdx
+; SSE-NEXT: andl $4096, %edx # imm = 0x1000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $41, %rcx
+; SSE-NEXT: andl $2048, %ecx # imm = 0x800
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $43, %rdx
+; SSE-NEXT: andl $1024, %edx # imm = 0x400
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $45, %rcx
+; SSE-NEXT: andl $512, %ecx # imm = 0x200
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $47, %rdx
+; SSE-NEXT: andl $256, %edx # imm = 0x100
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $49, %rcx
+; SSE-NEXT: andl $128, %ecx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $51, %rdx
+; SSE-NEXT: andl $64, %edx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $53, %rcx
+; SSE-NEXT: andl $32, %ecx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $55, %rdx
+; SSE-NEXT: andl $16, %edx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $57, %rcx
+; SSE-NEXT: andl $8, %ecx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $59, %rdx
+; SSE-NEXT: andl $4, %edx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $61, %rcx
+; SSE-NEXT: andl $2, %ecx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: shrq $63, %rdi
+; SSE-NEXT: orq %rcx, %rdi
+; SSE-NEXT: orq %rdi, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_i64:
+; AVX: # BB#0:
+; AVX-NEXT: leaq (%rdi,%rdi), %rax
+; AVX-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
+; AVX-NEXT: andq %rax, %rcx
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: shlq $63, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $2, %rdx
+; AVX-NEXT: shlq $61, %rdx
+; AVX-NEXT: leaq (%rdx,%rax), %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $4, %rdx
+; AVX-NEXT: shlq $59, %rdx
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $8, %rdx
+; AVX-NEXT: shlq $57, %rdx
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $16, %rdx
+; AVX-NEXT: shlq $55, %rdx
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $32, %rdx
+; AVX-NEXT: shlq $53, %rdx
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $64, %rdx
+; AVX-NEXT: shlq $51, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $128, %rsi
+; AVX-NEXT: shlq $49, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $256, %rdx # imm = 0x100
+; AVX-NEXT: shlq $47, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $512, %rsi # imm = 0x200
+; AVX-NEXT: shlq $45, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $1024, %rdx # imm = 0x400
+; AVX-NEXT: shlq $43, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $2048, %rsi # imm = 0x800
+; AVX-NEXT: shlq $41, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $4096, %rdx # imm = 0x1000
+; AVX-NEXT: shlq $39, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $8192, %rsi # imm = 0x2000
+; AVX-NEXT: shlq $37, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $16384, %rdx # imm = 0x4000
+; AVX-NEXT: shlq $35, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $32768, %rsi # imm = 0x8000
+; AVX-NEXT: shlq $33, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $65536, %rdx # imm = 0x10000
+; AVX-NEXT: shlq $31, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $131072, %rsi # imm = 0x20000
+; AVX-NEXT: shlq $29, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $262144, %rdx # imm = 0x40000
+; AVX-NEXT: shlq $27, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $524288, %rsi # imm = 0x80000
+; AVX-NEXT: shlq $25, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $1048576, %rdx # imm = 0x100000
+; AVX-NEXT: shlq $23, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $2097152, %rsi # imm = 0x200000
+; AVX-NEXT: shlq $21, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $4194304, %rdx # imm = 0x400000
+; AVX-NEXT: shlq $19, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $8388608, %rsi # imm = 0x800000
+; AVX-NEXT: shlq $17, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $16777216, %rdx # imm = 0x1000000
+; AVX-NEXT: shlq $15, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $33554432, %rsi # imm = 0x2000000
+; AVX-NEXT: shlq $13, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $67108864, %rdx # imm = 0x4000000
+; AVX-NEXT: shlq $11, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $134217728, %rsi # imm = 0x8000000
+; AVX-NEXT: shlq $9, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $268435456, %rdx # imm = 0x10000000
+; AVX-NEXT: shlq $7, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $536870912, %rsi # imm = 0x20000000
+; AVX-NEXT: shlq $5, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $1073741824, %rdx # imm = 0x40000000
+; AVX-NEXT: shlq $3, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $3, %rdx
+; AVX-NEXT: andl $1073741824, %edx # imm = 0x40000000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $5, %rcx
+; AVX-NEXT: andl $536870912, %ecx # imm = 0x20000000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $7, %rdx
+; AVX-NEXT: andl $268435456, %edx # imm = 0x10000000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $9, %rcx
+; AVX-NEXT: andl $134217728, %ecx # imm = 0x8000000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $11, %rdx
+; AVX-NEXT: andl $67108864, %edx # imm = 0x4000000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $13, %rcx
+; AVX-NEXT: andl $33554432, %ecx # imm = 0x2000000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $15, %rdx
+; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $17, %rcx
+; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $19, %rdx
+; AVX-NEXT: andl $4194304, %edx # imm = 0x400000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $21, %rcx
+; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $23, %rdx
+; AVX-NEXT: andl $1048576, %edx # imm = 0x100000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $25, %rcx
+; AVX-NEXT: andl $524288, %ecx # imm = 0x80000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $27, %rdx
+; AVX-NEXT: andl $262144, %edx # imm = 0x40000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $29, %rcx
+; AVX-NEXT: andl $131072, %ecx # imm = 0x20000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $31, %rdx
+; AVX-NEXT: andl $65536, %edx # imm = 0x10000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $33, %rcx
+; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $35, %rdx
+; AVX-NEXT: andl $16384, %edx # imm = 0x4000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $37, %rcx
+; AVX-NEXT: andl $8192, %ecx # imm = 0x2000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $39, %rdx
+; AVX-NEXT: andl $4096, %edx # imm = 0x1000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $41, %rcx
+; AVX-NEXT: andl $2048, %ecx # imm = 0x800
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $43, %rdx
+; AVX-NEXT: andl $1024, %edx # imm = 0x400
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $45, %rcx
+; AVX-NEXT: andl $512, %ecx # imm = 0x200
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $47, %rdx
+; AVX-NEXT: andl $256, %edx # imm = 0x100
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $49, %rcx
+; AVX-NEXT: andl $128, %ecx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $51, %rdx
+; AVX-NEXT: andl $64, %edx
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $53, %rcx
+; AVX-NEXT: andl $32, %ecx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $55, %rdx
+; AVX-NEXT: andl $16, %edx
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $57, %rcx
+; AVX-NEXT: andl $8, %ecx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $59, %rdx
+; AVX-NEXT: andl $4, %edx
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $61, %rcx
+; AVX-NEXT: andl $2, %ecx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: shrq $63, %rdi
+; AVX-NEXT: orq %rcx, %rdi
+; AVX-NEXT: orq %rdi, %rax
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_i64:
+; XOP: # BB#0:
+; XOP-NEXT: vmovq %rdi, %xmm0
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vmovq %xmm0, %rax
+; XOP-NEXT: retq
+ %b = call i64 @llvm.bitreverse.i64(i64 %a)
+ ret i64 %b
+}
+
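+; Per-element bit reversal of v16i8: SSE2 expands to per-bit shift/mask/or
+; sequences, SSSE3/AVX split each byte into nibbles and reverse them with
+; PSHUFB table lookups, and XOP lowers to a single VPPERM using its
+; bit-reversed byte operation.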
+define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm1, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $7, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm3, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $3, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: paddb %xmm3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrlw $3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: psrlw $5, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v16i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_v16i8:
+; XOP: # BB#0:
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: retq
+ %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+ ret <16 x i8> %b
+}
+
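+; For wider element types the bytes of each element are reversed first
+; (PSHUFLW/PSHUFHW/PACKUSWB on SSE2, a PSHUFB byte shuffle on SSSE3/AVX),
+; then the per-byte bit reversal above is applied.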
+define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm2, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v8i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_v8i16:
+; XOP: # BB#0:
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: retq
+ %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+ ret <8 x i16> %b
+}
+
+define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm2, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v4i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_v4i32:
+; XOP: # BB#0:
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: retq
+ %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+ ret <4 x i32> %b
+}
+
+define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm2, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v2i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_v2i64:
+; XOP: # BB#0:
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: retq
+ %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+ ret <2 x i64> %b
+}
+
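+; 256-bit vectors: SSE targets repeat the 128-bit sequence per register,
+; AVX1 splits the YMM value with VEXTRACTF128/VINSERTF128, AVX2/AVX512 use
+; 256-bit PSHUFB lookups, and the XOP variants run VPPERM on each 128-bit half.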
+define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v32i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $7, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm10, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm3, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $7, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $3, %xmm7
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v32i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm5
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v32i8:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v32i8:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+ ret <32 x i8> %b
+}
+
+define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v16i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $7, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $7, %xmm3
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $3, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: pand %xmm13, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v16i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v16i16:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v16i16:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+ ret <16 x i16> %b
+}
+
+define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v8i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $7, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $7, %xmm3
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $3, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: pand %xmm13, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v8i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v8i32:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v8i32:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+ ret <8 x i32> %b
+}
+
+define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm3, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $3, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: pand %xmm13, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v4i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v4i64:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v4i64:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+ ret <4 x i64> %b
+}
+
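+; 512-bit vectors: SSE targets repeat the 128-bit sequence for all four XMM
+; inputs; AVX targets operate on the two YMM halves, with AVX1 further
+; splitting each half into XMM registers.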
+define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v64i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psllw $7, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm10, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrlw $3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $5, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
+; SSE2-NEXT: pand %xmm14, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm6, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $7, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $3, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psllw $7, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $3, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: psllw $7, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psllw $3, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v64i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm5
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm5, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm1, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: movdqa %xmm9, %xmm7
+; SSSE3-NEXT: pshufb %xmm1, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm2
+; SSSE3-NEXT: pand %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
+; SSSE3-NEXT: por %xmm7, %xmm6
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm9
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand %xmm8, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_bitreverse_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_bitreverse_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v64i8:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v64i8:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: retq
+ %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+ ret <64 x i8> %b
+}
+
+define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v32i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
+; SSE2-NEXT: pand %xmm14, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
+; SSE2-NEXT: pand %xmm15, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm7, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v32i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; SSSE3-NEXT: pshufb %xmm8, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm9, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pshufb %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: pshufb %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pshufb %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_bitreverse_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2
+; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_bitreverse_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v32i16:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v32i16:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: retq
+ %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+ ret <32 x i16> %b
+}
+
+define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v16i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
+; SSE2-NEXT: pand %xmm14, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
+; SSE2-NEXT: pand %xmm15, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm7, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v16i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: pshufb %xmm8, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm9, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pshufb %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: pshufb %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pshufb %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_bitreverse_v16i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpslld $29, %zmm0, %zmm1
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm1
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512F-NEXT: vpslld $27, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $25, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $23, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $21, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $19, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $17, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $15, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $13, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $11, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $9, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $7, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $5, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $1, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $5, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $7, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $9, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $11, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $13, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $15, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $17, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $19, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $21, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $23, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $27, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $29, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_bitreverse_v16i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v16i32:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v16i32:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: retq
+ %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+ ret <16 x i32> %b
+}
+
+define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v8i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
+; SSE2-NEXT: pand %xmm14, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
+; SSE2-NEXT: pand %xmm15, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm7, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v8i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; SSSE3-NEXT: pshufb %xmm8, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm9, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pshufb %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: pshufb %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pshufb %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_bitreverse_v8i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsllq $61, %zmm0, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq $59, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $57, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $55, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $53, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $51, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $49, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $47, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $45, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $43, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $41, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $39, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $37, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $35, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $33, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $31, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $29, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $27, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $25, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $23, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $21, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $19, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $17, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $15, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $13, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $11, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $9, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $7, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $5, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $5, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $7, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $9, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $11, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $13, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $15, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $17, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $19, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $21, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $23, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $25, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $27, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $29, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $31, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $33, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $35, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $37, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $39, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $41, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $43, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $45, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $47, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $49, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $51, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $53, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $55, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $57, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $59, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $61, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_bitreverse_v8i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v8i64:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v8i64:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: retq
+ %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+ ret <8 x i64> %b
+}
+
+declare i8 @llvm.bitreverse.i8(i8) readnone
+declare i16 @llvm.bitreverse.i16(i16) readnone
+declare i32 @llvm.bitreverse.i32(i32) readnone
+declare i64 @llvm.bitreverse.i64(i64) readnone
+
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
+declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
+declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
+declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
+
+declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
+declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
+declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
+declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone
+
+declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
+declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
+declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
+declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index aaf81f2f9bb6..309fa98145c6 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
@@ -272,15 +273,15 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
; SSE41-LABEL: vsel_i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_i8:
; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
entry:
%vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
@@ -632,8 +633,8 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; SSE2-NEXT: andps %xmm4, %xmm3
; SSE2-NEXT: andnps %xmm1, %xmm4
; SSE2-NEXT: orps %xmm3, %xmm4
-; SSE2-NEXT: movaps %xmm5, %xmm0
-; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: constant_pblendvb_avx2:
@@ -651,20 +652,19 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; SSE41-LABEL: constant_pblendvb_avx2:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; SSE41-NEXT: pblendvb %xmm2, %xmm4
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: pblendvb %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_pblendvb_avx2:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa .LCPI18_0(%rip), %xmm4 # xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_pblendvb_avx2:
@@ -801,3 +801,254 @@ entry:
%select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
ret <4 x i64> %select
}
+
+define <4 x i32> @blend_logic_v4i32(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) {
+; SSE2-LABEL: blend_logic_v4i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_logic_v4i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_logic_v4i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_logic_v4i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: retq
+entry:
+ %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
+ %sub = sub nsw <4 x i32> zeroinitializer, %a
+ %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = and <4 x i32> %c, %0
+ %2 = and <4 x i32> %a, %b.lobit
+ %cond = or <4 x i32> %1, %2
+ ret <4 x i32> %cond
+}
+
+define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
+; SSE2-LABEL: blend_logic_v8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_logic_v8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm3
+; SSSE3-NEXT: pandn %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_logic_v8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: blend_logic_v8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm2
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_logic_v8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %sub = sub nsw <8 x i32> zeroinitializer, %a
+ %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = and <8 x i32> %c, %0
+ %2 = and <8 x i32> %a, %b.lobit
+ %cond = or <8 x i32> %1, %2
+ ret <8 x i32> %cond
+}
+
+define <4 x i32> @blend_neg_logic_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: blend_neg_logic_v4i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_neg_logic_v4i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_neg_logic_v4i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_neg_logic_v4i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
+ %sub = sub nsw <4 x i32> zeroinitializer, %a
+ %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = and <4 x i32> %a, %0
+ %2 = and <4 x i32> %b.lobit, %sub
+ %cond = or <4 x i32> %1, %2
+ ret <4 x i32> %cond
+}
+
+define <8 x i32> @blend_neg_logic_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: blend_neg_logic_v8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: psubd %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_neg_logic_v8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: psubd %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_neg_logic_v8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm1
+; SSE41-NEXT: psubd %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: blend_neg_logic_v8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_neg_logic_v8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %sub = sub nsw <8 x i32> zeroinitializer, %a
+ %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = and <8 x i32> %a, %0
+ %2 = and <8 x i32> %b.lobit, %sub
+ %cond = or <8 x i32> %1, %2
+ ret <8 x i32> %cond
+}
+
+define <4 x i32> @blend_neg_logic_v4i32_2(<4 x i32> %v, <4 x i32> %c) {
+; SSE2-LABEL: blend_neg_logic_v4i32_2:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrld $31, %xmm1
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_neg_logic_v4i32_2:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrld $31, %xmm1
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: psubd %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_neg_logic_v4i32_2:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld $31, %xmm1
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: psubd %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm2, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_neg_logic_v4i32_2:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrld $31, %xmm1, %xmm1
+; AVX-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm2
+; AVX-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = ashr <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
+ %1 = trunc <4 x i32> %0 to <4 x i1>
+ %2 = sub nsw <4 x i32> zeroinitializer, %v
+ %3 = select <4 x i1> %1, <4 x i32> %v, <4 x i32> %2
+ ret <4 x i32> %3
+}
diff --git a/test/CodeGen/X86/vector-compare-combines.ll b/test/CodeGen/X86/vector-compare-combines.ll
new file mode 100644
index 000000000000..c25474d92f9c
--- /dev/null
+++ b/test/CodeGen/X86/vector-compare-combines.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+
+; If we have SSE/AVX intrinsics in the code, we miss obvious combines
+; unless we do them late on X86-specific nodes.
+
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @PR27924_cmpeq(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: PR27924_cmpeq:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR27924_cmpeq:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp sgt <4 x i32> %a, %b
+ %max = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
+ %sse_max = tail call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a, <4 x i32> %b)
+ %truth = icmp eq <4 x i32> %max, %sse_max
+ %ret = sext <4 x i1> %truth to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @PR27924_cmpgt(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: PR27924_cmpgt:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR27924_cmpgt:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp sgt <4 x i32> %a, %b
+ %max = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
+ %sse_max = tail call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a, <4 x i32> %b)
+ %untruth = icmp sgt <4 x i32> %max, %sse_max
+ %ret = sext <4 x i1> %untruth to <4 x i32>
+ ret <4 x i32> %ret
+}
+
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
new file mode 100644
index 000000000000..595d3a42b76f
--- /dev/null
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -0,0 +1,6625 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+
+;
+; 128-bit vector comparisons
+;
+
+define <2 x i1> @test_cmp_v2f64(<2 x double> %a0, <2 x double> %a1) nounwind {
+; SSE-LABEL: test_cmp_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fcmp ogt <2 x double> %a0, %a1
+ ret <2 x i1> %1
+}
+
+define <4 x i1> @test_cmp_v4f32(<4 x float> %a0, <4 x float> %a1) nounwind {
+; SSE-LABEL: test_cmp_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fcmp ogt <4 x float> %a0, %a1
+ ret <4 x i1> %1
+}
+
+define <2 x i1> @test_cmp_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v2i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp sgt <2 x i64> %a0, %a1
+ ret <2 x i1> %1
+}
+
+define <4 x i1> @test_cmp_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
+; SSE-LABEL: test_cmp_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp sgt <4 x i32> %a0, %a1
+ ret <4 x i1> %1
+}
+
+define <8 x i1> @test_cmp_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
+; SSE-LABEL: test_cmp_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp sgt <8 x i16> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
+; SSE-LABEL: test_cmp_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp sgt <16 x i8> %a0, %a1
+ ret <16 x i1> %1
+}
+
+;
+; 256-bit vector comparisons
+;
+
+define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v4f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: cmpltpd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: cmpltpd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v4f64:
+; SSE42: # BB#0:
+; SSE42-NEXT: cmpltpd %xmm1, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,2]
+; SSE42-NEXT: cmpltpd %xmm0, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v4f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <4 x double> %a0, %a1
+ ret <4 x i1> %1
+}
+
+define <8 x i1> @test_cmp_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v8f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: cmpltps %xmm1, %xmm3
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: cmpltps %xmm0, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v8f32:
+; SSE42: # BB#0:
+; SSE42-NEXT: cmpltps %xmm1, %xmm3
+; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT: pshufb %xmm1, %xmm3
+; SSE42-NEXT: cmpltps %xmm0, %xmm2
+; SSE42-NEXT: pshufb %xmm1, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <8 x float> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <4 x i1> @test_cmp_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v4i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq %xmm3, %xmm1
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = icmp sgt <4 x i64> %a0, %a1
+ ret <4 x i1> %1
+}
+
+define <8 x i1> @test_cmp_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v8i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v8i32:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT: pshufb %xmm3, %xmm1
+; SSE42-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE42-NEXT: pshufb %xmm3, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = icmp sgt <8 x i32> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v16i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v16i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm3, %xmm1
+; SSE42-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE42-NEXT: pshufb %xmm3, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <16 x i16> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v32i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v32i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE42-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE42-NEXT: pextrb $15, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <32 x i8> %a0, %a1
+ ret <32 x i1> %1
+}
+
+;
+; 512-bit vector comparisons
+;
+
+define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v8f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: cmpltpd %xmm3, %xmm7
+; SSE2-NEXT: cmpltpd %xmm1, %xmm5
+; SSE2-NEXT: pextrw $4, %xmm5, %eax
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE2-NEXT: cmpltpd %xmm2, %xmm6
+; SSE2-NEXT: cmpltpd %xmm0, %xmm4
+; SSE2-NEXT: pextrw $4, %xmm4, %ecx
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: pextrw $4, %xmm7, %edx
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pextrw $4, %xmm6, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v8f64:
+; SSE42: # BB#0:
+; SSE42-NEXT: cmpltpd %xmm3, %xmm7
+; SSE42-NEXT: xorpd %xmm3, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm3[1,2,3],xmm7[4],xmm3[5,6,7]
+; SSE42-NEXT: cmpltpd %xmm2, %xmm6
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1,2,3],xmm6[4],xmm3[5,6,7]
+; SSE42-NEXT: packusdw %xmm7, %xmm6
+; SSE42-NEXT: cmpltpd %xmm1, %xmm5
+; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0],xmm3[1,2,3],xmm5[4],xmm3[5,6,7]
+; SSE42-NEXT: cmpltpd %xmm0, %xmm4
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; SSE42-NEXT: packusdw %xmm5, %xmm3
+; SSE42-NEXT: packusdw %xmm6, %xmm3
+; SSE42-NEXT: movdqa %xmm3, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <8 x double> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
+; SSE-LABEL: test_cmp_v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltps %xmm3, %xmm7
+; SSE-NEXT: movaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: andps %xmm3, %xmm7
+; SSE-NEXT: cmpltps %xmm2, %xmm6
+; SSE-NEXT: andps %xmm3, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: cmpltps %xmm1, %xmm5
+; SSE-NEXT: andps %xmm3, %xmm5
+; SSE-NEXT: cmpltps %xmm0, %xmm4
+; SSE-NEXT: andps %xmm4, %xmm3
+; SSE-NEXT: packuswb %xmm5, %xmm3
+; SSE-NEXT: packuswb %xmm6, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vmovaps {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <16 x float> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v8i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pextrw $4, %xmm1, %eax
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pextrw $4, %xmm9, %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pextrw $4, %xmm0, %ecx
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: pextrw $4, %xmm3, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v8i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq %xmm7, %xmm3
+; SSE42-NEXT: pxor %xmm7, %xmm7
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1,2,3],xmm3[4],xmm7[5,6,7]
+; SSE42-NEXT: pcmpgtq %xmm6, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2,3],xmm2[4],xmm7[5,6,7]
+; SSE42-NEXT: packusdw %xmm3, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm5, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7]
+; SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7]
+; SSE42-NEXT: packusdw %xmm1, %xmm0
+; SSE42-NEXT: packusdw %xmm2, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <8 x i64> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: test_cmp_v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <16 x i32> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v32i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtw %xmm5, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v32i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtw %xmm5, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm5, %xmm1
+; SSE42-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE42-NEXT: pshufb %xmm5, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE42-NEXT: pshufb %xmm5, %xmm3
+; SSE42-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE42-NEXT: pshufb %xmm5, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT: pextrb $15, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v32i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <32 x i16> %a0, %a1
+ ret <32 x i1> %1
+}
+
+define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v64i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v64i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtb %xmm4, %xmm0
+; SSE42-NEXT: pcmpgtb %xmm5, %xmm1
+; SSE42-NEXT: pcmpgtb %xmm6, %xmm2
+; SSE42-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE42-NEXT: pextrb $15, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v64i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm3
+; AVX512-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm1
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vpcmpgtb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm1
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm2
+; AVX512-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpgtb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX512-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT: retq
+ %1 = icmp sgt <64 x i8> %a0, %a1
+ ret <64 x i1> %1
+}
+
+;
+; 1024-bit vector comparisons
+;
+
+define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind {
+; SSE-LABEL: test_cmp_v16f64:
+; SSE: # BB#0:
+; SSE-NEXT: movapd %xmm0, %xmm8
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15
+; SSE-NEXT: cmpltpd %xmm7, %xmm15
+; SSE-NEXT: movapd {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: andpd %xmm7, %xmm15
+; SSE-NEXT: cmpltpd %xmm6, %xmm14
+; SSE-NEXT: andpd %xmm7, %xmm14
+; SSE-NEXT: packuswb %xmm15, %xmm14
+; SSE-NEXT: cmpltpd %xmm5, %xmm13
+; SSE-NEXT: andpd %xmm7, %xmm13
+; SSE-NEXT: cmpltpd %xmm4, %xmm9
+; SSE-NEXT: andpd %xmm7, %xmm9
+; SSE-NEXT: packuswb %xmm13, %xmm9
+; SSE-NEXT: packuswb %xmm14, %xmm9
+; SSE-NEXT: cmpltpd %xmm3, %xmm12
+; SSE-NEXT: andpd %xmm7, %xmm12
+; SSE-NEXT: cmpltpd %xmm2, %xmm10
+; SSE-NEXT: andpd %xmm7, %xmm10
+; SSE-NEXT: packuswb %xmm12, %xmm10
+; SSE-NEXT: cmpltpd %xmm1, %xmm11
+; SSE-NEXT: andpd %xmm7, %xmm11
+; SSE-NEXT: cmpltpd %xmm8, %xmm0
+; SSE-NEXT: andpd %xmm7, %xmm0
+; SSE-NEXT: packuswb %xmm11, %xmm0
+; SSE-NEXT: packuswb %xmm10, %xmm0
+; SSE-NEXT: packuswb %xmm9, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm3, %ymm7, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandpd %xmm7, %xmm8, %xmm8
+; AVX1-NEXT: vandpd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vcmpltpd %ymm2, %ymm6, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vandpd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vandpd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcmpltpd %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandpd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vandpd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandpd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vandpd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltpd %ymm2, %ymm6, %ymm2
+; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vcmpltpd %ymm3, %ymm7, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vcmpltpd %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm5
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movq $-1, %rcx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT: vucomisd %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512-NEXT: vucomisd %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm6
+; AVX512-NEXT: vucomisd %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512-NEXT: vucomisd %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512-NEXT: vucomisd %xmm2, %xmm0
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vucomisd %xmm2, %xmm0
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT: vucomisd %xmm2, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT: vucomisd %xmm2, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm4
+; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT: vucomisd %xmm3, %xmm1
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vucomisd %xmm3, %xmm1
+; AVX512-NEXT: cmovaq %rcx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <16 x double> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v32f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: cmpltps %xmm3, %xmm15
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm2, %xmm14
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: cmpltps %xmm1, %xmm13
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm0, %xmm12
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: psraw $15, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: cmpltps %xmm7, %xmm11
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm6, %xmm10
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: cmpltps %xmm5, %xmm9
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm4, %xmm8
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: packuswb %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v32f32:
+; SSE42: # BB#0:
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE42-NEXT: cmpltps %xmm3, %xmm15
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT: pshufb %xmm3, %xmm15
+; SSE42-NEXT: cmpltps %xmm2, %xmm13
+; SSE42-NEXT: pshufb %xmm3, %xmm13
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0]
+; SSE42-NEXT: psllw $15, %xmm13
+; SSE42-NEXT: psraw $15, %xmm13
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm2, %xmm13
+; SSE42-NEXT: cmpltps %xmm1, %xmm14
+; SSE42-NEXT: pshufb %xmm3, %xmm14
+; SSE42-NEXT: cmpltps %xmm0, %xmm8
+; SSE42-NEXT: pshufb %xmm3, %xmm8
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm14[0]
+; SSE42-NEXT: psllw $15, %xmm8
+; SSE42-NEXT: psraw $15, %xmm8
+; SSE42-NEXT: pshufb %xmm2, %xmm8
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm13[0]
+; SSE42-NEXT: cmpltps %xmm7, %xmm12
+; SSE42-NEXT: pshufb %xmm3, %xmm12
+; SSE42-NEXT: cmpltps %xmm6, %xmm10
+; SSE42-NEXT: pshufb %xmm3, %xmm10
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm12[0]
+; SSE42-NEXT: psllw $15, %xmm10
+; SSE42-NEXT: psraw $15, %xmm10
+; SSE42-NEXT: pshufb %xmm2, %xmm10
+; SSE42-NEXT: cmpltps %xmm5, %xmm11
+; SSE42-NEXT: pshufb %xmm3, %xmm11
+; SSE42-NEXT: cmpltps %xmm4, %xmm9
+; SSE42-NEXT: pshufb %xmm3, %xmm9
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
+; SSE42-NEXT: psllw $15, %xmm9
+; SSE42-NEXT: psraw $15, %xmm9
+; SSE42-NEXT: pshufb %xmm2, %xmm9
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
+; SSE42-NEXT: pextrb $15, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v32f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm3, %ymm7, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vmovaps {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm7, %xmm8, %xmm8
+; AVX1-NEXT: vandps %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vcmpltps %ymm2, %ymm6, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vandps %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vandps %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcmpltps %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v32f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltps %ymm3, %ymm7, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vcmpltps %ymm2, %ymm6, %ymm2
+; AVX2-NEXT: vpshufb %ymm8, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vcmpltps %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-NEXT: vcmpltps %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v32f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm6
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
+; AVX512-NEXT: movl $-1, %ecx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512-NEXT: vucomiss %xmm7, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
+; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm7
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm6, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
+; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm7
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm6, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm2, %xmm0
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm7
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm2, %xmm0
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm8
+; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm5
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm2, %xmm5
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm7
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm2, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
+; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm6
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512-NEXT: vucomiss %xmm7, %xmm0
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm0, %xmm5
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm7
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm0, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm3, %xmm1
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512-NEXT: vucomiss %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm3, %xmm1
+; AVX512-NEXT: cmoval %ecx, %eax
+; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <32 x float> %a0, %a1
+ ret <32 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v16i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm10, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm11
+; SSE2-NEXT: pand %xmm10, %xmm11
+; SSE2-NEXT: packuswb %xmm9, %xmm11
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm10, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm10, %xmm4
+; SSE2-NEXT: packuswb %xmm6, %xmm4
+; SSE2-NEXT: packuswb %xmm11, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm10, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pxor {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v16i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE42-NEXT: pand %xmm8, %xmm7
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: pand %xmm8, %xmm6
+; SSE42-NEXT: packuswb %xmm7, %xmm6
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pand %xmm8, %xmm5
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: pand %xmm8, %xmm4
+; SSE42-NEXT: packuswb %xmm5, %xmm4
+; SSE42-NEXT: packuswb %xmm6, %xmm4
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pand %xmm8, %xmm3
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: pand %xmm8, %xmm2
+; SSE42-NEXT: packuswb %xmm3, %xmm2
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pand %xmm8, %xmm1
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: pand %xmm8, %xmm0
+; SSE42-NEXT: packuswb %xmm1, %xmm0
+; SSE42-NEXT: packuswb %xmm2, %xmm0
+; SSE42-NEXT: packuswb %xmm4, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpcmpgtq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpcmpgtq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: cmpq %rcx, %rdx
+; AVX512-NEXT: movq $-1, %rcx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm4, %rdx
+; AVX512-NEXT: vmovq %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vmovq %xmm5, %rdx
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vmovq %xmm5, %rdx
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: vmovq %xmm0, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: vmovq %xmm4, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm4, %rdx
+; AVX512-NEXT: vmovq %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm4, %rdx
+; AVX512-NEXT: vmovq %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vmovq %xmm3, %rdx
+; AVX512-NEXT: vmovq %xmm1, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: cmovgq %rcx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <16 x i64> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v32i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: psraw $15, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: packuswb %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v32i32:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT: pshufb %xmm8, %xmm3
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: pshufb %xmm8, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT: psllw $15, %xmm2
+; SSE42-NEXT: psraw $15, %xmm2
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm3, %xmm2
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pshufb %xmm8, %xmm1
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: pshufb %xmm8, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: psllw $15, %xmm0
+; SSE42-NEXT: psraw $15, %xmm0
+; SSE42-NEXT: pshufb %xmm3, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pshufb %xmm8, %xmm7
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: pshufb %xmm8, %xmm6
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE42-NEXT: psllw $15, %xmm6
+; SSE42-NEXT: psraw $15, %xmm6
+; SSE42-NEXT: pshufb %xmm3, %xmm6
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pshufb %xmm8, %xmm5
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: pshufb %xmm8, %xmm4
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE42-NEXT: psllw $15, %xmm4
+; SSE42-NEXT: psraw $15, %xmm4
+; SSE42-NEXT: pshufb %xmm3, %xmm4
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE42-NEXT: pextrb $15, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v32i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v32i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm8, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vpcmpgtd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v32i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm4, %ecx
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: cmpl %ecx, %edx
+; AVX512-NEXT: movl $-1, %ecx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm4, %esi
+; AVX512-NEXT: vmovd %xmm5, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm6
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm5, %esi
+; AVX512-NEXT: vmovd %xmm6, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm7
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm5, %esi
+; AVX512-NEXT: vmovd %xmm6, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm7
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512-NEXT: vpextrd $1, %xmm0, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm2, %esi
+; AVX512-NEXT: vmovd %xmm0, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm6
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512-NEXT: vpextrd $2, %xmm0, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512-NEXT: vpextrd $3, %xmm0, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm4, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm2, %esi
+; AVX512-NEXT: vmovd %xmm4, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512-NEXT: vpextrd $2, %xmm4, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512-NEXT: vpextrd $3, %xmm4, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
+; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm4, %esi
+; AVX512-NEXT: vmovd %xmm5, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm6
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm4, %esi
+; AVX512-NEXT: vmovd %xmm5, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm6
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm3, %edx
+; AVX512-NEXT: vpextrd $1, %xmm1, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm3, %esi
+; AVX512-NEXT: vmovd %xmm1, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpextrd $2, %xmm3, %edx
+; AVX512-NEXT: vpextrd $2, %xmm1, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpextrd $3, %xmm3, %edx
+; AVX512-NEXT: vpextrd $3, %xmm1, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: cmovgl %ecx, %eax
+; AVX512-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <32 x i32> %a0, %a1
+ ret <32 x i1> %1
+}
+
+define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v64i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: packuswb %xmm7, %xmm6
+; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v64i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm8 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm8, %xmm1
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: pshufb %xmm8, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufb %xmm8, %xmm3
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: pshufb %xmm8, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pshufb %xmm8, %xmm5
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: pshufb %xmm8, %xmm4
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pshufb %xmm8, %xmm7
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: pshufb %xmm8, %xmm6
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE42-NEXT: pextrb $15, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v64i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vpextrb $15, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v64i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm8
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0]
+; AVX2-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX2-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX2-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; AVX2-NEXT: vpextrb $15, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v64i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
+; AVX512-NEXT: vpmovsxwd %ymm3, %zmm3
+; AVX512-NEXT: vpslld $31, %zmm3, %zmm3
+; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm3
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $0, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
+; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm2
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $0, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vpsllw $7, %ymm2, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpand %ymm2, %ymm3, %ymm3
+; AVX512-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX512-NEXT: vpcmpgtb %ymm3, %ymm6, %ymm3
+; AVX512-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $0, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $0, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm1
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vpcmpgtb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm1
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm2
+; AVX512-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpgtb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX512-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT: retq
+ %1 = icmp sgt <64 x i16> %a0, %a1
+ ret <64 x i1> %1
+}
+
+define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v128i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v128i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pextrb $15, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v128i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm5, %xmm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm7, %xmm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpextrb $15, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v128i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpextrb $15, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpextrb $15, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v128i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
+; AVX512-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-NEXT: vpmovsxbd %xmm4, %zmm4
+; AVX512-NEXT: vpslld $31, %zmm4, %zmm4
+; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k0
+; AVX512-NEXT: kmovw %k0, 14(%rdi)
+; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT: vpslld $31, %zmm3, %zmm3
+; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT: kmovw %k0, 12(%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT: vpslld $31, %zmm3, %zmm3
+; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT: kmovw %k0, 10(%rdi)
+; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512-NEXT: kmovw %k0, 8(%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512-NEXT: kmovw %k0, 6(%rdi)
+; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT: kmovw %k0, 4(%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT: kmovw %k0, 2(%rdi)
+; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kmovw %k0, (%rdi)
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: retq
+ %1 = icmp sgt <128 x i8> %a0, %a1
+ ret <128 x i1> %1
+}
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index 47878360ca0a..4c5c348302b7 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -114,3 +114,12 @@ entry:
ret <4 x i16*> %A
;CHECK: ret
}
+
+;CHECK-LABEL: AGEP9:
+define <64 x i16*> @AGEP9(i16* %param, <64 x i32> %off) nounwind {
+entry:
+;CHECK: vbroadcastss
+ %A = getelementptr i16, i16* %param, <64 x i32> %off
+ ret <64 x i16*> %A
+;CHECK: ret
+}
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
new file mode 100644
index 000000000000..b091d1bca2ef
--- /dev/null
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -0,0 +1,3922 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
+
+;
+; Half to Float
+;
+
+define float @cvt_i16_to_f32(i16 %a0) {
+; ALL-LABEL: cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
+ %1 = bitcast i16 %a0 to half
+ %2 = fpext half %1 to float
+ ret float %2
+}
+
+define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) {
+; ALL-LABEL: cvt_4i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movq %rax, %rdx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrq $48, %rdx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %esi, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
+ %1 = bitcast <4 x i16> %a0 to <4 x half>
+ %2 = fpext <4 x half> %1 to <4 x float>
+ ret <4 x float> %2
+}
+
+define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) {
+; ALL-LABEL: cvt_8i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movq %rax, %rdx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrq $48, %rdx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %esi, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <4 x i16> %1 to <4 x half>
+ %3 = fpext <4 x half> %2 to <4 x float>
+ ret <4 x float> %3
+}
+
+define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) {
+; AVX1-LABEL: cvt_8i16_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: movswl %dx, %r9d
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; AVX1-NEXT: shrl $16, %edx
+; AVX1-NEXT: shrq $32, %r8
+; AVX1-NEXT: shrq $48, %r10
+; AVX1-NEXT: vmovq %xmm0, %rdi
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: movq %rdi, %rsi
+; AVX1-NEXT: movswl %di, %ecx
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; AVX1-NEXT: shrl $16, %edi
+; AVX1-NEXT: shrq $32, %rax
+; AVX1-NEXT: shrq $48, %rsi
+; AVX1-NEXT: movswl %si, %esi
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: movswl %di, %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: movswl %r10w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: movswl %r8w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl %dx, %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: vmovd %r9d, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8i16_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movswl %dx, %r9d
+; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; AVX2-NEXT: shrl $16, %edx
+; AVX2-NEXT: shrq $32, %r8
+; AVX2-NEXT: shrq $48, %r10
+; AVX2-NEXT: vmovq %xmm0, %rdi
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: movswl %di, %ecx
+; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; AVX2-NEXT: shrl $16, %edi
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: shrq $48, %rsi
+; AVX2-NEXT: movswl %si, %esi
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: movswl %di, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vmovd %ecx, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: movswl %r10w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: movswl %r8w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl %dx, %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: vmovd %r9d, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_8i16_to_8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r8
+; AVX512-NEXT: movq %rdx, %r10
+; AVX512-NEXT: movswl %dx, %r9d
+; AVX512-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; AVX512-NEXT: shrl $16, %edx
+; AVX512-NEXT: shrq $32, %r8
+; AVX512-NEXT: shrq $48, %r10
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: movq %rdi, %rsi
+; AVX512-NEXT: movswl %di, %ecx
+; AVX512-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; AVX512-NEXT: shrl $16, %edi
+; AVX512-NEXT: shrq $32, %rax
+; AVX512-NEXT: shrq $48, %rsi
+; AVX512-NEXT: movswl %si, %esi
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vmovd %ecx, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl %r8w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vmovd %r9d, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = bitcast <8 x i16> %a0 to <8 x half>
+ %2 = fpext <8 x half> %1 to <8 x float>
+ ret <8 x float> %2
+}
+
+define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) {
+; AVX1-LABEL: cvt_16i16_to_16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovq %xmm4, %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm8
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm9
+; AVX1-NEXT: movswl %ax, %ecx
+; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm10
+; AVX1-NEXT: vpextrq $1, %xmm4, %rax
+; AVX1-NEXT: vmovd %ecx, %xmm11
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm12
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm13
+; AVX1-NEXT: movswl %ax, %ecx
+; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm14
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovd %ecx, %xmm15
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: movswl %ax, %ecx
+; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm5
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm6
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm7
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_16i16_to_16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm8
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm9
+; AVX2-NEXT: movswl %ax, %ecx
+; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm10
+; AVX2-NEXT: vpextrq $1, %xmm4, %rax
+; AVX2-NEXT: vmovd %ecx, %xmm11
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm12
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm13
+; AVX2-NEXT: movswl %ax, %ecx
+; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm14
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovd %ecx, %xmm15
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm2
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm3
+; AVX2-NEXT: movswl %ax, %ecx
+; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm5
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm6
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm7
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_16i16_to_16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
+; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm8
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm9
+; AVX512-NEXT: movswl %ax, %ecx
+; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm11
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vmovd %ecx, %xmm12
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm13
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm14
+; AVX512-NEXT: movswl %ax, %ecx
+; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm15
+; AVX512-NEXT: vmovq %xmm10, %rax
+; AVX512-NEXT: vmovd %ecx, %xmm2
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm3
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: movswl %ax, %ecx
+; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm10, %rax
+; AVX512-NEXT: vmovd %ecx, %xmm10
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm5
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm6
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm7
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX512-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX512-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX512-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX512-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX512-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX512-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast <16 x i16> %a0 to <16 x half>
+ %2 = fpext <16 x half> %1 to <16 x float>
+ ret <16 x float> %2
+}
+
+;
+; Half to Float (Load)
+;
+
+define float @load_cvt_i16_to_f32(i16* %a0) {
+; ALL-LABEL: load_cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
+ %1 = load i16, i16* %a0
+ %2 = bitcast i16 %1 to half
+ %3 = fpext half %2 to float
+ ret float %3
+}
+
+define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) {
+; ALL-LABEL: load_cvt_4i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
+ %1 = load <4 x i16>, <4 x i16>* %a0
+ %2 = bitcast <4 x i16> %1 to <4 x half>
+ %3 = fpext <4 x half> %2 to <4 x float>
+ ret <4 x float> %3
+}
+
+define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) {
+; ALL-LABEL: load_cvt_8i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: movq (%rdi), %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movq %rax, %rdx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrq $48, %rdx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %esi, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a0
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = bitcast <4 x i16> %2 to <4 x half>
+ %4 = fpext <4 x half> %3 to <4 x float>
+ ret <4 x float> %4
+}
+
+define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) {
+; AVX1-LABEL: load_cvt_8i16_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswl 6(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: movswl 4(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: movswl 2(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl 8(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_cvt_8i16_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: movswl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: movswl 14(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: movswl 12(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl 8(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: movswl 10(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_cvt_8i16_to_8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a0
+ %2 = bitcast <8 x i16> %1 to <8 x half>
+ %3 = fpext <8 x half> %2 to <8 x float>
+ ret <8 x float> %3
+}
+
+define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) {
+; AVX1-LABEL: load_cvt_16i16_to_16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswl 22(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8
+; AVX1-NEXT: movswl 20(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9
+; AVX1-NEXT: movswl 16(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10
+; AVX1-NEXT: movswl 18(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11
+; AVX1-NEXT: movswl 30(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12
+; AVX1-NEXT: movswl 28(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13
+; AVX1-NEXT: movswl 24(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14
+; AVX1-NEXT: movswl 26(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15
+; AVX1-NEXT: movswl 6(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: movswl 4(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: movswl 2(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: movswl 8(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_cvt_16i16_to_16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswl 22(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8
+; AVX2-NEXT: movswl 20(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9
+; AVX2-NEXT: movswl 16(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10
+; AVX2-NEXT: movswl 18(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11
+; AVX2-NEXT: movswl 30(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12
+; AVX2-NEXT: movswl 28(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13
+; AVX2-NEXT: movswl 24(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14
+; AVX2-NEXT: movswl 26(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15
+; AVX2-NEXT: movswl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: movswl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: movswl 14(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl 12(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: movswl 8(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: movswl 10(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_cvt_16i16_to_16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm8
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm9
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm10
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm11
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm12
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm13
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm14
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm15
+; AVX512-NEXT: movswl 22(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 20(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl 16(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 18(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 30(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 28(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 24(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 26(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i16>, <16 x i16>* %a0
+ %2 = bitcast <16 x i16> %1 to <16 x half>
+ %3 = fpext <16 x half> %2 to <16 x float>
+ ret <16 x float> %3
+}
+
+;
+; Half to Double
+;
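+; With f16 values handled as i16 bitcasts, the CHECK lines below expect each element to be
+; extended via a scalar cvtph2ps followed by cvtss2sd, then repacked with vunpcklpd/vinsertf128.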
+
+define double @cvt_i16_to_f64(i16 %a0) {
+; ALL-LABEL: cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
+ %1 = bitcast i16 %a0 to half
+ %2 = fpext half %1 to double
+ ret double %2
+}
+
+define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) {
+; ALL-LABEL: cvt_2i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movswl %ax, %ecx
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: retq
+ %1 = bitcast <2 x i16> %a0 to <2 x half>
+ %2 = fpext <2 x half> %1 to <2 x double>
+ ret <2 x double> %2
+}
+
+define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) {
+; ALL-LABEL: cvt_4i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %esi, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %1 = bitcast <4 x i16> %a0 to <4 x half>
+ %2 = fpext <4 x half> %1 to <4 x double>
+ ret <4 x double> %2
+}
+
+define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) {
+; ALL-LABEL: cvt_8i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movswl %ax, %ecx
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i16> %1 to <2 x half>
+ %3 = fpext <2 x half> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) {
+; ALL-LABEL: cvt_8i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %esi, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <4 x i16> %1 to <4 x half>
+ %3 = fpext <4 x half> %2 to <4 x double>
+ ret <4 x double> %3
+}
+
+define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) {
+; AVX1-LABEL: cvt_8i16_to_8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %xmm0, %rdx
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: movl %edx, %r10d
+; AVX1-NEXT: movswl %dx, %r8d
+; AVX1-NEXT: shrq $48, %rdx
+; AVX1-NEXT: shrq $32, %r9
+; AVX1-NEXT: shrl $16, %r10d
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT: movq %rdi, %rsi
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: movswl %di, %ecx
+; AVX1-NEXT: shrq $48, %rdi
+; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX1-NEXT: movswl %si, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX1-NEXT: movswl %di, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX1-NEXT: movswl %r10w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: vmovd %r8d, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl %r9w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: movswl %dx, %eax
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8i16_to_8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: movl %edx, %r10d
+; AVX2-NEXT: movswl %dx, %r8d
+; AVX2-NEXT: shrq $48, %rdx
+; AVX2-NEXT: shrq $32, %r9
+; AVX2-NEXT: shrl $16, %r10d
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: movswl %di, %ecx
+; AVX2-NEXT: shrq $48, %rdi
+; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX2-NEXT: movswl %si, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX2-NEXT: movswl %di, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX2-NEXT: movswl %r10w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: vmovd %r8d, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl %r9w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: movswl %dx, %eax
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r8
+; AVX512-NEXT: movl %edx, %r10d
+; AVX512-NEXT: movswl %dx, %r9d
+; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: shrq $32, %r8
+; AVX512-NEXT: shrl $16, %r10d
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: movl %edi, %esi
+; AVX512-NEXT: movswl %di, %ecx
+; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: shrq $32, %rax
+; AVX512-NEXT: shrl $16, %esi
+; AVX512-NEXT: movswl %si, %esi
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vmovd %r9d, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl %r8w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast <8 x i16> %a0 to <8 x half>
+ %2 = fpext <8 x half> %1 to <8 x double>
+ ret <8 x double> %2
+}
+
+;
+; Half to Double (Load)
+;
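+; The load variants read each i16 lane (movswl from memory, or GPR shifts of a wider load) and
+; use the same per-element cvtph2ps + cvtss2sd sequence as above.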
+
+define double @load_cvt_i16_to_f64(i16* %a0) {
+; ALL-LABEL: load_cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
+ %1 = load i16, i16* %a0
+ %2 = bitcast i16 %1 to half
+ %3 = fpext half %2 to double
+ ret double %3
+}
+
+define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) {
+; ALL-LABEL: load_cvt_2i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: retq
+ %1 = load <2 x i16>, <2 x i16>* %a0
+ %2 = bitcast <2 x i16> %1 to <2 x half>
+ %3 = fpext <2 x half> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) {
+; ALL-LABEL: load_cvt_4i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %1 = load <4 x i16>, <4 x i16>* %a0
+ %2 = bitcast <4 x i16> %1 to <4 x half>
+ %3 = fpext <4 x half> %2 to <4 x double>
+ ret <4 x double> %3
+}
+
+define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) {
+; ALL-LABEL: load_cvt_8i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: movq (%rdi), %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %esi, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a0
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = bitcast <4 x i16> %2 to <4 x half>
+ %4 = fpext <4 x half> %3 to <4 x double>
+ ret <4 x double> %4
+}
+
+define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) {
+; AVX1-LABEL: load_cvt_8i16_to_8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswl 8(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: movswl 2(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl 4(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: movswl 6(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_cvt_8i16_to_8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswl 8(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX2-NEXT: movswl 10(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX2-NEXT: movswl 12(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX2-NEXT: movswl 14(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: movswl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: movswl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a0
+ %2 = bitcast <8 x i16> %1 to <8 x half>
+ %3 = fpext <8 x half> %2 to <8 x double>
+ ret <8 x double> %3
+}
+
+;
+; Float to Half
+;
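+; Truncation is likewise expected to be scalarized: one vcvtps2ph $4 per lane, with the i16
+; results packed back together via shift/or in GPRs or vpinsrw.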
+
+define i16 @cvt_f32_to_i16(float %a0) {
+; ALL-LABEL: cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: retq
+ %1 = fptrunc float %a0 to half
+ %2 = bitcast half %1 to i16
+ ret i16 %2
+}
+
+define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) {
+; ALL-LABEL: cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) {
+; ALL-LABEL: cvt_4f32_to_8i16_undef:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) {
+; ALL-LABEL: cvt_4f32_to_8i16_zero:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) {
+; AVX1-LABEL: cvt_8f32_to_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: movzwl %cx, %ecx
+; AVX1-NEXT: orl %eax, %ecx
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %edx
+; AVX1-NEXT: shll $16, %edx
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %edx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: shll $16, %ecx
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %edx
+; AVX1-NEXT: movzwl %dx, %edx
+; AVX1-NEXT: orl %ecx, %edx
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: shll $16, %ecx
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %esi
+; AVX1-NEXT: movzwl %si, %esi
+; AVX1-NEXT: orl %ecx, %esi
+; AVX1-NEXT: shlq $32, %rsi
+; AVX1-NEXT: orq %rdx, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm0
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8f32_to_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: shll $16, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: orl %eax, %ecx
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: shll $16, %edx
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %edx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %ecx
+; AVX2-NEXT: shll $16, %ecx
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: movzwl %dx, %edx
+; AVX2-NEXT: orl %ecx, %edx
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %ecx
+; AVX2-NEXT: shll $16, %ecx
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: orl %ecx, %esi
+; AVX2-NEXT: shlq $32, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vmovq %rsi, %xmm0
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_8f32_to_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: shll $16, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %ecx
+; AVX512-NEXT: movzwl %cx, %ecx
+; AVX512-NEXT: orl %eax, %ecx
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %edx
+; AVX512-NEXT: shll $16, %edx
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %edx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %rcx, %rax
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %ecx
+; AVX512-NEXT: shll $16, %ecx
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %edx
+; AVX512-NEXT: movzwl %dx, %edx
+; AVX512-NEXT: orl %ecx, %edx
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %ecx
+; AVX512-NEXT: shll $16, %ecx
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %esi
+; AVX512-NEXT: movzwl %si, %esi
+; AVX512-NEXT: orl %ecx, %esi
+; AVX512-NEXT: shlq $32, %rsi
+; AVX512-NEXT: orq %rdx, %rsi
+; AVX512-NEXT: vmovq %rsi, %xmm0
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: retq
+ %1 = fptrunc <8 x float> %a0 to <8 x half>
+ %2 = bitcast <8 x half> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) {
+; AVX1-LABEL: cvt_16f32_to_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_16f32_to_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = fptrunc <16 x float> %a0 to <16 x half>
+ %2 = bitcast <16 x half> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+;
+; Float to Half (Store)
+;
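+; Same per-lane vcvtps2ph lowering, but the converted halves are written out with individual
+; movw stores (or a single vmovdqa once packed).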
+
+define void @store_cvt_f32_to_i16(float %a0, i16* %a1) {
+; ALL-LABEL: store_cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movw %ax, (%rdi)
+; ALL-NEXT: retq
+ %1 = fptrunc float %a0 to half
+ %2 = bitcast half %1 to i16
+ store i16 %2, i16* %a1
+ ret void
+}
+
+define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) {
+; ALL-LABEL: store_cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, (%rdi)
+; ALL-NEXT: movw %dx, 6(%rdi)
+; ALL-NEXT: movw %cx, 4(%rdi)
+; ALL-NEXT: movw %ax, 2(%rdi)
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ store <4 x i16> %2, <4 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) {
+; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT: vmovdqa %xmm0, (%rdi)
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %3, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) {
+; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: vmovdqa %xmm0, (%rdi)
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %3, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_8f32_to_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %r8d
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %r9d
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %r10d
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %r11d
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %ecx
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %edx
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %esi
+; AVX1-NEXT: movw %si, 8(%rdi)
+; AVX1-NEXT: movw %dx, (%rdi)
+; AVX1-NEXT: movw %cx, 14(%rdi)
+; AVX1-NEXT: movw %ax, 12(%rdi)
+; AVX1-NEXT: movw %r11w, 10(%rdi)
+; AVX1-NEXT: movw %r10w, 6(%rdi)
+; AVX1-NEXT: movw %r9w, 4(%rdi)
+; AVX1-NEXT: movw %r8w, 2(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_8f32_to_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %r8d
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %r9d
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %r10d
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %r11d
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %ecx
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %esi
+; AVX2-NEXT: movw %si, 8(%rdi)
+; AVX2-NEXT: movw %dx, (%rdi)
+; AVX2-NEXT: movw %cx, 14(%rdi)
+; AVX2-NEXT: movw %ax, 12(%rdi)
+; AVX2-NEXT: movw %r11w, 10(%rdi)
+; AVX2-NEXT: movw %r10w, 6(%rdi)
+; AVX2-NEXT: movw %r9w, 4(%rdi)
+; AVX2-NEXT: movw %r8w, 2(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_8f32_to_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %r8d
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %r9d
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %r10d
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %r11d
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %ecx
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %edx
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %esi
+; AVX512-NEXT: movw %si, 8(%rdi)
+; AVX512-NEXT: movw %dx, (%rdi)
+; AVX512-NEXT: movw %cx, 14(%rdi)
+; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: movw %r11w, 10(%rdi)
+; AVX512-NEXT: movw %r10w, 6(%rdi)
+; AVX512-NEXT: movw %r9w, 4(%rdi)
+; AVX512-NEXT: movw %r8w, 2(%rdi)
+; AVX512-NEXT: retq
+ %1 = fptrunc <8 x float> %a0 to <8 x half>
+ %2 = bitcast <8 x half> %1 to <8 x i16>
+ store <8 x i16> %2, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_16f32_to_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX1-NEXT: movw %ax, 24(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX1-NEXT: movw %ax, 16(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX1-NEXT: movw %ax, 8(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX1-NEXT: movw %ax, (%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX1-NEXT: movw %ax, 30(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX1-NEXT: movw %ax, 28(%rdi)
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX1-NEXT: movw %ax, 26(%rdi)
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX1-NEXT: movw %ax, 22(%rdi)
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: movw %ax, 20(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: movw %ax, 18(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: movw %ax, 14(%rdi)
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: movw %ax, 12(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, 10(%rdi)
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: movw %ax, 6(%rdi)
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: movw %ax, 4(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: movw %ax, 2(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_16f32_to_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX2-NEXT: movw %ax, 24(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX2-NEXT: movw %ax, 16(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX2-NEXT: movw %ax, 8(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX2-NEXT: movw %ax, (%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX2-NEXT: movw %ax, 30(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX2-NEXT: movw %ax, 28(%rdi)
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX2-NEXT: movw %ax, 26(%rdi)
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX2-NEXT: movw %ax, 22(%rdi)
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: movw %ax, 20(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: movw %ax, 18(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: movw %ax, 14(%rdi)
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: movw %ax, 12(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, 10(%rdi)
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: movw %ax, 6(%rdi)
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: movw %ax, 4(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: movw %ax, 2(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX512-NEXT: movw %ax, 24(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX512-NEXT: movw %ax, 16(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512-NEXT: movw %ax, 8(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, 30(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 28(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 26(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 22(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 20(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 18(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: movw %ax, 14(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: movw %ax, 10(%rdi)
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: movw %ax, 6(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: movw %ax, 4(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: movw %ax, 2(%rdi)
+; AVX512-NEXT: retq
+ %1 = fptrunc <16 x float> %a0 to <16 x half>
+ %2 = bitcast <16 x half> %1 to <16 x i16>
+ store <16 x i16> %2, <16 x i16>* %a1
+ ret void
+}
+
+;
+; Double to Half
+;
+
+define i16 @cvt_f64_to_i16(double %a0) {
+; ALL-LABEL: cvt_f64_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: jmp __truncdfhf2 # TAILCALL
+ %1 = fptrunc double %a0 to half
+ %2 = bitcast half %1 to i16
+ ret i16 %2
+}
+
+define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) {
+; ALL-LABEL: cvt_2f64_to_2i16:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: .Ltmp0:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: subq $16, %rsp
+; ALL-NEXT: .Ltmp1:
+; ALL-NEXT: .cfi_def_cfa_offset 32
+; ALL-NEXT: .Ltmp2:
+; ALL-NEXT: .cfi_offset %rbx, -16
+; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movw %ax, %bx
+; ALL-NEXT: shll $16, %ebx
+; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movzwl %ax, %eax
+; ALL-NEXT: orl %ebx, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: addq $16, %rsp
+; ALL-NEXT: popq %rbx
+; ALL-NEXT: retq
+ %1 = fptrunc <2 x double> %a0 to <2 x half>
+ %2 = bitcast <2 x half> %1 to <2 x i16>
+ ret <2 x i16> %2
+}
+
+define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) {
+; AVX1-LABEL: cvt_4f64_to_4i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp4:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: subq $40, %rsp
+; AVX1-NEXT: .Ltmp5:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp6:
+; AVX1-NEXT: .cfi_offset %rbx, -24
+; AVX1-NEXT: .Ltmp7:
+; AVX1-NEXT: .cfi_offset %r14, -16
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r14d
+; AVX1-NEXT: orl %ebx, %r14d
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %r14, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: addq $40, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_4f64_to_4i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp3:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp4:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: subq $40, %rsp
+; AVX2-NEXT: .Ltmp5:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp6:
+; AVX2-NEXT: .cfi_offset %rbx, -24
+; AVX2-NEXT: .Ltmp7:
+; AVX2-NEXT: .cfi_offset %r14, -16
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r14d
+; AVX2-NEXT: orl %ebx, %r14d
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %r14, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: addq $40, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_4f64_to_4i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp3:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp4:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: .Ltmp5:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp6:
+; AVX512-NEXT: .cfi_offset %rbx, -24
+; AVX512-NEXT: .Ltmp7:
+; AVX512-NEXT: .cfi_offset %r14, -16
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) {
+; AVX1-LABEL: cvt_4f64_to_8i16_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp8:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp9:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: subq $40, %rsp
+; AVX1-NEXT: .Ltmp10:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp11:
+; AVX1-NEXT: .cfi_offset %rbx, -24
+; AVX1-NEXT: .Ltmp12:
+; AVX1-NEXT: .cfi_offset %r14, -16
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r14d
+; AVX1-NEXT: orl %ebx, %r14d
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %r14, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: addq $40, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_4f64_to_8i16_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp8:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp9:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: subq $40, %rsp
+; AVX2-NEXT: .Ltmp10:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp11:
+; AVX2-NEXT: .cfi_offset %rbx, -24
+; AVX2-NEXT: .Ltmp12:
+; AVX2-NEXT: .cfi_offset %r14, -16
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r14d
+; AVX2-NEXT: orl %ebx, %r14d
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %r14, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: addq $40, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_4f64_to_8i16_undef:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp8:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp9:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: .Ltmp10:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp11:
+; AVX512-NEXT: .cfi_offset %rbx, -24
+; AVX512-NEXT: .Ltmp12:
+; AVX512-NEXT: .cfi_offset %r14, -16
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) {
+; AVX1-LABEL: cvt_4f64_to_8i16_zero:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp13:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp14:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: subq $40, %rsp
+; AVX1-NEXT: .Ltmp15:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp16:
+; AVX1-NEXT: .cfi_offset %rbx, -24
+; AVX1-NEXT: .Ltmp17:
+; AVX1-NEXT: .cfi_offset %r14, -16
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r14d
+; AVX1-NEXT: orl %ebx, %r14d
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %r14, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: addq $40, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_4f64_to_8i16_zero:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp13:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp14:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: subq $40, %rsp
+; AVX2-NEXT: .Ltmp15:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp16:
+; AVX2-NEXT: .cfi_offset %rbx, -24
+; AVX2-NEXT: .Ltmp17:
+; AVX2-NEXT: .cfi_offset %r14, -16
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r14d
+; AVX2-NEXT: orl %ebx, %r14d
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %r14, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: addq $40, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_4f64_to_8i16_zero:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp13:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp14:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: .Ltmp15:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp16:
+; AVX512-NEXT: .cfi_offset %rbx, -24
+; AVX512-NEXT: .Ltmp17:
+; AVX512-NEXT: .cfi_offset %r14, -16
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
+; AVX1-LABEL: cvt_8f64_to_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: .Ltmp18:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp19:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp20:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: .Ltmp21:
+; AVX1-NEXT: .cfi_def_cfa_offset 96
+; AVX1-NEXT: .Ltmp22:
+; AVX1-NEXT: .cfi_offset %rbx, -32
+; AVX1-NEXT: .Ltmp23:
+; AVX1-NEXT: .cfi_offset %r14, -24
+; AVX1-NEXT: .Ltmp24:
+; AVX1-NEXT: .cfi_offset %r15, -16
+; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r15d
+; AVX1-NEXT: orl %ebx, %r15d
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r14d
+; AVX1-NEXT: orl %ebx, %r14d
+; AVX1-NEXT: shlq $32, %r14
+; AVX1-NEXT: orq %r15, %r14
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r15d
+; AVX1-NEXT: orl %ebx, %r15d
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %r15, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vmovq %r14, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: addq $64, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8f64_to_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: .Ltmp18:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp19:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp20:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: .Ltmp21:
+; AVX2-NEXT: .cfi_def_cfa_offset 96
+; AVX2-NEXT: .Ltmp22:
+; AVX2-NEXT: .cfi_offset %rbx, -32
+; AVX2-NEXT: .Ltmp23:
+; AVX2-NEXT: .cfi_offset %r14, -24
+; AVX2-NEXT: .Ltmp24:
+; AVX2-NEXT: .cfi_offset %r15, -16
+; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r15d
+; AVX2-NEXT: orl %ebx, %r15d
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r14d
+; AVX2-NEXT: orl %ebx, %r14d
+; AVX2-NEXT: shlq $32, %r14
+; AVX2-NEXT: orq %r15, %r14
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r15d
+; AVX2-NEXT: orl %ebx, %r15d
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %r15, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vmovq %r14, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: addq $64, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_8f64_to_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: .Ltmp18:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp19:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp20:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: subq $96, %rsp
+; AVX512-NEXT: .Ltmp21:
+; AVX512-NEXT: .cfi_def_cfa_offset 128
+; AVX512-NEXT: .Ltmp22:
+; AVX512-NEXT: .cfi_offset %rbx, -32
+; AVX512-NEXT: .Ltmp23:
+; AVX512-NEXT: .cfi_offset %r14, -24
+; AVX512-NEXT: .Ltmp24:
+; AVX512-NEXT: .cfi_offset %r15, -16
+; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r15d
+; AVX512-NEXT: orl %ebx, %r15d
+; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: shlq $32, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r15d
+; AVX512-NEXT: orl %ebx, %r15d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r15, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovq %r14, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: addq $96, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: retq
+ %1 = fptrunc <8 x double> %a0 to <8 x half>
+ %2 = bitcast <8 x half> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+;
+; Double to Half (Store)
+;
+
+define void @store_cvt_f64_to_i16(double %a0, i16* %a1) {
+; ALL-LABEL: store_cvt_f64_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: .Ltmp25:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: .Ltmp26:
+; ALL-NEXT: .cfi_offset %rbx, -16
+; ALL-NEXT: movq %rdi, %rbx
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movw %ax, (%rbx)
+; ALL-NEXT: popq %rbx
+; ALL-NEXT: retq
+ %1 = fptrunc double %a0 to half
+ %2 = bitcast half %1 to i16
+ store i16 %2, i16* %a1
+ ret void
+}
+
+define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) {
+; ALL-LABEL: store_cvt_2f64_to_2i16:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: .Ltmp27:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: .Ltmp28:
+; ALL-NEXT: .cfi_def_cfa_offset 24
+; ALL-NEXT: subq $24, %rsp
+; ALL-NEXT: .Ltmp29:
+; ALL-NEXT: .cfi_def_cfa_offset 48
+; ALL-NEXT: .Ltmp30:
+; ALL-NEXT: .cfi_offset %rbx, -24
+; ALL-NEXT: .Ltmp31:
+; ALL-NEXT: .cfi_offset %rbp, -16
+; ALL-NEXT: movq %rdi, %rbx
+; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movl %eax, %ebp
+; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movw %ax, (%rbx)
+; ALL-NEXT: movw %bp, 2(%rbx)
+; ALL-NEXT: addq $24, %rsp
+; ALL-NEXT: popq %rbx
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
+ %1 = fptrunc <2 x double> %a0 to <2 x half>
+ %2 = bitcast <2 x half> %1 to <2 x i16>
+ store <2 x i16> %2, <2 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_4f64_to_4i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Ltmp32:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: .Ltmp33:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp34:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp35:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: subq $88, %rsp
+; AVX1-NEXT: .Ltmp36:
+; AVX1-NEXT: .cfi_def_cfa_offset 128
+; AVX1-NEXT: .Ltmp37:
+; AVX1-NEXT: .cfi_offset %rbx, -40
+; AVX1-NEXT: .Ltmp38:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Ltmp39:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: .Ltmp40:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rdi, %rbx
+; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r14d
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r15d
+; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %ebp
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, 4(%rbx)
+; AVX1-NEXT: movw %bp, (%rbx)
+; AVX1-NEXT: movw %r15w, 6(%rbx)
+; AVX1-NEXT: movw %r14w, 2(%rbx)
+; AVX1-NEXT: addq $88, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_4f64_to_4i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Ltmp32:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: .Ltmp33:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp34:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp35:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: subq $88, %rsp
+; AVX2-NEXT: .Ltmp36:
+; AVX2-NEXT: .cfi_def_cfa_offset 128
+; AVX2-NEXT: .Ltmp37:
+; AVX2-NEXT: .cfi_offset %rbx, -40
+; AVX2-NEXT: .Ltmp38:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Ltmp39:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: .Ltmp40:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r14d
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r15d
+; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %ebp
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, 4(%rbx)
+; AVX2-NEXT: movw %bp, (%rbx)
+; AVX2-NEXT: movw %r15w, 6(%rbx)
+; AVX2-NEXT: movw %r14w, 2(%rbx)
+; AVX2-NEXT: addq $88, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_4f64_to_4i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Ltmp32:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: .Ltmp33:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp34:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp35:
+; AVX512-NEXT: .cfi_def_cfa_offset 40
+; AVX512-NEXT: subq $88, %rsp
+; AVX512-NEXT: .Ltmp36:
+; AVX512-NEXT: .cfi_def_cfa_offset 128
+; AVX512-NEXT: .Ltmp37:
+; AVX512-NEXT: .cfi_offset %rbx, -40
+; AVX512-NEXT: .Ltmp38:
+; AVX512-NEXT: .cfi_offset %r14, -32
+; AVX512-NEXT: .Ltmp39:
+; AVX512-NEXT: .cfi_offset %r15, -24
+; AVX512-NEXT: .Ltmp40:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rdi, %rbx
+; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r14d
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r15d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, 4(%rbx)
+; AVX512-NEXT: movw %bp, (%rbx)
+; AVX512-NEXT: movw %r15w, 6(%rbx)
+; AVX512-NEXT: movw %r14w, 2(%rbx)
+; AVX512-NEXT: addq $88, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ store <4 x i16> %2, <4 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Ltmp41:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp42:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp43:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: .Ltmp44:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp45:
+; AVX1-NEXT: .cfi_offset %rbx, -32
+; AVX1-NEXT: .Ltmp46:
+; AVX1-NEXT: .cfi_offset %r14, -24
+; AVX1-NEXT: .Ltmp47:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rdi, %r14
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: shll $16, %ebp
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %ebx
+; AVX1-NEXT: orl %ebp, %ebx
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: shll $16, %ebp
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebp, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rbx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa %xmm0, (%r14)
+; AVX1-NEXT: addq $32, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Ltmp41:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp42:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp43:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: .Ltmp44:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp45:
+; AVX2-NEXT: .cfi_offset %rbx, -32
+; AVX2-NEXT: .Ltmp46:
+; AVX2-NEXT: .cfi_offset %r14, -24
+; AVX2-NEXT: .Ltmp47:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rdi, %r14
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: shll $16, %ebp
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %ebx
+; AVX2-NEXT: orl %ebp, %ebx
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: shll $16, %ebp
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebp, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vmovdqa %xmm0, (%r14)
+; AVX2-NEXT: addq $32, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Ltmp41:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp42:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp43:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: subq $32, %rsp
+; AVX512-NEXT: .Ltmp44:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp45:
+; AVX512-NEXT: .cfi_offset %rbx, -32
+; AVX512-NEXT: .Ltmp46:
+; AVX512-NEXT: .cfi_offset %r14, -24
+; AVX512-NEXT: .Ltmp47:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rdi, %r14
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %ebx
+; AVX512-NEXT: orl %ebp, %ebx
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebp, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vmovdqa %xmm0, (%r14)
+; AVX512-NEXT: addq $32, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %3, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Ltmp48:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp49:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp50:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: .Ltmp51:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp52:
+; AVX1-NEXT: .cfi_offset %rbx, -32
+; AVX1-NEXT: .Ltmp53:
+; AVX1-NEXT: .cfi_offset %r14, -24
+; AVX1-NEXT: .Ltmp54:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rdi, %r14
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: shll $16, %ebp
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %ebx
+; AVX1-NEXT: orl %ebp, %ebx
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: shll $16, %ebp
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebp, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rbx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovdqa %xmm0, (%r14)
+; AVX1-NEXT: addq $32, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Ltmp48:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp49:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp50:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: .Ltmp51:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp52:
+; AVX2-NEXT: .cfi_offset %rbx, -32
+; AVX2-NEXT: .Ltmp53:
+; AVX2-NEXT: .cfi_offset %r14, -24
+; AVX2-NEXT: .Ltmp54:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rdi, %r14
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: shll $16, %ebp
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %ebx
+; AVX2-NEXT: orl %ebp, %ebx
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: shll $16, %ebp
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebp, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vmovdqa %xmm0, (%r14)
+; AVX2-NEXT: addq $32, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Ltmp48:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp49:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp50:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: subq $32, %rsp
+; AVX512-NEXT: .Ltmp51:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp52:
+; AVX512-NEXT: .cfi_offset %rbx, -32
+; AVX512-NEXT: .Ltmp53:
+; AVX512-NEXT: .cfi_offset %r14, -24
+; AVX512-NEXT: .Ltmp54:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rdi, %r14
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %ebx
+; AVX512-NEXT: orl %ebp, %ebx
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebp, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa %xmm0, (%r14)
+; AVX512-NEXT: addq $32, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %3, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_8f64_to_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Ltmp55:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: .Ltmp56:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp57:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: .Ltmp58:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: .Ltmp59:
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp60:
+; AVX1-NEXT: .cfi_def_cfa_offset 56
+; AVX1-NEXT: subq $136, %rsp
+; AVX1-NEXT: .Ltmp61:
+; AVX1-NEXT: .cfi_def_cfa_offset 192
+; AVX1-NEXT: .Ltmp62:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Ltmp63:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Ltmp64:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Ltmp65:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Ltmp66:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: .Ltmp67:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rdi, %rbx
+; AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r12d
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r13d
+; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %ebp
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r14d
+; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r15d
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, 12(%rbx)
+; AVX1-NEXT: movw %r15w, 8(%rbx)
+; AVX1-NEXT: movw %r14w, 4(%rbx)
+; AVX1-NEXT: movw %bp, (%rbx)
+; AVX1-NEXT: movw %r13w, 14(%rbx)
+; AVX1-NEXT: movw %r12w, 10(%rbx)
+; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX1-NEXT: movw %ax, 6(%rbx)
+; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX1-NEXT: movw %ax, 2(%rbx)
+; AVX1-NEXT: addq $136, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_8f64_to_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Ltmp55:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: .Ltmp56:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp57:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: .Ltmp58:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: .Ltmp59:
+; AVX2-NEXT: .cfi_def_cfa_offset 48
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp60:
+; AVX2-NEXT: .cfi_def_cfa_offset 56
+; AVX2-NEXT: subq $136, %rsp
+; AVX2-NEXT: .Ltmp61:
+; AVX2-NEXT: .cfi_def_cfa_offset 192
+; AVX2-NEXT: .Ltmp62:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Ltmp63:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Ltmp64:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Ltmp65:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Ltmp66:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: .Ltmp67:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r12d
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r13d
+; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %ebp
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r14d
+; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r15d
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, 12(%rbx)
+; AVX2-NEXT: movw %r15w, 8(%rbx)
+; AVX2-NEXT: movw %r14w, 4(%rbx)
+; AVX2-NEXT: movw %bp, (%rbx)
+; AVX2-NEXT: movw %r13w, 14(%rbx)
+; AVX2-NEXT: movw %r12w, 10(%rbx)
+; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX2-NEXT: movw %ax, 6(%rbx)
+; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX2-NEXT: movw %ax, 2(%rbx)
+; AVX2-NEXT: addq $136, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_8f64_to_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Ltmp55:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: .Ltmp56:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp57:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: .Ltmp58:
+; AVX512-NEXT: .cfi_def_cfa_offset 40
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: .Ltmp59:
+; AVX512-NEXT: .cfi_def_cfa_offset 48
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp60:
+; AVX512-NEXT: .cfi_def_cfa_offset 56
+; AVX512-NEXT: subq $200, %rsp
+; AVX512-NEXT: .Ltmp61:
+; AVX512-NEXT: .cfi_def_cfa_offset 256
+; AVX512-NEXT: .Ltmp62:
+; AVX512-NEXT: .cfi_offset %rbx, -56
+; AVX512-NEXT: .Ltmp63:
+; AVX512-NEXT: .cfi_offset %r12, -48
+; AVX512-NEXT: .Ltmp64:
+; AVX512-NEXT: .cfi_offset %r13, -40
+; AVX512-NEXT: .Ltmp65:
+; AVX512-NEXT: .cfi_offset %r14, -32
+; AVX512-NEXT: .Ltmp66:
+; AVX512-NEXT: .cfi_offset %r15, -24
+; AVX512-NEXT: .Ltmp67:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rdi, %rbx
+; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r12d
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r13d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r14d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r15d
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, 12(%rbx)
+; AVX512-NEXT: movw %r15w, 8(%rbx)
+; AVX512-NEXT: movw %r14w, 4(%rbx)
+; AVX512-NEXT: movw %bp, (%rbx)
+; AVX512-NEXT: movw %r13w, 14(%rbx)
+; AVX512-NEXT: movw %r12w, 10(%rbx)
+; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX512-NEXT: movw %ax, 6(%rbx)
+; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX512-NEXT: movw %ax, 2(%rbx)
+; AVX512-NEXT: addq $200, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = fptrunc <8 x double> %a0 to <8 x half>
+ %2 = bitcast <8 x half> %1 to <8 x i16>
+ store <8 x i16> %2, <8 x i16>* %a1
+ ret void
+}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
new file mode 100644
index 000000000000..f344d6dc3cc6
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -0,0 +1,622 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; sdiv by 7
+;
+
+define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_div7_2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; SSE2-NEXT: imulq %rcx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: movd %rdx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: imulq %rcx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: movd %rdx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; SSE41-NEXT: imulq %rcx
+; SSE41-NEXT: movq %rdx, %rax
+; SSE41-NEXT: shrq $63, %rax
+; SSE41-NEXT: sarq %rdx
+; SSE41-NEXT: addq %rax, %rdx
+; SSE41-NEXT: movd %rdx, %xmm1
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: imulq %rcx
+; SSE41-NEXT: movq %rdx, %rax
+; SSE41-NEXT: shrq $63, %rax
+; SSE41-NEXT: sarq %rdx
+; SSE41-NEXT: addq %rax, %rdx
+; SSE41-NEXT: movd %rdx, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_div7_2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %res = sdiv <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %res
+}
+
+define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_div7_4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $31, %xmm0
+; SSE2-NEXT: psrad $2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm2, %xmm3
+; SSE41-NEXT: pmuldq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $31, %xmm0
+; SSE41-NEXT: psrad $2, %xmm1
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_div7_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
+; SSE-LABEL: test_div7_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $15, %xmm1
+; SSE-NEXT: psraw $1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_div7_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %res
+}
+
+define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_div7_16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm2, %xmm0
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $2, %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: psubb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $7, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_div7_16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %res
+}
+
+;
+; srem by 7
+;
+
+define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_rem7_2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: imulq %rsi
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: leaq (,%rdx,8), %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: subq %rax, %rcx
+; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: imulq %rsi
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: leaq (,%rdx,8), %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: subq %rax, %rcx
+; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rcx
+; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: imulq %rsi
+; SSE41-NEXT: movq %rdx, %rax
+; SSE41-NEXT: shrq $63, %rax
+; SSE41-NEXT: sarq %rdx
+; SSE41-NEXT: addq %rax, %rdx
+; SSE41-NEXT: leaq (,%rdx,8), %rax
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: subq %rax, %rcx
+; SSE41-NEXT: movd %rcx, %xmm1
+; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: imulq %rsi
+; SSE41-NEXT: movq %rdx, %rax
+; SSE41-NEXT: shrq $63, %rax
+; SSE41-NEXT: sarq %rdx
+; SSE41-NEXT: addq %rax, %rdx
+; SSE41-NEXT: leaq (,%rdx,8), %rax
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: subq %rax, %rcx
+; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_rem7_2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %res = srem <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %res
+}
+
+define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_rem7_4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrld $31, %xmm2
+; SSE2-NEXT: psrad $2, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm2, %xmm3
+; SSE41-NEXT: pmuldq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrld $31, %xmm2
+; SSE41-NEXT: psrad $2, %xmm1
+; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_rem7_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
+; SSE-LABEL: test_rem7_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; SSE-NEXT: pmulhw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrlw $15, %xmm2
+; SSE-NEXT: psraw $1, %xmm1
+; SSE-NEXT: paddw %xmm2, %xmm1
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_rem7_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX-NEXT: vpsraw $1, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %res
+}
+
+define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_rem7_16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: psubb %xmm3, %xmm2
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: psubb %xmm3, %xmm2
+; SSE41-NEXT: psrlw $7, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm3
+; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
+; SSE41-NEXT: pmullw %xmm3, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm2
+; SSE41-NEXT: psubb %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_rem7_16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
new file mode 100644
index 000000000000..cfd2fc625a6c
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -0,0 +1,545 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; sdiv by 7
+;
+
+define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
+; AVX1-LABEL: test_div7_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; AVX1-NEXT: imulq %rcx
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: imulq %rcx
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: imulq %rcx
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: imulq %rcx
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; AVX2-NEXT: imulq %rcx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: imulq %rcx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: imulq %rcx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: imulq %rcx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
+; AVX1-LABEL: test_div7_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
+; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1
+; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: test_div7_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm1
+; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %res
+}
+
+define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_div7_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7
+; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %res
+}
+
+;
+; srem by 7
+;
+
+define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
+; AVX1-LABEL: test_rem7_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: imulq %rsi
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: subq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: imulq %rsi
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: subq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: imulq %rsi
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: subq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: imulq %rsi
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: subq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: imulq %rsi
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: subq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: imulq %rsi
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: subq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: imulq %rsi
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: subq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: imulq %rsi
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: subq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = srem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
+; AVX1-LABEL: test_rem7_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
+; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
+; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $31, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
+; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: test_rem7_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4
+; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2
+; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %res
+}
+
+define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_rem7_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm1
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm4
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm5
+; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm3
+; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsubb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
new file mode 100644
index 000000000000..1bb7181d31df
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -0,0 +1,2392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; sdiv by 7
+;
+
+define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
+; AVX-LABEL: test_div7_8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rax
+; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = sdiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ ret <8 x i64> %res
+}
+
+define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
+; AVX-LABEL: test_div7_16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrd $1, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm2
+; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $2, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = sdiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i32> %res
+}
+
+define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
+; AVX512F-LABEL: test_div7_32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm3
+; AVX512F-NEXT: vpsraw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm2
+; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm1
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = sdiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <32 x i16> %res
+}
+
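+; There is no byte multiply-high instruction, so the AVX512F lowering
+; sign-extends each half to i16 (vpmovsxbw), multiplies by 147 and keeps the
+; high byte, whereas plain AVX512BW currently scalarizes all 64 lanes.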
+define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_div7_64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpxor %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
+; AVX512BW-NEXT: movsbl %cl, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movl %ecx, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm2
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
+; AVX512BW-NEXT: movsbl %cl, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movl %ecx, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
+; AVX512BW-NEXT: movsbl %cl, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movl %ecx, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX512BW-NEXT: movsbl %cl, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movl %ecx, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
+
+;
+; srem by 7
+;
+
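+; The remainder is computed as a - (a/7)*7, reusing the divide-by-7 lowering
+; above; the multiply by 7 shows up either as an 8*q - q sequence (lea/sub) or
+; as an explicit multiply by the splat constant 7.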
+define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
+; AVX-LABEL: test_rem7_8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = srem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ ret <8 x i64> %res
+}
+
+define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
+; AVX-LABEL: test_rem7_16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrd $1, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: sarl $2, %edx
+; AVX-NEXT: addl %esi, %edx
+; AVX-NEXT: leal (,%rdx,8), %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: subl %esi, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm2
+; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $2, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: sarl $2, %edx
+; AVX-NEXT: addl %esi, %edx
+; AVX-NEXT: leal (,%rdx,8), %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: subl %esi, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: sarl $2, %edx
+; AVX-NEXT: addl %esi, %edx
+; AVX-NEXT: leal (,%rdx,8), %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: subl %esi, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: sarl $2, %edx
+; AVX-NEXT: addl %esi, %edx
+; AVX-NEXT: leal (,%rdx,8), %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: subl %esi, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = srem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i32> %res
+}
+
+define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
+; AVX512F-LABEL: test_rem7_32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
+; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3
+; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT: vpsrlw $15, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = srem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <32 x i16> %res
+}
+
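+; As in the division case, AVX512F widens the bytes to words to form the
+; quotient, multiplies it back by 7 and subtracts, while AVX512BW scalarizes
+; every lane (imull $-109 followed by mulb $7).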
+define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_rem7_64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm0, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $7, %ymm4, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm10, %ymm6, %ymm8
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpxor %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpsubb %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm8, %ymm4, %ymm8
+; AVX512F-NEXT: vpmovsxbw %xmm8, %ymm9
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm4
+; AVX512F-NEXT: vpmullw %ymm4, %ymm9, %ymm9
+; AVX512F-NEXT: vpmovsxwd %ymm9, %zmm9
+; AVX512F-NEXT: vpmovdb %zmm9, %xmm9
+; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm5
+; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5
+; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovsxwd %ymm5, %zmm5
+; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm9, %ymm5
+; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5
+; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm5
+; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm3[2,3],ymm2[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
+; AVX512F-NEXT: vpand %ymm10, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm3
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %edx
+; AVX512BW-NEXT: imull $-109, %edx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movb $7, %dil
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %esi
+; AVX512BW-NEXT: imull $-109, %esi, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm2
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %esi
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %edx
+; AVX512BW-NEXT: imull $-109, %edx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %esi
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %edx
+; AVX512BW-NEXT: imull $-109, %edx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %esi
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %edx
+; AVX512BW-NEXT: imull $-109, %edx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = srem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
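Every AVX512BW lane above repeats the same scalar signed-divide-by-7 expansion: sign-extend the byte, multiply by -109, keep bits 8..15 of the 32-bit product, add the dividend back, fix up the sign with shrb $7 / sarb $2, then multiply the quotient by the divisor 7 (kept in %dil, loaded earlier in the function outside this excerpt) and subtract to get the remainder. A per-lane sketch of that arithmetic in LLVM IR, with an illustrative function name that is not part of the test:

define i8 @srem7_scalar(i8 %x) {
  %xs = sext i8 %x to i32            ; movsbl
  %mul = mul i32 %xs, -109           ; imull $-109
  %hibits = lshr i32 %mul, 8         ; shrl $8
  %hi = trunc i32 %hibits to i8
  %q0 = add i8 %hi, %x               ; addb %cl, %al
  %sign = lshr i8 %q0, 7             ; shrb $7 (sign bit of the raw quotient)
  %qshift = ashr i8 %q0, 2           ; sarb $2
  %q = add i8 %qshift, %sign
  %q7 = mul i8 %q, 7                 ; mulb %dil
  %r = sub i8 %x, %q7                ; subb %al, %cl
  ret i8 %r
}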
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
new file mode 100644
index 000000000000..1e68dc9170bf
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -0,0 +1,592 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; udiv by 7
+;
+
+define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_div7_2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: subq %rdx, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: addq %rdx, %rcx
+; SSE2-NEXT: shrq $2, %rcx
+; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: subq %rdx, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: addq %rdx, %rcx
+; SSE2-NEXT: shrq $2, %rcx
+; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rcx
+; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: subq %rdx, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: addq %rdx, %rcx
+; SSE41-NEXT: shrq $2, %rcx
+; SSE41-NEXT: movd %rcx, %xmm1
+; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: subq %rdx, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: addq %rdx, %rcx
+; SSE41-NEXT: shrq $2, %rcx
+; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_div7_2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %res = udiv <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %res
+}
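There is no vector 64-bit multiply-high, so both elements are divided in scalar code: 2635249153387078803 (0x2492492492492493) is the magic multiplier for an unsigned divide by 7, mulq leaves the high half of the product in %rdx, and the sub/shr/add/shr sequence is the usual fixup when the magic constant does not fit the word. A scalar sketch of the same arithmetic in LLVM IR (the function name is illustrative, not part of the test):

define i64 @udiv7_scalar(i64 %n) {
  %wide = zext i64 %n to i128
  %prod = mul i128 %wide, 2635249153387078803  ; 0x2492492492492493
  %hibits = lshr i128 %prod, 64                ; high half, i.e. %rdx after mulq
  %hi = trunc i128 %hibits to i64
  %sub = sub i64 %n, %hi                       ; subq %rdx, %rcx
  %half = lshr i64 %sub, 1                     ; shrq %rcx
  %sum = add i64 %half, %hi                    ; addq %rdx, %rcx
  %q = lshr i64 %sum, 2                        ; shrq $2, %rcx
  ret i64 %q
}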
+
+define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_div7_4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: psrld $1, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_div7_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
+; SSE-LABEL: test_div7_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT: pmulhuw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_div7_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %res
+}
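For i16 lanes the multiply-high exists as a vector instruction: pmulhuw with the splatted 9363 produces the high half directly, and the same subtract/shift/add/shift fixup follows. Per lane this is equivalent to the sketch below (illustrative name, not part of the test):

define i16 @udiv7_i16(i16 %x) {
  %wide = zext i16 %x to i32
  %prod = mul i32 %wide, 9363        ; pmulhuw with the splatted 9363
  %hibits = lshr i32 %prod, 16
  %hi = trunc i32 %hibits to i16
  %sub = sub i16 %x, %hi             ; psubw
  %half = lshr i16 %sub, 1           ; psrlw $1
  %sum = add i16 %half, %hi          ; paddw
  %q = lshr i16 %sum, 2              ; psrlw $2
  ret i16 %q
}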
+
+define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_div7_16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: pmullw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm1, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: psubb %xmm3, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: psubb %xmm1, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: psrlw $2, %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_div7_16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %res
+}
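x86 has no byte multiply, so the i8 case widens each half of the vector to i16, multiplies by the splatted 37, shifts the products right by 8 to recover the high byte, and packs back down; the pand masks after each psrlw clear the bits a true 8-bit shift would have discarded. Per byte the checks above amount to the following sketch (illustrative name, not part of the test):

define i8 @udiv7_i8(i8 %x) {
  %wide = zext i8 %x to i16
  %prod = mul i16 %wide, 37          ; pmullw with the splatted 37
  %hibits = lshr i16 %prod, 8        ; psrlw $8
  %hi = trunc i16 %hibits to i8
  %sub = sub i8 %x, %hi              ; psubb
  %half = lshr i8 %sub, 1            ; psrlw $1 + pand 127
  %sum = add i8 %half, %hi           ; paddb
  %q = lshr i8 %sum, 2               ; psrlw $2 + pand 63
  ret i8 %q
}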
+
+;
+; urem by 7
+;
+
+define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_rem7_2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: addq %rdx, %rax
+; SSE2-NEXT: shrq $2, %rax
+; SSE2-NEXT: leaq (,%rax,8), %rdx
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: subq %rdx, %rcx
+; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: addq %rdx, %rax
+; SSE2-NEXT: shrq $2, %rax
+; SSE2-NEXT: leaq (,%rax,8), %rdx
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: subq %rdx, %rcx
+; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rcx
+; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: shrq %rax
+; SSE41-NEXT: addq %rdx, %rax
+; SSE41-NEXT: shrq $2, %rax
+; SSE41-NEXT: leaq (,%rax,8), %rdx
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: subq %rdx, %rcx
+; SSE41-NEXT: movd %rcx, %xmm1
+; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: shrq %rax
+; SSE41-NEXT: addq %rdx, %rax
+; SSE41-NEXT: shrq $2, %rax
+; SSE41-NEXT: leaq (,%rax,8), %rdx
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: subq %rdx, %rcx
+; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_rem7_2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %res = urem <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %res
+}
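The remainder tests reuse the division sequence and then fold in r = n - 7*q, with 7*q formed as 8*q - q by the leaq (,%rax,8) / subq pair. A scalar sketch (illustrative name, not part of the test):

define i64 @urem7_scalar(i64 %n) {
  %wide = zext i64 %n to i128
  %prod = mul i128 %wide, 2635249153387078803  ; same magic as the udiv test
  %hibits = lshr i128 %prod, 64
  %hi = trunc i128 %hibits to i64
  %sub = sub i64 %n, %hi
  %half = lshr i64 %sub, 1
  %sum = add i64 %half, %hi
  %q = lshr i64 %sum, 2
  %q8 = shl i64 %q, 3                ; leaq (,%rax,8), %rdx
  %q7 = sub i64 %q8, %q              ; subq %rax, %rdx -> 7*q
  %r = sub i64 %n, %q7               ; subq %rdx, %rcx
  ret i64 %r
}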
+
+define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_rem7_4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: psrld $2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: psrld $1, %xmm2
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: psrld $2, %xmm2
+; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_rem7_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
+; SSE-LABEL: test_rem7_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT: pmulhuw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psubw %xmm1, %xmm2
+; SSE-NEXT: psrlw $1, %xmm2
+; SSE-NEXT: paddw %xmm1, %xmm2
+; SSE-NEXT: psrlw $2, %xmm2
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubw %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_rem7_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %res
+}
+
+define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_rem7_16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: pmullw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm1, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psubb %xmm3, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: paddb %xmm3, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm1
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm3
+; SSE41-NEXT: pmullw %xmm3, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: psubb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_rem7_16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %res = urem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll
new file mode 100644
index 000000000000..a1d356a0e762
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -0,0 +1,551 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; udiv by 7
+;
+
+define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
+; AVX1-LABEL: test_div7_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: shrq $2, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: shrq $2, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: shrq $2, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: shrq $2, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: shrq $2, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: shrq $2, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: shrq $2, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: shrq $2, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
+; AVX1-LABEL: test_div7_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: test_div7_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %res
+}
+
+define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_div7_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %res
+}
+
+;
+; urem by 7
+;
+
+define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
+; AVX1-LABEL: test_rem7_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: addq %rdx, %rax
+; AVX1-NEXT: shrq $2, %rax
+; AVX1-NEXT: leaq (,%rax,8), %rdx
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: addq %rdx, %rax
+; AVX1-NEXT: shrq $2, %rax
+; AVX1-NEXT: leaq (,%rax,8), %rdx
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: addq %rdx, %rax
+; AVX1-NEXT: shrq $2, %rax
+; AVX1-NEXT: leaq (,%rax,8), %rdx
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: addq %rdx, %rax
+; AVX1-NEXT: shrq $2, %rax
+; AVX1-NEXT: leaq (,%rax,8), %rdx
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: shrq $2, %rax
+; AVX2-NEXT: leaq (,%rax,8), %rdx
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: shrq $2, %rax
+; AVX2-NEXT: leaq (,%rax,8), %rdx
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: shrq $2, %rax
+; AVX2-NEXT: leaq (,%rax,8), %rdx
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: shrq $2, %rax
+; AVX2-NEXT: leaq (,%rax,8), %rdx
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = urem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
+; AVX1-LABEL: test_rem7_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm3
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
+; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: test_rem7_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = urem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %res
+}
+
+define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_rem7_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm6
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm7
+; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm3
+; AVX1-NEXT: vpmullw %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = urem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-512.ll b/test/CodeGen/X86/vector-idiv-udiv-512.ll
new file mode 100644
index 000000000000..35c902c5cc21
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -0,0 +1,2100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; udiv by 7
+;
+
+define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
+; AVX-LABEL: test_div7_8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = udiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ ret <8 x i64> %res
+}
+
+define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
+; AVX-LABEL: test_div7_16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrd $1, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm2
+; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $2, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = udiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i32> %res
+}
+
+define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
+; AVX512F-LABEL: test_div7_32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = udiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <32 x i16> %res
+}
+
+define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_div7_64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; AVX512F-NEXT: vpmullw %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm3[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
+; AVX512BW-NEXT: imull $37, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: subb %dl, %cl
+; AVX512BW-NEXT: shrb %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: shrb $2, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm2
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
+; AVX512BW-NEXT: imull $37, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: subb %dl, %cl
+; AVX512BW-NEXT: shrb %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: shrb $2, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
+; AVX512BW-NEXT: imull $37, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: subb %dl, %cl
+; AVX512BW-NEXT: shrb %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: shrb $2, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX512BW-NEXT: imull $37, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: subb %dl, %cl
+; AVX512BW-NEXT: shrb %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: shrb $2, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = udiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
+
+;
+; urem by 7
+;
+
+define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
+; AVX-LABEL: test_rem7_8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = urem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ ret <8 x i64> %res
+}
+
+define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
+; AVX-LABEL: test_rem7_16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrd $1, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl $2, %esi
+; AVX-NEXT: leal (,%rsi,8), %edx
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm2
+; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $2, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl $2, %esi
+; AVX-NEXT: leal (,%rsi,8), %edx
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl $2, %esi
+; AVX-NEXT: leal (,%rsi,8), %edx
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl $2, %esi
+; AVX-NEXT: leal (,%rsi,8), %edx
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = urem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i32> %res
+}
+
+define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
+; AVX512F-LABEL: test_rem7_32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = urem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <32 x i16> %res
+}
+
+define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_rem7_64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm5
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm5[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm5
+; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7
+; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm8
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX512F-NEXT: vpmullw %ymm3, %ymm8, %ymm8
+; AVX512F-NEXT: vpmovsxwd %ymm8, %zmm8
+; AVX512F-NEXT: vpmovdb %zmm8, %xmm8
+; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
+; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm7
+; AVX512F-NEXT: vpmovsxwd %ymm7, %zmm7
+; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
+; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm7
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm7, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmullw %ymm4, %ymm7, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm2[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrb $1, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movb $7, %cl
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %esi
+; AVX512BW-NEXT: imull $37, %esi, %edi
+; AVX512BW-NEXT: shrl $8, %edi
+; AVX512BW-NEXT: movl %esi, %eax
+; AVX512BW-NEXT: subb %dil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %dil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm2
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $2, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $3, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $4, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $5, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $6, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $7, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $8, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $9, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $10, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $11, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $12, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $13, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $14, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $15, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %esi
+; AVX512BW-NEXT: imull $37, %esi, %edi
+; AVX512BW-NEXT: shrl $8, %edi
+; AVX512BW-NEXT: movl %esi, %eax
+; AVX512BW-NEXT: subb %dil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %dil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %esi
+; AVX512BW-NEXT: imull $37, %esi, %edi
+; AVX512BW-NEXT: shrl $8, %edi
+; AVX512BW-NEXT: movl %esi, %eax
+; AVX512BW-NEXT: subb %dil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %dil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
+; AVX512BW-NEXT: imull $37, %esi, %edi
+; AVX512BW-NEXT: shrl $8, %edi
+; AVX512BW-NEXT: movl %esi, %eax
+; AVX512BW-NEXT: subb %dil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %dil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = urem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
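For reference, the test_rem7_* check lines above all expand the same constant-divisor trick per lane: multiply by a rounded-up reciprocal, keep the high half, fix the quotient up with ((n - t) >> 1 + t) >> 2, then subtract q*7. The following is a minimal scalar C sketch of that identity, not part of the test diff; only the 0x24924925 constant comes from the checks above, and the function name and driver loop are hypothetical.

#include <assert.h>
#include <stdint.h>

/* Scalar model of the urem-by-7 lowering verified above.
 * 0x24924925 = ceil(2^35/7) - 2^32 is the i32 magic from the AVX/AVX512
 * checks; 9363 and 37 play the same role at i16 and i8 width
 * (ceil(2^19/7) - 2^16 and ceil(2^11/7) - 2^8, respectively). */
static uint32_t urem7_u32(uint32_t n) {
    uint32_t t = (uint32_t)(((uint64_t)n * 0x24924925u) >> 32); /* mulhi */
    uint32_t q = (((n - t) >> 1) + t) >> 2;                     /* n / 7 */
    return n - q * 7u;                                          /* n % 7 */
}

int main(void) {
    for (uint64_t n = 0; n <= UINT32_MAX; n += 12345u)          /* spot-check */
        assert(urem7_u32((uint32_t)n) == (uint32_t)n % 7u);
    return 0;
}

The ((n - t) >> 1) + t step is used instead of (n + t) >> 1 because the magic constant carries an implicit extra high bit; subtracting first keeps the intermediate value within the lane width, which is why the generated code does subl/shrl/addl rather than a single wider add.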
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index 1117e206e5b0..6719a66f030f 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -1,1212 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41
-; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-target triple = "x86_64-unknown-unknown"
-
-define <4 x i32> @test1(<4 x i32> %a) #0 {
-; SSE41-LABEL: test1:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm2, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: psubd %xmm1, %xmm0
-; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm1, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test1:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: psubd %xmm2, %xmm0
-; SSE-NEXT: psrld $1, %xmm0
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: psrld $2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test1:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
- ret <4 x i32> %div
-}
-
-define <8 x i32> @test2(<8 x i32> %a) #0 {
-; SSE41-LABEL: test2:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmuludq %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; SSE41-NEXT: psubd %xmm5, %xmm0
-; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm5, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
-; SSE41-NEXT: pmuludq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: psrld $1, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm1
-; SSE41-NEXT: psrld $2, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test2:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE-NEXT: psubd %xmm3, %xmm0
-; SSE-NEXT: psrld $1, %xmm0
-; SSE-NEXT: paddd %xmm3, %xmm0
-; SSE-NEXT: psrld $2, %xmm0
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: psubd %xmm2, %xmm1
-; SSE-NEXT: psrld $1, %xmm1
-; SSE-NEXT: paddd %xmm2, %xmm1
-; SSE-NEXT: psrld $2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test2:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsrld $2, %ymm0, %ymm0
-; AVX-NEXT: retq
- %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
- ret <8 x i32> %div
-}
-
-define <8 x i16> @test3(<8 x i16> %a) #0 {
-; SSE41-LABEL: test3:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE41-NEXT: pmulhuw %xmm0, %xmm1
-; SSE41-NEXT: psubw %xmm1, %xmm0
-; SSE41-NEXT: psrlw $1, %xmm0
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: psrlw $2, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test3:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: psrlw $2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test3:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
- ret <8 x i16> %div
-}
-
-define <16 x i16> @test4(<16 x i16> %a) #0 {
-; SSE41-LABEL: test4:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pmulhuw %xmm2, %xmm3
-; SSE41-NEXT: psubw %xmm3, %xmm0
-; SSE41-NEXT: psrlw $1, %xmm0
-; SSE41-NEXT: paddw %xmm3, %xmm0
-; SSE41-NEXT: psrlw $2, %xmm0
-; SSE41-NEXT: pmulhuw %xmm1, %xmm2
-; SSE41-NEXT: psubw %xmm2, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test4:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmulhuw %xmm2, %xmm3
-; SSE-NEXT: psubw %xmm3, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm3, %xmm0
-; SSE-NEXT: psrlw $2, %xmm0
-; SSE-NEXT: pmulhuw %xmm1, %xmm2
-; SSE-NEXT: psubw %xmm2, %xmm1
-; SSE-NEXT: psrlw $1, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: psrlw $2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test4:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
-; AVX-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX-NEXT: retq
- %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
- ret <16 x i16> %div
-}
-
-define <8 x i16> @test5(<8 x i16> %a) #0 {
-; SSE41-LABEL: test5:
-; SSE41: # BB#0:
-; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $15, %xmm1
-; SSE41-NEXT: psraw $1, %xmm0
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test5:
-; SSE: # BB#0:
-; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $15, %xmm1
-; SSE-NEXT: psraw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test5:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
- ret <8 x i16> %div
-}
-
-define <16 x i16> @test6(<16 x i16> %a) #0 {
-; SSE41-LABEL: test6:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
-; SSE41-NEXT: pmulhw %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlw $15, %xmm3
-; SSE41-NEXT: psraw $1, %xmm0
-; SSE41-NEXT: paddw %xmm3, %xmm0
-; SSE41-NEXT: pmulhw %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $15, %xmm2
-; SSE41-NEXT: psraw $1, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test6:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
-; SSE-NEXT: pmulhw %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlw $15, %xmm3
-; SSE-NEXT: psraw $1, %xmm0
-; SSE-NEXT: paddw %xmm3, %xmm0
-; SSE-NEXT: pmulhw %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlw $15, %xmm2
-; SSE-NEXT: psraw $1, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test6:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vpsrlw $15, %ymm0, %ymm1
-; AVX-NEXT: vpsraw $1, %ymm0, %ymm0
-; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
- %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
- ret <16 x i16> %div
-}
-
-define <16 x i8> @test7(<16 x i8> %a) #0 {
-; SSE41-LABEL: test7:
-; SSE41: # BB#0:
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pextrb $0, %xmm0, %ecx
-; SSE41-NEXT: movsbl %cl, %ecx
-; SSE41-NEXT: imull $-109, %ecx, %edx
-; SSE41-NEXT: shrl $8, %edx
-; SSE41-NEXT: addb %dl, %cl
-; SSE41-NEXT: movb %cl, %dl
-; SSE41-NEXT: shrb $7, %dl
-; SSE41-NEXT: sarb $2, %cl
-; SSE41-NEXT: addb %dl, %cl
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrb $1, %eax, %xmm1
-; SSE41-NEXT: pextrb $2, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $2, %eax, %xmm1
-; SSE41-NEXT: pextrb $3, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $3, %eax, %xmm1
-; SSE41-NEXT: pextrb $4, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm1
-; SSE41-NEXT: pextrb $5, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm1
-; SSE41-NEXT: pextrb $6, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm1
-; SSE41-NEXT: pextrb $7, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm1
-; SSE41-NEXT: pextrb $8, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm1
-; SSE41-NEXT: pextrb $9, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm1
-; SSE41-NEXT: pextrb $10, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm1
-; SSE41-NEXT: pextrb $11, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $11, %eax, %xmm1
-; SSE41-NEXT: pextrb $12, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; SSE41-NEXT: pextrb $13, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $13, %eax, %xmm1
-; SSE41-NEXT: pextrb $14, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm1
-; SSE41-NEXT: pextrb $15, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test7:
-; SSE: # BB#0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: imull $-109, %eax, %ecx
-; SSE-NEXT: shrl $8, %ecx
-; SSE-NEXT: addb %al, %cl
-; SSE-NEXT: movb %cl, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %cl
-; SSE-NEXT: addb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r14d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edx
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r9d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r11d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r8d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi
-; SSE-NEXT: imull $-109, %esi, %edi
-; SSE-NEXT: shrl $8, %edi
-; SSE-NEXT: addb %sil, %dil
-; SSE-NEXT: movb %dil, %bl
-; SSE-NEXT: shrb $7, %bl
-; SSE-NEXT: sarb $2, %dil
-; SSE-NEXT: addb %bl, %dil
-; SSE-NEXT: movzbl %dil, %esi
-; SSE-NEXT: movd %esi, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: imull $-109, %eax, %esi
-; SSE-NEXT: shrl $8, %esi
-; SSE-NEXT: addb %al, %sil
-; SSE-NEXT: movb %sil, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %sil
-; SSE-NEXT: addb %al, %sil
-; SSE-NEXT: movzbl %sil, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ebp
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r10d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edi
-; SSE-NEXT: imull $-109, %edi, %ebx
-; SSE-NEXT: shrl $8, %ebx
-; SSE-NEXT: addb %dil, %bl
-; SSE-NEXT: movb %bl, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %bl
-; SSE-NEXT: addb %al, %bl
-; SSE-NEXT: movzbl %bl, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: imull $-109, %edx, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: imull $-109, %esi, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %sil, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT: imull $-109, %ecx, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: imull $-109, %eax, %edx
-; SSE-NEXT: shrl $8, %edx
-; SSE-NEXT: addb %al, %dl
-; SSE-NEXT: movb %dl, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %dl
-; SSE-NEXT: addb %al, %dl
-; SSE-NEXT: movzbl %dl, %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: imull $-109, %r14d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r14b, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: imull $-109, %ebp, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %bpl, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: imull $-109, %r11d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r11b, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: imull $-109, %ecx, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: imull $-109, %r9d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r9b, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: imull $-109, %r10d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r10b, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: imull $-109, %r8d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r8b, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: imull $-109, %eax, %ecx
-; SSE-NEXT: shrl $8, %ecx
-; SSE-NEXT: addb %al, %cl
-; SSE-NEXT: movb %cl, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %cl
-; SSE-NEXT: addb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test7:
-; AVX: # BB#0:
-; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: movsbl %al, %eax
-; AVX-NEXT: imull $-109, %eax, %ecx
-; AVX-NEXT: shrl $8, %ecx
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: movb %al, %cl
-; AVX-NEXT: shrb $7, %cl
-; AVX-NEXT: sarb $2, %al
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %dl
-; AVX-NEXT: shrb $7, %dl
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movzbl %cl, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
- ret <16 x i8> %div
-}
-
-define <4 x i32> @test8(<4 x i32> %a) #0 {
-; SSE41-LABEL: test8:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm2, %xmm3
-; SSE41-NEXT: pmuldq %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $31, %xmm0
-; SSE41-NEXT: psrad $2, %xmm1
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test8:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: psubd %xmm2, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $31, %xmm0
-; SSE-NEXT: psrad $2, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test8:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrld $31, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
- ret <4 x i32> %div
-}
-
-define <8 x i32> @test9(<8 x i32> %a) #0 {
-; SSE41-LABEL: test9:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pmuldq %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psrld $31, %xmm0
-; SSE41-NEXT: psrad $2, %xmm2
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm4, %xmm0
-; SSE41-NEXT: pmuldq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
-; SSE41-NEXT: paddd %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: psrld $31, %xmm0
-; SSE41-NEXT: psrad $2, %xmm3
-; SSE41-NEXT: paddd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test9:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: psrad $31, %xmm5
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: paddd %xmm0, %xmm5
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pmuludq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm6, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: psubd %xmm5, %xmm0
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrld $31, %xmm2
-; SSE-NEXT: psrad $2, %xmm0
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: psrad $31, %xmm5
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: paddd %xmm4, %xmm5
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm6, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: psubd %xmm5, %xmm2
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: psrld $31, %xmm1
-; SSE-NEXT: psrad $2, %xmm2
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test9:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vpsrld $31, %ymm0, %ymm1
-; AVX-NEXT: vpsrad $2, %ymm0, %ymm0
-; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
- %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
- ret <8 x i32> %div
-}
-
-define <8 x i32> @test10(<8 x i32> %a) #0 {
-; SSE41-LABEL: test10:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmuludq %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psubd %xmm5, %xmm4
-; SSE41-NEXT: psrld $1, %xmm4
-; SSE41-NEXT: paddd %xmm5, %xmm4
-; SSE41-NEXT: psrld $2, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7]
-; SSE41-NEXT: pmulld %xmm5, %xmm4
-; SSE41-NEXT: psubd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
-; SSE41-NEXT: pmuludq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psubd %xmm2, %xmm3
-; SSE41-NEXT: psrld $1, %xmm3
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: psrld $2, %xmm3
-; SSE41-NEXT: pmulld %xmm5, %xmm3
-; SSE41-NEXT: psubd %xmm3, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test10:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pmuludq %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: psubd %xmm2, %xmm5
-; SSE-NEXT: psrld $1, %xmm5
-; SSE-NEXT: paddd %xmm2, %xmm5
-; SSE-NEXT: psrld $2, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE-NEXT: psubd %xmm5, %xmm0
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psubd %xmm3, %xmm4
-; SSE-NEXT: psrld $1, %xmm4
-; SSE-NEXT: paddd %xmm3, %xmm4
-; SSE-NEXT: psrld $2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: psubd %xmm4, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test10:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2
-; AVX-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vpsrld $2, %ymm1, %ymm1
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
- %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
- ret <8 x i32> %rem
-}
-
-define <8 x i32> @test11(<8 x i32> %a) #0 {
-; SSE41-LABEL: test11:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmuldq %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: psrld $31, %xmm4
-; SSE41-NEXT: psrad $2, %xmm5
-; SSE41-NEXT: paddd %xmm4, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
-; SSE41-NEXT: pmulld %xmm4, %xmm5
-; SSE41-NEXT: psubd %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm3, %xmm5
-; SSE41-NEXT: pmuldq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; SSE41-NEXT: paddd %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrld $31, %xmm3
-; SSE41-NEXT: psrad $2, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm2
-; SSE41-NEXT: pmulld %xmm4, %xmm2
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test11:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: psrad $31, %xmm6
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: paddd %xmm4, %xmm6
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; SSE-NEXT: psubd %xmm6, %xmm7
-; SSE-NEXT: paddd %xmm0, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: psrld $31, %xmm4
-; SSE-NEXT: psrad $2, %xmm7
-; SSE-NEXT: paddd %xmm4, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; SSE-NEXT: psubd %xmm7, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: psrad $31, %xmm6
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: paddd %xmm3, %xmm6
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: psubd %xmm6, %xmm2
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrld $31, %xmm3
-; SSE-NEXT: psrad $2, %xmm2
-; SSE-NEXT: paddd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: psubd %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test11:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1
-; AVX-NEXT: vpsrld $31, %ymm1, %ymm2
-; AVX-NEXT: vpsrad $2, %ymm1, %ymm1
-; AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
- %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
- ret <8 x i32> %rem
-}
-
-define <2 x i16> @test12() #0 {
-; SSE41-LABEL: test12:
-; SSE41: # BB#0:
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test12:
+define <2 x i16> @test_urem_unary_v2i16() nounwind {
+; SSE-LABEL: test_urem_unary_v2i16:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test12:
+; AVX-LABEL: test_urem_unary_v2i16:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1216,7 +20,30 @@ define <2 x i16> @test12() #0 {
ret <2 x i16> %B9
}
-define <4 x i32> @PR20355(<4 x i32> %a) #0 {
+define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
+; SSE2-LABEL: PR20355:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT: psubd %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: psrld $31, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: PR20355:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
@@ -1231,44 +58,32 @@ define <4 x i32> @PR20355(<4 x i32> %a) #0 {
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; SSE-LABEL: PR20355:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: paddd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE-NEXT: psubd %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: psrld $31, %xmm0
-; SSE-NEXT: paddd %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: PR20355:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX-NEXT: vpsrld $31, %xmm0, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: PR20355:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR20355:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
entry:
%sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
ret <4 x i32> %sdiv
}
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index 8bf0af68e6dc..06c785575339 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -1,11 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VLCD --check-prefix=ALL --check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=AVX512CD --check-prefix=ALL --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
@@ -101,8 +105,40 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; AVX512CD-LABEL: testv2i64:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv2i64:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pushl %esi
+; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: movl $63, %ecx
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: addl $32, %eax
+; X32-SSE-NEXT: pextrd $3, %xmm0, %edx
+; X32-SSE-NEXT: bsrl %edx, %esi
+; X32-SSE-NEXT: xorl $31, %esi
+; X32-SSE-NEXT: testl %edx, %edx
+; X32-SSE-NEXT: cmovel %eax, %esi
+; X32-SSE-NEXT: movd %esi, %xmm1
+; X32-SSE-NEXT: movd %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: addl $32, %eax
+; X32-SSE-NEXT: pextrd $1, %xmm0, %ecx
+; X32-SSE-NEXT: bsrl %ecx, %edx
+; X32-SSE-NEXT: xorl $31, %edx
+; X32-SSE-NEXT: testl %ecx, %ecx
+; X32-SSE-NEXT: cmovel %eax, %edx
+; X32-SSE-NEXT: movd %edx, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
ret <2 x i64> %out
@@ -187,8 +223,35 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; AVX512CD-LABEL: testv2i64u:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv2i64u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %ecx
+; X32-SSE-NEXT: xorl $31, %ecx
+; X32-SSE-NEXT: pextrd $2, %xmm0, %edx
+; X32-SSE-NEXT: bsrl %edx, %edx
+; X32-SSE-NEXT: xorl $31, %edx
+; X32-SSE-NEXT: addl $32, %edx
+; X32-SSE-NEXT: testl %eax, %eax
+; X32-SSE-NEXT: cmovnel %ecx, %edx
+; X32-SSE-NEXT: movd %edx, %xmm1
+; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %ecx
+; X32-SSE-NEXT: xorl $31, %ecx
+; X32-SSE-NEXT: movd %xmm0, %edx
+; X32-SSE-NEXT: bsrl %edx, %edx
+; X32-SSE-NEXT: xorl $31, %edx
+; X32-SSE-NEXT: addl $32, %edx
+; X32-SSE-NEXT: testl %eax, %eax
+; X32-SSE-NEXT: cmovnel %ecx, %edx
+; X32-SSE-NEXT: movd %edx, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1)
ret <2 x i64> %out
@@ -349,8 +412,36 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; AVX512CD-LABEL: testv4i32:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv4i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: movl $63, %ecx
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: movd %xmm0, %edx
+; X32-SSE-NEXT: bsrl %edx, %edx
+; X32-SSE-NEXT: cmovel %ecx, %edx
+; X32-SSE-NEXT: xorl $31, %edx
+; X32-SSE-NEXT: movd %edx, %xmm1
+; X32-SSE-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: pinsrd $2, %eax, %xmm1
+; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0)
ret <4 x i32> %out
@@ -486,8 +577,31 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; AVX512CD-LABEL: testv4i32u:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv4i32u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: movd %xmm0, %ecx
+; X32-SSE-NEXT: bsrl %ecx, %ecx
+; X32-SSE-NEXT: xorl $31, %ecx
+; X32-SSE-NEXT: movd %ecx, %xmm1
+; X32-SSE-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: pinsrd $2, %eax, %xmm1
+; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1)
ret <4 x i32> %out
@@ -600,150 +714,75 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pextrw $7, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %cx
-; SSSE3-NEXT: movw $31, %ax
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: pextrw $3, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: pextrw $5, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: pextrw $1, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: pextrw $6, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: pextrw $2, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: pextrw $4, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movd %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: paddb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm1
+; SSSE3-NEXT: paddw %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrw $1, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %cx
-; SSE41-NEXT: movw $31, %ax
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: movd %xmm0, %edx
-; SSE41-NEXT: bsrw %dx, %dx
-; SSE41-NEXT: cmovew %ax, %dx
-; SSE41-NEXT: xorl $15, %edx
-; SSE41-NEXT: movd %edx, %xmm1
-; SSE41-NEXT: pinsrw $1, %ecx, %xmm1
-; SSE41-NEXT: pextrw $2, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $2, %ecx, %xmm1
-; SSE41-NEXT: pextrw $3, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $3, %ecx, %xmm1
-; SSE41-NEXT: pextrw $4, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $4, %ecx, %xmm1
-; SSE41-NEXT: pextrw $5, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $5, %ecx, %xmm1
-; SSE41-NEXT: pextrw $6, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $6, %ecx, %xmm1
-; SSE41-NEXT: pextrw $7, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $7, %ecx, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
; AVX: # BB#0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %cx
-; AVX-NEXT: movw $31, %ax
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vmovd %xmm0, %edx
-; AVX-NEXT: bsrw %dx, %dx
-; AVX-NEXT: cmovew %ax, %dx
-; AVX-NEXT: xorl $15, %edx
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i16:
; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxwd %xmm0, %ymm0
+; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
@@ -756,6 +795,30 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv8i16:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm1
+; X32-SSE-NEXT: paddw %xmm0, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
ret <8 x i16> %out
}
@@ -849,123 +912,75 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; SSSE3-LABEL: testv8i16u:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pextrw $7, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: pextrw $3, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: pextrw $5, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: pextrw $1, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: pextrw $6, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movd %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: paddb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm1
+; SSSE3-NEXT: paddw %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16u:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrw $1, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: movd %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrw $1, %eax, %xmm1
-; SSE41-NEXT: pextrw $2, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $2, %eax, %xmm1
-; SSE41-NEXT: pextrw $3, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $3, %eax, %xmm1
-; SSE41-NEXT: pextrw $4, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $4, %eax, %xmm1
-; SSE41-NEXT: pextrw $5, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $5, %eax, %xmm1
-; SSE41-NEXT: pextrw $6, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $6, %eax, %xmm1
-; SSE41-NEXT: pextrw $7, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $7, %eax, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16u:
; AVX: # BB#0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i16u:
; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxwd %xmm0, %ymm0
+; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
@@ -978,6 +993,30 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv8i16u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm1
+; X32-SSE-NEXT: paddw %xmm0, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
ret <8 x i16> %out
}
@@ -1195,295 +1234,80 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pushq %rbp
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %ecx
-; SSSE3-NEXT: movl $15, %eax
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %edx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSSE3-NEXT: bsrl %ebp, %ebp
-; SSSE3-NEXT: cmovel %eax, %ebp
-; SSSE3-NEXT: xorl $7, %ebp
-; SSSE3-NEXT: movd %ebp, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: bsrl %edi, %edi
-; SSSE3-NEXT: cmovel %eax, %edi
-; SSSE3-NEXT: xorl $7, %edi
-; SSSE3-NEXT: movd %edi, %xmm1
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: bsrl %esi, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %ebx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %edx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r11d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %esi, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSSE3-NEXT: bsrl %r9d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %r10d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r8d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: popq %rbx
-; SSSE3-NEXT: popq %rbp
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: paddb %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %ecx
-; SSE41-NEXT: movl $15, %eax
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pextrb $0, %xmm0, %edx
-; SSE41-NEXT: bsrl %edx, %edx
-; SSE41-NEXT: cmovel %eax, %edx
-; SSE41-NEXT: xorl $7, %edx
-; SSE41-NEXT: movd %edx, %xmm1
-; SSE41-NEXT: pinsrb $1, %ecx, %xmm1
-; SSE41-NEXT: pextrb $2, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm1
-; SSE41-NEXT: pextrb $3, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $3, %ecx, %xmm1
-; SSE41-NEXT: pextrb $4, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $4, %ecx, %xmm1
-; SSE41-NEXT: pextrb $5, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $5, %ecx, %xmm1
-; SSE41-NEXT: pextrb $6, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $6, %ecx, %xmm1
-; SSE41-NEXT: pextrb $7, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $7, %ecx, %xmm1
-; SSE41-NEXT: pextrb $8, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $8, %ecx, %xmm1
-; SSE41-NEXT: pextrb $9, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $9, %ecx, %xmm1
-; SSE41-NEXT: pextrb $10, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $10, %ecx, %xmm1
-; SSE41-NEXT: pextrb $11, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $11, %ecx, %xmm1
-; SSE41-NEXT: pextrb $12, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $12, %ecx, %xmm1
-; SSE41-NEXT: pextrb $13, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $13, %ecx, %xmm1
-; SSE41-NEXT: pextrb $14, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $14, %ecx, %xmm1
-; SSE41-NEXT: pextrb $15, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $15, %ecx, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pshufb %xmm3, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %ecx
-; AVX-NEXT: movl $15, %eax
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpextrb $0, %xmm0, %edx
-; AVX-NEXT: bsrl %edx, %edx
-; AVX-NEXT: cmovel %eax, %edx
-; AVX-NEXT: xorl $7, %edx
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512VLCD-LABEL: testv16i8:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxbd %xmm0, %zmm0
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VLCD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv16i8:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vpmovzxbd %xmm0, %zmm0
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv16i8:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE-LABEL: testv16i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm4, %xmm2
+; X32-SSE-NEXT: pshufb %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
ret <16 x i8> %out
}
@@ -1663,242 +1487,80 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; SSSE3-LABEL: testv16i8u:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: bsrl %esi, %esi
-; SSSE3-NEXT: xorl $7, %esi
-; SSSE3-NEXT: movd %esi, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT: bsrl %ebx, %ebx
-; SSSE3-NEXT: xorl $7, %ebx
-; SSSE3-NEXT: movd %ebx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: bsrl %edx, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: bsrl %esi, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: bsrl %edx, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: bsrl %edi, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r10d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: bsrl %ecx, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: bsrl %r9d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: bsrl %r11d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r8d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: popq %rbx
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: paddb %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8u:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pextrb $0, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrb $1, %eax, %xmm1
-; SSE41-NEXT: pextrb $2, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $2, %eax, %xmm1
-; SSE41-NEXT: pextrb $3, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $3, %eax, %xmm1
-; SSE41-NEXT: pextrb $4, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm1
-; SSE41-NEXT: pextrb $5, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm1
-; SSE41-NEXT: pextrb $6, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm1
-; SSE41-NEXT: pextrb $7, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm1
-; SSE41-NEXT: pextrb $8, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm1
-; SSE41-NEXT: pextrb $9, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm1
-; SSE41-NEXT: pextrb $10, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm1
-; SSE41-NEXT: pextrb $11, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $11, %eax, %xmm1
-; SSE41-NEXT: pextrb $12, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; SSE41-NEXT: pextrb $13, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $13, %eax, %xmm1
-; SSE41-NEXT: pextrb $14, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm1
-; SSE41-NEXT: pextrb $15, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pshufb %xmm3, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8u:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $2, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $3, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $4, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $5, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $6, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $7, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $8, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $9, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $10, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $11, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $12, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $13, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $14, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $15, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512VLCD-LABEL: testv16i8u:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxbd %xmm0, %zmm0
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VLCD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv16i8u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vpmovzxbd %xmm0, %zmm0
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv16i8u:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE-LABEL: testv16i8u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm4, %xmm2
+; X32-SSE-NEXT: pshufb %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
ret <16 x i8> %out
}
@@ -1916,17 +1578,17 @@ define <2 x i64> @foldv2i64() nounwind {
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
;
-; AVX512VLCD-LABEL: foldv2i64:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: movl $55, %eax
-; AVX512VLCD-NEXT: vmovq %rax, %xmm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: foldv2i64:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: movl $55, %eax
-; AVX512CD-NEXT: vmovq %rax, %xmm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: foldv2i64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: movl $55, %eax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE-LABEL: foldv2i64:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl $55, %eax
+; X32-SSE-NEXT: movd %eax, %xmm0
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
ret <2 x i64> %out
}
@@ -1944,17 +1606,17 @@ define <2 x i64> @foldv2i64u() nounwind {
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
;
-; AVX512VLCD-LABEL: foldv2i64u:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: movl $55, %eax
-; AVX512VLCD-NEXT: vmovq %rax, %xmm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: foldv2i64u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: movl $55, %eax
-; AVX512CD-NEXT: vmovq %rax, %xmm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: foldv2i64u:
+; AVX512: ## BB#0:
+; AVX512-NEXT: movl $55, %eax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE-LABEL: foldv2i64u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl $55, %eax
+; X32-SSE-NEXT: movd %eax, %xmm0
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
ret <2 x i64> %out
}
@@ -1979,6 +1641,11 @@ define <4 x i32> @foldv4i32() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv4i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
ret <4 x i32> %out
}
@@ -2003,6 +1670,11 @@ define <4 x i32> @foldv4i32u() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv4i32u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
ret <4 x i32> %out
}
@@ -2027,6 +1699,11 @@ define <8 x i16> @foldv8i16() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv8i16:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
ret <8 x i16> %out
}
@@ -2051,6 +1728,11 @@ define <8 x i16> @foldv8i16u() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv8i16u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
ret <8 x i16> %out
}
@@ -2075,6 +1757,11 @@ define <16 x i8> @foldv16i8() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv16i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
ret <16 x i8> %out
}
@@ -2099,6 +1786,11 @@ define <16 x i8> @foldv16i8u() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv16i8u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
ret <16 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll
index 1608bf53748d..ed31e49cb07c 100644
--- a/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s --check-prefix=AVX512VLCD --check-prefix=ALL --check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=AVX512CD --check-prefix=ALL --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
@@ -12,55 +12,55 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: movl $127, %ecx
; AVX1-NEXT: cmoveq %rcx, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
-; AVX1-NEXT: xorq $63, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %rax, %xmm3
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: movl $127, %ecx
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64:
@@ -70,7 +70,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 0)
@@ -83,47 +85,52 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
-; AVX1-NEXT: xorq $63, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %rax, %xmm3
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64u:
@@ -133,7 +140,9 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 -1)
@@ -148,91 +157,64 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: bsrl %eax, %ecx
; AVX1-NEXT: movl $63, %eax
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vmovd %xmm1, %edx
; AVX1-NEXT: bsrl %edx, %edx
; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $31, %edx
; AVX1-NEXT: vmovd %edx, %xmm2
; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vmovd %xmm0, %edx
; AVX1-NEXT: bsrl %edx, %edx
; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $31, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %edx, %xmm3
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $3, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
-; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0
+; AVX1-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %ecx
-; AVX2-NEXT: movl $63, %eax
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $31, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $31, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32:
@@ -242,7 +224,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 0)
@@ -255,75 +239,57 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
; AVX1-NEXT: vmovd %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
; AVX1-NEXT: vmovd %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $31, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $3, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vmovd %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32u:
@@ -333,7 +299,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 -1)
@@ -344,192 +312,65 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %cx
-; AVX1-NEXT: movw $31, %ax
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: bsrw %dx, %dx
-; AVX1-NEXT: cmovew %ax, %dx
-; AVX1-NEXT: xorl $15, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: bsrw %dx, %dx
-; AVX1-NEXT: cmovew %ax, %dx
-; AVX1-NEXT: xorl $15, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %cx
-; AVX2-NEXT: movw $31, %ax
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: bsrw %dx, %dx
-; AVX2-NEXT: cmovew %ax, %dx
-; AVX2-NEXT: xorl $15, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: bsrw %dx, %dx
-; AVX2-NEXT: cmovew %ax, %dx
-; AVX2-NEXT: xorl $15, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VLCD-LABEL: testv16i16:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxwd %ymm0, %zmm0
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv16i16:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vpmovzxwd %ymm0, %zmm0
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv16i16:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 0)
ret <16 x i16> %out
}
@@ -538,158 +379,65 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vmovd %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VLCD-LABEL: testv16i16u:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxwd %ymm0, %zmm0
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv16i16u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vpmovzxwd %ymm0, %zmm0
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv16i16u:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 -1)
ret <16 x i16> %out
}
@@ -698,346 +446,52 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %ecx
-; AVX1-NEXT: movl $15, %eax
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm1, %edx
-; AVX1-NEXT: bsrl %edx, %edx
-; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $7, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm0, %edx
-; AVX1-NEXT: bsrl %edx, %edx
-; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $7, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %ecx
-; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm1, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $7, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm0, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $7, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv32i8:
; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
@@ -1047,12 +501,12 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512CD-LABEL: testv32i8:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512CD-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512CD-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
@@ -1066,280 +520,52 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv32i8u:
; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
@@ -1349,12 +575,12 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512CD-LABEL: testv32i8u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512CD-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512CD-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll
index 20ea86e5d439..4014cfd7ba2c 100644
--- a/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -1,5 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; ALL-LABEL: testv8i64:
@@ -38,28 +39,28 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
}
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16:
-; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxwd %ymm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; ALL-NEXT: vpsubw %ymm2, %ymm0, %ymm0
-; ALL-NEXT: vpmovzxwd %ymm1, %zmm1
-; ALL-NEXT: vplzcntd %zmm1, %zmm1
-; ALL-NEXT: vpmovdw %zmm1, %ymm1
-; ALL-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv32i16:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmovzxwd %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vplzcntd %zmm1, %zmm1
; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovzxwd %ymm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512BW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpsubw %ymm2, %ymm0, %ymm0
@@ -70,28 +71,28 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
}
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxwd %ymm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; ALL-NEXT: vpsubw %ymm2, %ymm0, %ymm0
-; ALL-NEXT: vpmovzxwd %ymm1, %zmm1
-; ALL-NEXT: vplzcntd %zmm1, %zmm1
-; ALL-NEXT: vpmovdw %zmm1, %ymm1
-; ALL-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv32i16u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
;
; AVX512BW-LABEL: testv32i16u:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmovzxwd %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vplzcntd %zmm1, %zmm1
; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovzxwd %ymm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512BW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpsubw %ymm2, %ymm0, %ymm0
@@ -102,51 +103,51 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8:
-; ALL: ## BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm2, %zmm2
-; ALL-NEXT: vplzcntd %zmm2, %zmm2
-; ALL-NEXT: vpmovdb %zmm2, %xmm2
-; ALL-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; ALL-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpmovdb %zmm0, %xmm0
-; ALL-NEXT: vpsubb %xmm3, %xmm0, %xmm0
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm2, %zmm2
-; ALL-NEXT: vplzcntd %zmm2, %zmm2
-; ALL-NEXT: vpmovdb %zmm2, %xmm2
-; ALL-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm1, %zmm1
-; ALL-NEXT: vplzcntd %zmm1, %zmm1
-; ALL-NEXT: vpmovdb %zmm1, %xmm1
-; ALL-NEXT: vpsubb %xmm3, %xmm1, %xmm1
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv64i8:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm1, %zmm1
; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpsubb %xmm3, %xmm0, %xmm0
@@ -158,51 +159,51 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
}
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8u:
-; ALL: ## BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm2, %zmm2
-; ALL-NEXT: vplzcntd %zmm2, %zmm2
-; ALL-NEXT: vpmovdb %zmm2, %xmm2
-; ALL-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; ALL-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpmovdb %zmm0, %xmm0
-; ALL-NEXT: vpsubb %xmm3, %xmm0, %xmm0
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm2, %zmm2
-; ALL-NEXT: vplzcntd %zmm2, %zmm2
-; ALL-NEXT: vpmovdb %zmm2, %xmm2
-; ALL-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm1, %zmm1
-; ALL-NEXT: vplzcntd %zmm1, %zmm1
-; ALL-NEXT: vpmovdb %zmm1, %xmm1
-; ALL-NEXT: vpsubb %xmm3, %xmm1, %xmm1
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv64i8u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
;
; AVX512BW-LABEL: testv64i8u:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm1, %zmm1
; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpsubb %xmm3, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
new file mode 100644
index 000000000000..0718edf5a143
--- /dev/null
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -0,0 +1,495 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; Lower common integer comparisons such as 'isPositive' efficiently:
+; https://llvm.org/bugs/show_bug.cgi?id=26701
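+;
+; Descriptive note (grounded in the CHECK lines below): each test uses the IR
+; idiom 'ashr %x, (element bits - 1)' followed by 'xor ..., -1'. The shift
+; fills every lane with its sign bit and the xor inverts it, which is
+; equivalent to 'icmp sgt %x, -1' ("is positive"). Targets with a native
+; pcmpgt for the element type can therefore lower the pair to a single
+; compare against an all-ones vector, as the checks expect.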
+
+define <16 x i8> @test_pcmpgtb(<16 x i8> %x) {
+; SSE-LABEL: test_pcmpgtb:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_pcmpgtb:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %sign = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %not = xor <16 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <16 x i8> %not
+}
+
+define <8 x i16> @test_pcmpgtw(<8 x i16> %x) {
+; SSE-LABEL: test_pcmpgtw:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_pcmpgtw:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %sign = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %not = xor <8 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <8 x i16> %not
+}
+
+define <4 x i32> @test_pcmpgtd(<4 x i32> %x) {
+; SSE-LABEL: test_pcmpgtd:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_pcmpgtd:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %sign = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+ %not = xor <4 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %not
+}
+
+define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
+; SSE2-LABEL: test_pcmpgtq:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_pcmpgtq:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: test_pcmpgtq:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %sign = ashr <2 x i64> %x, <i64 63, i64 63>
+ %not = xor <2 x i64> %sign, <i64 -1, i64 -1>
+ ret <2 x i64> %not
+}
+
+define <1 x i128> @test_strange_type(<1 x i128> %x) {
+; SSE2-LABEL: test_strange_type:
+; SSE2: # BB#0:
+; SSE2-NEXT: sarq $63, %rsi
+; SSE2-NEXT: movd %rsi, %xmm0
+; SSE2-NEXT: notq %rsi
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: movq %rsi, %rdx
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_strange_type:
+; SSE42: # BB#0:
+; SSE42-NEXT: sarq $63, %rsi
+; SSE42-NEXT: movd %rsi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: pxor %xmm0, %xmm1
+; SSE42-NEXT: movd %xmm1, %rax
+; SSE42-NEXT: pextrq $1, %xmm1, %rdx
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_strange_type:
+; AVX1: # BB#0:
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_strange_type:
+; AVX2: # BB#0:
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vmovq %rsi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: retq
+;
+ %sign = ashr <1 x i128> %x, <i128 127>
+ %not = xor <1 x i128> %sign, <i128 -1>
+ ret <1 x i128> %not
+}
+
+define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) {
+; SSE-LABEL: test_pcmpgtb_256:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_pcmpgtb_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_pcmpgtb_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %sign = ashr <32 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %not = xor <32 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <32 x i8> %not
+}
+
+define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) {
+; SSE-LABEL: test_pcmpgtw_256:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE-NEXT: pcmpgtw %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_pcmpgtw_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_pcmpgtw_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %sign = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %not = xor <16 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <16 x i16> %not
+}
+
+define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) {
+; SSE-LABEL: test_pcmpgtd_256:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_pcmpgtd_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_pcmpgtd_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %sign = ashr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %not = xor <8 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <8 x i32> %not
+}
+
+define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) {
+; SSE2-LABEL: test_pcmpgtq_256:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_pcmpgtq_256:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_pcmpgtq_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_pcmpgtq_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %sign = ashr <4 x i64> %x, <i64 63, i64 63, i64 63, i64 63>
+ %not = xor <4 x i64> %sign, <i64 -1, i64 -1, i64 -1, i64 -1>
+ ret <4 x i64> %not
+}
+
+define <16 x i8> @cmpeq_zext_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: cmpeq_zext_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmpeq_zext_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp eq <16 x i8> %a, %b
+ %zext = zext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %zext
+}
+
+define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; SSE-LABEL: cmpeq_zext_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
+; SSE-NEXT: pcmpeqw %xmm3, %xmm1
+; SSE-NEXT: psrlw $15, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: cmpeq_zext_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cmpeq_zext_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %cmp = icmp eq <16 x i16> %a, %b
+ %zext = zext <16 x i1> %cmp to <16 x i16>
+ ret <16 x i16> %zext
+}
+
+define <4 x i32> @cmpeq_zext_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: cmpeq_zext_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmpeq_zext_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %zext = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %zext
+}
+
+define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: cmpeq_zext_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: cmpeq_zext_v4i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT: psrlq $63, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm3, %xmm1
+; SSE42-NEXT: psrlq $63, %xmm1
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: cmpeq_zext_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cmpeq_zext_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %cmp = icmp eq <4 x i64> %a, %b
+ %zext = zext <4 x i1> %cmp to <4 x i64>
+ ret <4 x i64> %zext
+}
+
+define <32 x i8> @cmpgt_zext_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; SSE-LABEL: cmpgt_zext_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: cmpgt_zext_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cmpgt_zext_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %cmp = icmp sgt <32 x i8> %a, %b
+ %zext = zext <32 x i1> %cmp to <32 x i8>
+ ret <32 x i8> %zext
+}
+
+define <8 x i16> @cmpgt_zext_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: cmpgt_zext_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmpgt_zext_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp sgt <8 x i16> %a, %b
+ %zext = zext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %zext
+}
+
+define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: cmpgt_zext_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
+; SSE-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE-NEXT: psrld $31, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: cmpgt_zext_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cmpgt_zext_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %cmp = icmp sgt <8 x i32> %a, %b
+ %zext = zext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %zext
+}
+
+define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: cmpgt_zext_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: cmpgt_zext_v2i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: psrlq $63, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: cmpgt_zext_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp sgt <2 x i64> %a, %b
+ %zext = zext <2 x i1> %cmp to <2 x i64>
+ ret <2 x i64> %zext
+}
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index 54b7af6830c0..cf4f21e62b61 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -1,156 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
-; ALL-LABEL: testv8i64:
-; ALL: ## BB#0:
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ALL-NEXT: vpextrq $1, %xmm1, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vmovq %xmm1, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm1
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; ALL-NEXT: vpextrq $1, %xmm2, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm2, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; ALL-NEXT: vpextrq $1, %xmm2, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm2, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vpextrq $1, %xmm0, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm0, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm0
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
+; AVX512F-LABEL: testv8i64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
+; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: testv8i64:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in)
ret <8 x i64> %out
}
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
-; ALL-LABEL: testv16i32:
-; ALL: ## BB#0:
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ALL-NEXT: vpextrd $1, %xmm1, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm1, %ecx
-; ALL-NEXT: popcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm2
-; ALL-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; ALL-NEXT: vpextrd $2, %xmm1, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; ALL-NEXT: vpextrd $3, %xmm1, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm2, %ecx
-; ALL-NEXT: popcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm2, %ecx
-; ALL-NEXT: popcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm0, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm0, %ecx
-; ALL-NEXT: popcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm0, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm0, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
+; AVX512F-LABEL: testv16i32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
+; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: testv16i32:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in)
ret <16 x i32> %out
}
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16:
-; ALL: ## BB#0:
-; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm2, %ymm0, %ymm3
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; ALL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vpsllw $8, %ymm0, %ymm3
-; ALL-NEXT: vpaddb %ymm0, %ymm3, %ymm0
-; ALL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm3
-; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; ALL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpsllw $8, %ymm1, %ymm2
-; ALL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; ALL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512F-LABEL: testv32i16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: testv32i16:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
ret <32 x i16> %out
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8:
-; ALL: ## BB#0:
-; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm2, %ymm0, %ymm3
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; ALL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm3
-; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; ALL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512F-LABEL: testv64i8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: testv64i8:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
ret <64 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-rem.ll b/test/CodeGen/X86/vector-rem.ll
index 51cd872643f2..5fb37ec8710f 100644
--- a/test/CodeGen/X86/vector-rem.ll
+++ b/test/CodeGen/X86/vector-rem.ll
@@ -1,15 +1,117 @@
-; RUN: llc < %s -march=x86-64 | grep div | count 8
-; RUN: llc < %s -march=x86-64 | grep fmodf | count 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) {
- %m = srem <4 x i32> %t, %u
- ret <4 x i32> %m
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-NEXT: movd %xmm2, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; CHECK-NEXT: movd %xmm2, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm3, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; CHECK-NEXT: movd %xmm3, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movd %edx, %xmm3
+; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+ %m = srem <4 x i32> %t, %u
+ ret <4 x i32> %m
}
+
define <4 x i32> @bar(<4 x i32> %t, <4 x i32> %u) {
- %m = urem <4 x i32> %t, %u
- ret <4 x i32> %m
+; CHECK-LABEL: bar:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-NEXT: movd %xmm2, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; CHECK-NEXT: movd %xmm2, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm3, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; CHECK-NEXT: movd %xmm3, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movd %edx, %xmm3
+; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+ %m = urem <4 x i32> %t, %u
+ ret <4 x i32> %m
}
+
define <4 x float> @qux(<4 x float> %t, <4 x float> %u) {
- %m = frem <4 x float> %t, %u
- ret <4 x float> %m
+; CHECK-LABEL: qux:
+; CHECK: # BB#0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK: movaps %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; CHECK-NEXT: callq fmodf
+; CHECK-NEXT: movaps %xmm0, (%rsp)
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT: callq fmodf
+; CHECK-NEXT: unpcklps (%rsp), %xmm0
+; CHECK: movaps %xmm0, (%rsp)
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: callq fmodf
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; CHECK-NEXT: callq fmodf
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: unpcklps (%rsp), %xmm1
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+;
+ %m = frem <4 x float> %t, %u
+ ret <4 x float> %m
}
diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll
index 4ad4aa46c5a0..50febd4c1ec7 100644
--- a/test/CodeGen/X86/vector-rotate-128.ll
+++ b/test/CodeGen/X86/vector-rotate-128.ll
@@ -215,7 +215,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; X32-SSE-NEXT: psubd %xmm1, %xmm2
; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd .LCPI1_1, %xmm1
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm0, %xmm1
@@ -667,7 +667,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psllw $4, %xmm5
-; X32-SSE-NEXT: pand .LCPI3_1, %xmm5
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm5
; X32-SSE-NEXT: pand %xmm2, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm5, %xmm2
@@ -677,7 +677,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm2, %xmm6
; X32-SSE-NEXT: psllw $2, %xmm2
-; X32-SSE-NEXT: pand .LCPI3_2, %xmm2
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pand %xmm5, %xmm2
; X32-SSE-NEXT: por %xmm6, %xmm2
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -693,7 +693,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
@@ -702,7 +702,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_4, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
@@ -710,7 +710,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_5, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
@@ -955,44 +955,34 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_rotate_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [256,61680,57568,53456,49344,45232,41120,37008]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [512,57824,49600,41376,33152,24928,16704,8480]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [1024,50112,33664,17216,768,49856,33408,16960]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4],xmm2[5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_rotate_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,61680,57568,53456,49344,45232,41120,37008]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,57824,49600,41376,33152,24928,16704,8480]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,50112,33664,17216,768,49856,33408,16960]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -1000,8 +990,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; AVX2: # BB#0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
@@ -1202,7 +1191,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: pcmpgtb %xmm3, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm4
-; X32-SSE-NEXT: pand .LCPI7_1, %xmm4
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: pand %xmm1, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: por %xmm4, %xmm1
@@ -1212,7 +1201,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm1, %xmm5
; X32-SSE-NEXT: psllw $2, %xmm1
-; X32-SSE-NEXT: pand .LCPI7_2, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pand %xmm4, %xmm1
; X32-SSE-NEXT: por %xmm5, %xmm1
; X32-SSE-NEXT: paddb %xmm3, %xmm3
@@ -1229,7 +1218,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_4, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
@@ -1238,7 +1227,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_5, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
@@ -1246,7 +1235,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_6, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
@@ -1393,9 +1382,9 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm1
-; X32-SSE-NEXT: pand .LCPI11_0, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
@@ -1440,8 +1429,8 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $15, %xmm1
; X32-SSE-NEXT: psrlq $49, %xmm0
-; X32-SSE-NEXT: pand .LCPI12_0, %xmm0
-; X32-SSE-NEXT: pand .LCPI12_1, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -1485,8 +1474,8 @@ define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pslld $4, %xmm1
; X32-SSE-NEXT: psrld $28, %xmm0
-; X32-SSE-NEXT: pand .LCPI13_0, %xmm0
-; X32-SSE-NEXT: pand .LCPI13_1, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -1530,8 +1519,8 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: psrlw $11, %xmm0
-; X32-SSE-NEXT: pand .LCPI14_0, %xmm0
-; X32-SSE-NEXT: pand .LCPI14_1, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -1578,11 +1567,11 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm1
-; X32-SSE-NEXT: pand .LCPI15_0, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_1, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_2, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_3, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll
index 379b5fcb635f..af1755e14314 100644
--- a/test/CodeGen/X86/vector-rotate-256.ll
+++ b/test/CodeGen/X86/vector-rotate-256.ll
@@ -223,11 +223,11 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
; AVX2-NEXT: vpsllvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
@@ -498,59 +498,51 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
ret <8 x i32> %or
}
-define <16 x i16> @constant_rotate_v8i16(<16 x i16> %a) nounwind {
-; AVX1-LABEL: constant_rotate_v8i16:
+define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: constant_rotate_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,28784,24672,20560,16448,12336,8224,4112]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,57568,49344,41120,32896,24672,16448,8224]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,49600,33152,16704,256,49344,32896,16448]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6],xmm2[7]
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,33664,768,33408,512,33152,256,32896]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,61680,57568,53456,49344,45232,41120,37008]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,57824,49600,41376,33152,24928,16704,8480]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,50112,33664,17216,768,49856,33408,16960]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3,4],xmm3[5,6],xmm0[7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: constant_rotate_v8i16:
+; AVX2-LABEL: constant_rotate_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX2-NEXT: vpsrlvd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_rotate_v8i16:
+; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -565,7 +557,7 @@ define <16 x i16> @constant_rotate_v8i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
-; XOPAVX2-LABEL: constant_rotate_v8i16:
+; XOPAVX2-LABEL: constant_rotate_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index b63c3f084b22..018c5922a432 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -4,6 +4,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
;
; Just one 32-bit run to make sure we do reasonable things there.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
@@ -81,6 +82,11 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_16i8_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_16i8_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2
@@ -143,14 +149,12 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: sext_16i8_to_8i32:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i32:
@@ -171,11 +175,14 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX2-LABEL: sext_16i8_to_8i32:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_16i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_16i8_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2
@@ -285,12 +292,14 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX2-LABEL: sext_16i8_to_4i64:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_16i8_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_16i8_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2
@@ -304,6 +313,137 @@ entry:
ret <4 x i64> %C
}
+define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_8i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: psrad $24, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: psrld $16, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: psrad $24, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_16i8_to_8i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSSE3-NEXT: psrld $16, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: psrad $24, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: psrad $24, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT: psrld $16, %xmm3
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: psrad $24, %xmm3
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_16i8_to_8i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pmovsxbq %xmm1, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovsxbq %xmm2, %xmm2
+; SSE41-NEXT: psrlq $48, %xmm0
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: sext_16i8_to_8i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sext_16i8_to_8i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
+; AVX2-NEXT: vmovdqa %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: sext_16i8_to_8i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpsllq $56, %zmm0, %zmm0
+; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_16i8_to_8i64:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4
+; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE41-NEXT: psrld $16, %xmm1
+; X32-SSE41-NEXT: pmovsxbq %xmm1, %xmm1
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; X32-SSE41-NEXT: pmovsxbq %xmm2, %xmm2
+; X32-SSE41-NEXT: psrlq $48, %xmm0
+; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm3
+; X32-SSE41-NEXT: movdqa %xmm4, %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %C = sext <8 x i8> %B to <8 x i64>
+ ret <8 x i64> %C
+}
+
define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_4i32:
; SSE2: # BB#0: # %entry
@@ -377,6 +517,11 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_8i16_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
@@ -479,12 +624,14 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
;
; AVX2-LABEL: sext_8i16_to_4i64:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_8i16_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_8i16_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2
@@ -577,6 +724,11 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_4i32_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
@@ -603,18 +755,40 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; AVX-LABEL: load_sext_2i1_to_2i64:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $62, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
-; AVX-NEXT: shlq $63, %rax
-; AVX-NEXT: sarq $63, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: load_sext_2i1_to_2i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: shlq $63, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_2i1_to_2i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: movzbl (%rdi), %eax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: shlq $63, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_2i1_to_2i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
; X32-SSE41: # BB#0: # %entry
@@ -749,25 +923,55 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; SSE41-NEXT: pinsrd $3, %eax, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: load_sext_4i1_to_4i32:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $62, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: shlq $63, %rdx
-; AVX-NEXT: sarq $63, %rdx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $61, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: shlq $60, %rax
-; AVX-NEXT: sarq $63, %rax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_sext_4i1_to_4i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $63, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shlq $60, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_4i1_to_4i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: movzbl (%rdi), %eax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $63, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shlq $60, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_4i1_to_4i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
; X32-SSE41: # BB#0: # %entry
@@ -836,24 +1040,20 @@ entry:
define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; SSE2-LABEL: load_sext_4i1_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movzbl (%rdi), %eax
+; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $3, %ecx
-; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl %ecx
-; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: shrl $2, %eax
-; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSE2-NEXT: psllq $63, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
@@ -866,24 +1066,20 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
;
; SSSE3-LABEL: load_sext_4i1_to_4i64:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movzbl (%rdi), %eax
+; SSSE3-NEXT: movl (%rdi), %eax
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $3, %ecx
-; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl %ecx
-; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: shrl $2, %eax
-; SSSE3-NEXT: andl $1, %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSSE3-NEXT: psllq $63, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
@@ -896,21 +1092,17 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
;
; SSE41-LABEL: load_sext_4i1_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movzbl (%rdi), %eax
+; SSE41-NEXT: movl (%rdi), %eax
; SSE41-NEXT: movl %eax, %ecx
; SSE41-NEXT: shrl %ecx
-; SSE41-NEXT: andl $1, %ecx
-; SSE41-NEXT: movl %eax, %edx
-; SSE41-NEXT: andl $1, %edx
-; SSE41-NEXT: movd %edx, %xmm1
+; SSE41-NEXT: movd %eax, %xmm1
; SSE41-NEXT: pinsrd $1, %ecx, %xmm1
; SSE41-NEXT: movl %eax, %ecx
; SSE41-NEXT: shrl $2, %ecx
-; SSE41-NEXT: andl $1, %ecx
; SSE41-NEXT: pinsrd $2, %ecx, %xmm1
; SSE41-NEXT: shrl $3, %eax
-; SSE41-NEXT: andl $1, %eax
; SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: psllq $63, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -968,24 +1160,29 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_4i1_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movzbl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
; X32-SSE41-NEXT: shrl %ecx
-; X32-SSE41-NEXT: andl $1, %ecx
-; X32-SSE41-NEXT: movl %eax, %edx
-; X32-SSE41-NEXT: andl $1, %edx
-; X32-SSE41-NEXT: movd %edx, %xmm1
+; X32-SSE41-NEXT: movd %eax, %xmm1
; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm1
; X32-SSE41-NEXT: movl %eax, %ecx
; X32-SSE41-NEXT: shrl $2, %ecx
-; X32-SSE41-NEXT: andl $1, %ecx
; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm1
; X32-SSE41-NEXT: shrl $3, %eax
-; X32-SSE41-NEXT: andl $1, %eax
; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; X32-SSE41-NEXT: psllq $63, %xmm0
; X32-SSE41-NEXT: psrad $31, %xmm0
@@ -1050,6 +1247,11 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_4i8_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1182,40 +1384,84 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; SSE41-NEXT: pinsrw $7, %eax, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: load_sext_8i1_to_8i16:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movsbq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $62, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: shlq $63, %rdx
-; AVX-NEXT: sarq $63, %rdx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $61, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $60, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $59, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $58, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $57, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX-NEXT: shrq $7, %rax
-; AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_sext_8i1_to_8i16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movsbq (%rdi), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $63, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $60, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $59, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $58, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $57, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrq $7, %rax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_8i1_to_8i16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: movsbq (%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $63, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $59, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $58, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $57, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrq $7, %rax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_8i1_to_8i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
; X32-SSE41: # BB#0: # %entry
@@ -1294,6 +1540,102 @@ entry:
ret <8 x i16> %Y
}
+define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_8i8_to_8i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsbq 1(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movsbq (%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movsbq 3(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: movsbq 2(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: movsbq 5(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm3
+; SSE2-NEXT: movsbq 4(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: movsbq 7(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm4
+; SSE2-NEXT: movsbq 6(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_8i8_to_8i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsbq 1(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movsbq (%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movsbq 3(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: movsbq 2(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: movsbq 5(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm3
+; SSSE3-NEXT: movsbq 4(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: movsbq 7(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm4
+; SSSE3-NEXT: movsbq 6(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_8i8_to_8i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
+; SSE41-NEXT: pmovsxbq 4(%rdi), %xmm2
+; SSE41-NEXT: pmovsxbq 6(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: load_sext_8i8_to_8i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_8i8_to_8i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT: vpmovsxbq 4(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_8i8_to_8i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
+; X32-SSE41-NEXT: pmovsxbq 4(%eax), %xmm2
+; X32-SSE41-NEXT: pmovsxbq 6(%eax), %xmm3
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i64>
+ ret <8 x i64> %Y
+}
+
define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; SSE2-LABEL: load_sext_8i1_to_8i32:
; SSE2: # BB#0: # %entry
@@ -1506,6 +1848,15 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_8i1_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1597,6 +1948,11 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_8i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1864,71 +2220,145 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: load_sext_16i1_to_16i8:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movswq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $62, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: shlq $63, %rdx
-; AVX-NEXT: sarq $63, %rdx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $61, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $60, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $59, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $58, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $57, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movsbq %al, %rcx
-; AVX-NEXT: shrq $7, %rcx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $55, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $54, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $53, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $52, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $51, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $50, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $49, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX-NEXT: shrq $15, %rax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_sext_16i1_to_16i8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movswq (%rdi), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $63, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $60, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $59, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $58, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $57, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movsbq %al, %rcx
+; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $55, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $54, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $53, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $52, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $51, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $50, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $49, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrq $15, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_16i1_to_16i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: movswq (%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $63, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $59, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $58, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $57, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movsbq %al, %rcx
+; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $55, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $54, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $53, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $52, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $51, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $50, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $49, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrq $15, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_16i1_to_16i8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: kmovw (%rdi), %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
; X32-SSE41: # BB#0: # %entry
@@ -2460,6 +2890,14 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_16i1_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: kmovw (%rdi), %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3297,6 +3735,18 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_32i1_to_32i8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: kmovw (%rdi), %k1
+; AVX512-NEXT: kmovw 2(%rdi), %k2
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pushl %esi
@@ -3472,6 +3922,11 @@ define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_16i8_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3611,6 +4066,11 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_4i16_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3662,6 +4122,11 @@ define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3755,6 +4220,11 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3864,6 +4334,13 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_4i1_to_4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_4i1_to_4i64:
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: pslld $31, %xmm0
@@ -3931,6 +4408,13 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_4i8_to_4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_4i8_to_4i64:
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: pslld $24, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index 771445df85e0..81eaeb998075 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -311,6 +311,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -323,7 +324,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
@@ -677,7 +681,7 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -745,9 +749,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -949,9 +952,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X32-SSE-NEXT: psllw $5, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -1194,48 +1196,33 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psraw $4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1248,8 +1235,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
@@ -1656,7 +1645,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: pxor %xmm1, %xmm0
; X32-SSE-NEXT: psubb %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 0b9c318da047..af076fbbd818 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -178,11 +178,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
@@ -214,7 +214,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = ashr <16 x i16> %a, %b
ret <16 x i16> %shift
@@ -458,7 +461,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -763,30 +766,19 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsraw $2, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -795,12 +787,12 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX2-NEXT: vpsravd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -829,8 +821,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll
index 147e58f4710e..8183292c77fc 100644
--- a/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -28,20 +28,20 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
; AVX512DQ-NEXT: vpsravd %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
; AVX512DQ-NEXT: vpsravd %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
; AVX512DQ-NEXT: vpsravd %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
; AVX512DQ-NEXT: vpsravd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
@@ -129,7 +129,7 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
; ALL-LABEL: splatvar_shift_v16i32:
; ALL: ## BB#0:
; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; ALL-NEXT: vpsrad %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -238,21 +238,21 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX512DQ-NEXT: vpsravd %ymm4, %ymm5, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-NEXT: vpsravd %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512DQ-NEXT: vpsravd %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpsravd %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-NEXT: vpsravd %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX512DQ-NEXT: vpsravd %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vpsrld $16, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-NEXT: vpsravd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
@@ -376,3 +376,21 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
%shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <64 x i8> %shift
}
+
+define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) {
+; AVX512DQ-LABEL: ashr_const7_v64i8:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: ashr_const7_v64i8:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = ashr <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 86e54612ae74..213e2a41a662 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -280,6 +280,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -292,7 +293,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
@@ -437,7 +441,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -446,7 +450,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -454,7 +458,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -529,7 +533,7 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -597,9 +601,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -727,9 +730,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
@@ -737,7 +739,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -746,7 +748,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -754,7 +756,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
@@ -928,48 +930,33 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -982,8 +969,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
@@ -1112,7 +1101,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -1121,7 +1110,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -1129,7 +1118,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
@@ -1257,7 +1246,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index ecc68cf2e278..f9ff3092388b 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -155,11 +155,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
@@ -191,7 +191,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = lshr <16 x i16> %a, %b
ret <16 x i16> %shift
@@ -369,7 +372,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -618,30 +621,19 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -650,12 +642,12 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -684,8 +676,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll
index 68644e61b0e5..a7759aa9472a 100644
--- a/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -29,20 +29,20 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
; AVX512DQ-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
@@ -110,7 +110,7 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
; ALL-LABEL: splatvar_shift_v16i32:
; ALL: ## BB#0:
; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -202,21 +202,21 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm5, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vpsrld $16, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 9b59c6224ef2..7202f1ec0cb8 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -131,7 +131,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd .LCPI1_0, %xmm1
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm0, %xmm1
@@ -237,6 +237,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -247,7 +248,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
@@ -386,7 +390,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -395,7 +399,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -477,7 +481,7 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -545,9 +549,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -667,9 +670,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
@@ -677,7 +679,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -686,7 +688,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -839,13 +841,15 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pmullw .LCPI10_0, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <8 x i16> %shift
@@ -951,7 +955,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -960,7 +964,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -1093,7 +1097,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $3, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index 3daf24f1a82e..bc7d20cd86d8 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -136,11 +136,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
@@ -166,7 +166,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = shl <16 x i16> %a, %b
ret <16 x i16> %shift
@@ -333,7 +336,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -585,8 +588,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll
index 26ddb1c127e1..ac867c70f15f 100644
--- a/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -29,20 +29,20 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
; AVX512DQ-NEXT: vpsllvd %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
; AVX512DQ-NEXT: vpsllvd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
@@ -106,7 +106,7 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
; ALL-LABEL: splatvar_shift_v16i32:
; ALL: ## BB#0:
; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 13a9543ddd90..2651063379ff 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -9,20 +9,11 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
-; FIXME: SSE2 should look like the following:
-; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
-; FIXME: # BB#0:
-; FIXME-NEXT: punpcklbw %xmm0, %xmm0
-; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; FIXME-NEXT: retq
-;
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -55,9 +46,8 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
@@ -82,10 +72,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
@@ -182,33 +172,16 @@ define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(
}
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
-; FIXME: SSE2 should be the following:
-; FIXME-LABEL: @shuffle_v16i8_0101010101010101
-; FIXME: # BB#0:
-; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; FIXME-NEXT: retq
-;
-; SSE2-LABEL: shuffle_v16i8_0101010101010101:
-; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_0101010101010101:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_0101010101010101:
-; SSE41: # BB#0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_0101010101010101:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i8_0101010101010101:
@@ -252,9 +225,8 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
@@ -953,7 +925,7 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
@@ -1132,9 +1104,8 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1377,9 +1348,8 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
@@ -1420,9 +1390,8 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
@@ -1466,9 +1435,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
@@ -1505,9 +1473,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
@@ -1545,9 +1512,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
@@ -1592,9 +1558,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 1d32f9e38523..0d50205aa4a5 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -159,7 +159,7 @@ define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
;
; AVX-LABEL: shuffle_v2f64_11:
; AVX: # BB#0:
-; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
ret <2 x double> %shuffle
@@ -217,7 +217,7 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
;
; AVX-LABEL: shuffle_v2f64_33:
; AVX: # BB#0:
-; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1,1]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
ret <2 x double> %shuffle
@@ -762,7 +762,7 @@ define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
;
; AVX512VL-LABEL: shuffle_v2i64_z1:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1>
@@ -804,7 +804,7 @@ define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) {
;
; AVX512VL-LABEL: shuffle_v2f64_1z:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
@@ -833,7 +833,7 @@ define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
;
; AVX512VL-LABEL: shuffle_v2f64_z0:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 0>
@@ -865,11 +865,23 @@ define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v2f64_z1:
-; AVX: # BB#0:
-; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v2f64_z1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2f64_z1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v2f64_z1:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
}
@@ -895,7 +907,7 @@ define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
;
; AVX512VL-LABEL: shuffle_v2f64_bitcast_1z:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; AVX512VL-NEXT: retq
%shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
@@ -929,33 +941,25 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
;
; SSE41-LABEL: shuffle_v2i64_bitcast_z123:
; SSE41: # BB#0:
-; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_bitcast_z123:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_bitcast_z123:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_bitcast_z123:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovss {{.*}}(%rip), %xmm1
-; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX512VL-NEXT: retq
%bitcast32 = bitcast <2 x i64> %x to <4 x float>
@@ -986,20 +990,10 @@ define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_mem_and_zero_v2i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v2i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_and_zero_v2i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq (%rdi), %xmm0
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_and_zero_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
@@ -1027,20 +1021,10 @@ define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_mem_and_zero_v2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_and_zero_v2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovsd (%rdi), %xmm0
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_and_zero_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
@@ -1097,17 +1081,17 @@ define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
; SSE2-LABEL: insert_mem_lo_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movlpd (%rdi), %xmm0
+; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_mem_lo_v2i64:
; SSE3: # BB#0:
-; SSE3-NEXT: movlpd (%rdi), %xmm0
+; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_mem_lo_v2i64:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movlpd (%rdi), %xmm0
+; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_mem_lo_v2i64:
@@ -1130,7 +1114,7 @@ define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
;
; AVX512VL-LABEL: insert_mem_lo_v2i64:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq (%rdi), %xmm1
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: retq
%a = load i64, i64* %ptr
@@ -1163,23 +1147,11 @@ define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_mem_hi_v2i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_hi_v2i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_hi_v2i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq (%rdi), %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_hi_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
@@ -1193,20 +1165,10 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_reg_lo_v2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_reg_lo_v2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_reg_lo_v2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_reg_lo_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: retq
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
ret <2 x double> %shuffle
@@ -1215,12 +1177,12 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
; SSE-LABEL: insert_mem_lo_v2f64:
; SSE: # BB#0:
-; SSE-NEXT: movlpd (%rdi), %xmm0
+; SSE-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_lo_v2f64:
; AVX: # BB#0:
-; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; AVX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1247,12 +1209,12 @@ define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) {
define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
; SSE-LABEL: insert_mem_hi_v2f64:
; SSE: # BB#0:
-; SSE-NEXT: movhpd (%rdi), %xmm0
+; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_hi_v2f64:
; AVX: # BB#0:
-; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 53dbb32235ae..aaf5fa673a15 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -227,7 +227,7 @@ define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: shuffle_v4f32_0011:
; AVX: # BB#0:
-; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
ret <4 x float> %shuffle
@@ -240,7 +240,7 @@ define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: shuffle_v4f32_2233:
; AVX: # BB#0:
-; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
ret <4 x float> %shuffle
@@ -302,6 +302,35 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_0145:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0145:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_6723:
+; SSE: # BB#0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_6723:
+; AVX: # BB#0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
+
define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0124:
; SSE2: # BB#0:
@@ -1080,15 +1109,11 @@ define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
; SSE41-LABEL: shuffle_v4f32_0zz6:
; SSE41: # BB#0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz6:
; AVX: # BB#0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
@@ -1129,15 +1154,11 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
; SSE41-LABEL: shuffle_v4f32_0z24:
; SSE41: # BB#0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
; AVX: # BB#0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
@@ -1805,6 +1826,162 @@ define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
ret <4 x float> %3
}
+define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: mask_v4f32_4127:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: mask_v4f32_4127:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: mask_v4f32_4127:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_v4f32_4127:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mask_v4f32_4127:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a to <4 x i32>
+ %2 = bitcast <4 x float> %b to <4 x i32>
+ %3 = and <4 x i32> %1, <i32 0, i32 -1, i32 -1, i32 0>
+ %4 = and <4 x i32> %2, <i32 -1, i32 0, i32 0, i32 -1>
+ %5 = or <4 x i32> %4, %3
+ %6 = bitcast <4 x i32> %5 to <4 x float>
+ ret <4 x float> %6
+}
+
+define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: mask_v4f32_0127:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: mask_v4f32_0127:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: mask_v4f32_0127:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_v4f32_0127:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mask_v4f32_0127:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a to <2 x i64>
+ %2 = bitcast <4 x float> %b to <2 x i64>
+ %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
+ %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
+ %5 = or <2 x i64> %4, %3
+ %6 = bitcast <2 x i64> %5 to <4 x float>
+ ret <4 x float> %6
+}
+
+define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: mask_v4i32_0127:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: mask_v4i32_0127:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: mask_v4i32_0127:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_v4i32_0127:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: mask_v4i32_0127:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mask_v4i32_0127:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-NEXT: retq
+ %1 = bitcast <4 x i32> %a to <2 x i64>
+ %2 = bitcast <4 x i32> %b to <2 x i64>
+ %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
+ %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
+ %5 = or <2 x i64> %4, %3
+ %6 = bitcast <2 x i64> %5 to <4 x i32>
+ ret <4 x i32> %6
+}
+
+define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
+; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX-NEXT: retq
+ %1 = load <2 x float>, <2 x float>* %x, align 1
+ %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x float> %2
+}
+
define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
; SSE-LABEL: insert_reg_and_zero_v4i32:
; SSE: # BB#0:
@@ -1935,17 +2112,17 @@ define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
; SSE2-LABEL: insert_mem_lo_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movlpd (%rdi), %xmm0
+; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_mem_lo_v4i32:
; SSE3: # BB#0:
-; SSE3-NEXT: movlpd (%rdi), %xmm0
+; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_mem_lo_v4i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movlpd (%rdi), %xmm0
+; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_mem_lo_v4i32:
@@ -2027,12 +2204,12 @@ define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE-LABEL: insert_mem_lo_v4f32:
; SSE: # BB#0:
-; SSE-NEXT: movlpd (%rdi), %xmm0
+; SSE-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_lo_v4f32:
; AVX: # BB#0:
-; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; AVX-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -2060,12 +2237,12 @@ define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE-LABEL: insert_mem_hi_v4f32:
; SSE: # BB#0:
-; SSE-NEXT: movhpd (%rdi), %xmm0
+; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_hi_v4f32:
; AVX: # BB#0:
-; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 168b3e33bfcf..e64ca967eaa9 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -61,26 +61,16 @@ define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: shuffle_v8i16_00000000:
-; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v8i16_00000000:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v8i16_00000000:
-; SSE41: # BB#0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_00000000:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_00000000:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i16_00000000:
@@ -2147,13 +2137,50 @@ define <8 x i16> @shuffle_v8i16_8012345u(<8 x i16> %a) {
ret <8 x i16> %shuffle
}
+define <8 x i16> @mask_v8i16_012345ef(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: mask_v8i16_012345ef:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: mask_v8i16_012345ef:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_v8i16_012345ef:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: mask_v8i16_012345ef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mask_v8i16_012345ef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-NEXT: retq
+ %1 = bitcast <8 x i16> %a to <2 x i64>
+ %2 = bitcast <8 x i16> %b to <2 x i64>
+ %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
+ %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
+ %5 = or <2 x i64> %4, %3
+ %6 = bitcast <2 x i64> %5 to <8 x i16>
+ ret <8 x i16> %6
+}
+
define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v8i16_i32:
@@ -2190,9 +2217,8 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v8i16_sext_i16:
@@ -2234,9 +2260,8 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_i32:
@@ -2272,9 +2297,9 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32:
@@ -2311,9 +2336,8 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
@@ -2357,9 +2381,9 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 7e3dc6e294f8..2182ffe0983a 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -7,7 +7,8 @@ target triple = "x86_64-unknown-unknown"
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -22,18 +23,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
@@ -42,16 +41,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -60,16 +59,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -78,16 +77,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -96,16 +95,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -114,16 +113,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -132,16 +131,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -153,7 +152,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -161,7 +161,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -175,7 +176,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -196,7 +198,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -216,7 +219,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -236,7 +240,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -256,7 +261,8 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -276,7 +282,8 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -296,7 +303,8 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -313,16 +321,18 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
@@ -331,16 +341,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
@@ -436,7 +448,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
@@ -452,7 +464,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -468,7 +480,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_0
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -484,7 +496,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_0
; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -500,7 +512,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_0
; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -516,7 +528,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -532,7 +544,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -541,11 +553,10 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
@@ -559,11 +570,10 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_3
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
@@ -605,10 +615,10 @@ define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_1
define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
@@ -623,9 +633,10 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
; AVX1: # BB#0:
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
@@ -640,11 +651,10 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
@@ -659,11 +669,10 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
@@ -699,9 +708,8 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_1
;
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
ret <16 x i16> %shuffle
@@ -721,8 +729,9 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
;
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1,16,17,16,17,20,21,16,17,16,17,16,17,28,29,16,17]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
@@ -1162,7 +1171,8 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1
define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1426,7 +1436,7 @@ define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_z
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
@@ -1654,7 +1664,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -2360,6 +2371,24 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
ret <16 x i16> %shuffle
}
+define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 2, i32 16, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle
+}
+
define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
; AVX1: # BB#0:
@@ -2904,8 +2933,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
@@ -3269,13 +3298,15 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16>
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -3286,15 +3317,15 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
@@ -3312,7 +3343,8 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16>
define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; ALL: # BB#0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
@@ -3322,18 +3354,76 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
+define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
+; AVX1-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
+ ret <16 x i16> %1
+}
+
+define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
+; AVX1-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,2,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
+ %2 = bitcast <16 x i16> %1 to <4 x i64>
+ %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ %4 = bitcast <4 x i64> %3 to <16 x i16>
+ ret <16 x i16> %4
+}
+
define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
; ALL-LABEL: insert_v16i16_0elt_into_zero_vector:
; ALL: # BB#0:
@@ -3375,11 +3465,51 @@ define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_b
ret <16 x i16> %shuffle16
}
+define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: PR24935:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,5,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,3,2,3,4,5,6,7,8,9,8,9,0,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6],xmm2[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,4,5,4,5,10,11,4,5,14,15,12,13,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5],xmm0[6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR24935:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5,6,7,8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 27, i32 26, i32 1, i32 29, i32 26, i32 23, i32 11, i32 16, i32 1, i32 9, i32 16, i32 28, i32 13, i32 4, i32 0, i32 24>
+ ret <16 x i16> %shuffle
+}
+
define <16 x i16> @insert_dup_mem_v16i16_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -3399,7 +3529,8 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
; AVX1: # BB#0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -3421,7 +3552,8 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 161a21cef030..b0566812ff7d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -31,9 +31,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <32 x i8> %shuffle
@@ -50,9 +49,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -69,9 +67,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -88,9 +85,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -107,9 +103,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -126,9 +121,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -145,9 +139,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -164,9 +157,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -183,9 +175,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -202,9 +193,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -221,9 +211,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -240,9 +229,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -259,9 +247,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -278,9 +265,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -299,11 +285,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -818,7 +801,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <32 x i8> %shuffle
@@ -834,7 +817,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -850,7 +833,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -866,7 +849,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -882,7 +865,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -902,7 +885,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: movl $15, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -911,12 +894,10 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
@@ -931,12 +912,10 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
@@ -949,20 +928,27 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
}
define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
-; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; ALL: # BB#0:
+; ALL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255]
-; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; AVX2-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
; AVX2: # BB#0:
-; AVX2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2-NEXT: retq
- %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 2, i32 32, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
@@ -970,16 +956,15 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
ret <32 x i8> %shuffle
@@ -988,21 +973,23 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -1465,7 +1452,7 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,9,9,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -2006,8 +1993,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
@@ -2057,6 +2043,36 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_
ret <32 x i8> %shuffle
}
+define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
+; AVX1-LABEL: PR28136:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,10,10,12,12,14,14,9,9,11,11,13,13,15,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,2,2,4,4,6,6,1,1,3,3,5,5,7,7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR28136:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50,i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x i64> %3
+}
+
define <32 x i8> @insert_dup_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_mem_v32i8_i32:
; AVX1: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 7e33f5f3aa86..181b2e420203 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -112,8 +112,8 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_2200:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_2200:
@@ -153,8 +153,8 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3210:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_3210:
@@ -488,7 +488,7 @@ define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_11uu:
; ALL: # BB#0:
-; ALL-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
ret <4 x double> %shuffle
@@ -517,9 +517,8 @@ define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3333:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_3333:
@@ -558,7 +557,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0001:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -580,7 +579,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -642,8 +641,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1000:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -663,8 +662,8 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_2200:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_2200:
@@ -704,8 +703,8 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3210:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3210:
@@ -801,7 +800,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: retq
@@ -850,15 +849,15 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_0451:
; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0451:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
@@ -894,14 +893,14 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_4015:
; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_4015:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX512VL-NEXT: retq
@@ -1153,7 +1152,7 @@ define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_22uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_22uu:
@@ -1172,9 +1171,8 @@ define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3333:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3333:
@@ -1212,20 +1210,10 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
}
define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
-; AVX1-LABEL: insert_mem_and_zero_v4i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_and_zero_v4i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq (%rdi), %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: insert_mem_and_zero_v4i64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1235,12 +1223,14 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; AVX1-LABEL: insert_reg_and_zero_v4f64:
; AVX1: # BB#0:
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_reg_and_zero_v4f64:
; AVX2: # BB#0:
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT: retq
@@ -1248,7 +1238,7 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; AVX512VL-LABEL: insert_reg_and_zero_v4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512VL-NEXT: retq
%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1256,20 +1246,10 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
}
define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
-; AVX1-LABEL: insert_mem_and_zero_v4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_and_zero_v4f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovsd (%rdi), %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: insert_mem_and_zero_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1342,8 +1322,7 @@ define <4 x double> @splat_v4f64(<2 x double> %r) {
define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; AVX1-LABEL: splat_mem_v4i64_from_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat_mem_v4i64_from_v2i64:
@@ -1416,6 +1395,28 @@ define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
ret <4 x double> %shuffle
}
+define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
+; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: broadcast_v4f64_0000_from_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: broadcast_v4f64_0000_from_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512VL-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <4 x i64> %1 to <4 x double>
+ %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %3
+}
+
define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: bitcast_v4f64_0426:
; AVX1: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index e8b886afd1ae..4aab5cd17009 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -29,8 +29,8 @@ define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00000010:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x float> %shuffle
@@ -46,8 +46,8 @@ define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00000200:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x float> %shuffle
@@ -63,8 +63,8 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00003000:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x float> %shuffle
@@ -152,8 +152,8 @@ define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00112233:
; AVX1: # BB#0:
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -176,8 +176,8 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00001111:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x float> %shuffle
@@ -195,18 +195,15 @@ define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_08080808:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_08080808:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x float> %shuffle
@@ -216,7 +213,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_08084c4c:
; ALL: # BB#0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
-; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x float> %shuffle
@@ -299,10 +296,10 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_08991abb:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
@@ -338,8 +335,8 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_09ab1def:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
@@ -647,10 +644,10 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_c348cda0:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
@@ -660,21 +657,21 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_f511235a:
; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_f511235a:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
@@ -690,42 +687,29 @@ define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_32103210:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_76547654:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_76547654:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_76547654:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_76543210:
-; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_76543210:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_76543210:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
@@ -783,11 +767,8 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
;
; AVX2-LABEL: PR21138:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,u,u,1,3,5,7>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <1,3,5,7,u,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <8 x float> %shuffle
@@ -806,10 +787,10 @@ define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_ba983210:
; ALL: # BB#0:
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
@@ -844,15 +825,14 @@ define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_44444444:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_44444444:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle
@@ -897,17 +877,11 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_5555uuuu:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_5555uuuu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_5555uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %shuffle
}
@@ -930,15 +904,15 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000010:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00000010:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x i32> %shuffle
@@ -947,15 +921,15 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000200:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00000200:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -964,15 +938,15 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00003000:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00003000:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -1065,8 +1039,8 @@ define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00112233:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1082,15 +1056,15 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00001111:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00001111:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shuffle
@@ -1113,18 +1087,15 @@ define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08080808:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_08080808:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x i32> %shuffle
@@ -1134,7 +1105,7 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08084c4c:
; AVX1: # BB#0:
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_08084c4c:
@@ -1252,10 +1223,10 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2-LABEL: shuffle_v8i32_08991abb:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
@@ -1265,8 +1236,8 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_091b2d3f:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
@@ -1283,15 +1254,15 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_09ab1def:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_09ab1def:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
@@ -1768,19 +1739,18 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_6caa87e5:
; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_6caa87e5:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,4,2,2,0,u,6,u>
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,0,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
@@ -1796,8 +1766,8 @@ define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2-LABEL: shuffle_v8i32_32103210:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
@@ -1806,15 +1776,14 @@ define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_76547654:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_76547654:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
@@ -1823,14 +1792,14 @@ define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_76543210:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_76543210:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
@@ -1935,7 +1904,7 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX1-NEXT: retq
;
@@ -1950,7 +1919,7 @@ define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
; AVX1-NEXT: retq
@@ -2014,15 +1983,14 @@ define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_44444444:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_44444444:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
@@ -2032,13 +2000,13 @@ define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_5555uuuu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_5555uuuu:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@@ -2077,9 +2045,9 @@ define <8 x float> @splat_v8f32(<4 x float> %r) {
define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_z0U2zUz6:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_z0U2zUz6:
@@ -2093,9 +2061,9 @@ define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_1U3z5zUU:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_1U3z5zUU:
@@ -2198,7 +2166,7 @@ define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_1:
; ALL: # BB#0: # %entry
; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
entry:
%tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
@@ -2213,7 +2181,7 @@ define <8x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_2:
; ALL: # BB#0: # %entry
; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
entry:
%tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
@@ -2226,7 +2194,7 @@ define <8x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_3:
; ALL: # BB#0: # %entry
; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
entry:
%tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index bef54b05041b..d75184951344 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -4,6 +4,25 @@
target triple = "x86_64-unknown-unknown"
+define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastss %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; ALL-NEXT: vbroadcastss %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x float> %shuffle
+}
+
define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
; ALL: # BB#0:
@@ -70,6 +89,70 @@ define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz
ret <16 x float> %shuffle
}
+define <16 x float> @shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,3,3,4,5,6,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 12, i32 13>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4, i32 8, i32 8, i32 10, i32 8, i32 12, i32 12, i32 14, i32 12>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12>
+ ret <16 x float> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastd %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i32> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; ALL-NEXT: vpbroadcastd %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ret <16 x i32> %shuffle
+}
+
define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
; ALL: # BB#0:
@@ -89,6 +172,21 @@ define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_z
ret <16 x i32> %shuffle
}
+define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28(<16 x i32> %a, <16 x i32> %b) {
+; AVX512F-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
+; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3],zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19],zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35],zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51]
+; AVX512BW-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 16, i32 5, i32 6, i32 7, i32 20, i32 9, i32 10, i32 11, i32 24, i32 13, i32 14, i32 15, i32 28>
+ ret <16 x i32> %shuffle
+}
+
define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
; ALL: # BB#0:
@@ -172,3 +270,30 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
%v2 = shufflevector <16 x float> %v_a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x float> %v2
}
+
+define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; ALL-NEXT: retq
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ ret <16 x i32> %c
+}
+
+define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; ALL-NEXT: retq
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
+ ret <16 x i32> %c
+}
+
+define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,1],zmm1[0,0],zmm0[4,5],zmm1[4,4],zmm0[8,9],zmm1[8,8],zmm0[12,13],zmm1[12,12]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+ ret <16 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index ab809beb4b48..bcc4ad2d1412 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -1,8 +1,27 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
target triple = "x86_64-unknown-unknown"
+define <32 x i16> @shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i16> %a) {
+; ALL-LABEL: shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastw %xmm0, %zmm0
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) {
+; ALL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; ALL-NEXT: vpbroadcastw %xmm0, %zmm0
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <32 x i16> %c
+}
+
define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) {
; ALL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
; ALL: # BB#0:
@@ -26,8 +45,7 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = <0,32,1,33,2,34,3,35,8,40,9,41,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
+; ALL-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; ALL-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
@@ -36,9 +54,55 @@ define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i1
define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = <4,36,5,37,6,38,7,39,12,44,13,45,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
+; ALL-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; ALL-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
}
+
+define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
+; ALL: # BB#0:
+; ALL-NEXT: vpsrld $16, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 34, i32 3, i32 34, i32 5, i32 34, i32 7, i32 34, i32 9, i32 34, i32 11, i32 34, i32 13, i32 34, i32 15, i32 34, i32 17, i32 34, i32 19, i32 34, i32 21, i32 34, i32 23, i32 34, i32 25, i32 34, i32 27, i32 34, i32 29, i32 34, i32 31, i32 34>
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
+; ALL: # BB#0:
+; ALL-NEXT: vpslld $16, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 34, i32 0, i32 34, i32 2, i32 34, i32 4, i32 34, i32 6, i32 34, i32 8, i32 34, i32 10, i32 34, i32 12, i32 34, i32 14, i32 34, i32 16, i32 34, i32 18, i32 34, i32 20, i32 34, i32 22, i32 34, i32 24, i32 34, i32 26, i32 34, i32 28, i32 34, i32 30>
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
+; ALL: # BB#0:
+; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
+; ALL: # BB#0:
+; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
+; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>
+ ret <32 x i16> %c
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
new file mode 100644
index 000000000000..d637c0e2428c
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+
+target triple = "x86_64-unknown-unknown"
+
+define <64 x i8> @shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a) {
+; AVX512F-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512DQ-NEXT: retq
+ %b = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <64 x i8> %b
+}
+
+define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
+; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: retq
+ %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
+ ret <64 x i8> %shuffle
+}
+
+define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) {
+; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: retq
+ %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
+ ret <64 x i8> %shuffle
+}
+
+define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
+; AVX512F-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: retq
+ %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
+ ret <64 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 631968f6afa2..d39961d9c427 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1,9 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
-target triple = "x86_64-unknown-unknown"
-
define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00000000:
; AVX512F: # BB#0:
@@ -18,6 +16,38 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
ret <8 x double> %shuffle
}
+define <8 x double> @shuffle_v8f64_22222222(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_22222222:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_22222222:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vextractf32x4 $1, %zmm0, %xmm0
+; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_44444444(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_44444444:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_44444444:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00000010:
; AVX512F: # BB#0:
@@ -140,12 +170,12 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_01014545:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01014545:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -187,16 +217,12 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_81a3c5e7:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
-; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vshufpd {{.*#+}} zmm0 = zmm1[0],zmm0[1],zmm1[2],zmm0[3],zmm1[4],zmm0[5],zmm1[6],zmm0[7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_81a3c5e7:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
-; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vshufpd {{.*#+}} zmm0 = zmm1[0],zmm0[1],zmm1[2],zmm0[3],zmm1[4],zmm0[5],zmm1[6],zmm0[7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x double> %shuffle
@@ -403,14 +429,12 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00014445:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,4,4,4,5]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00014445:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,4,0,4,0,4,0,5,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -420,14 +444,12 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00204464:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,4,6,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00204464:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,4,0,6,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x double> %shuffle
@@ -437,14 +459,12 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_03004744:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,7,4,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_03004744:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,7,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -454,14 +474,12 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10005444:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,5,4,4,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10005444:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,5,0,4,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -471,14 +489,12 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_22006644:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,6,4,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_22006644:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,6,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -488,14 +504,12 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_33307774:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,7,7,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_33307774:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,7,0,7,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -505,14 +519,12 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_32107654:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,7,6,5,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_32107654:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -522,14 +534,12 @@ define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00234467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,4,4,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00234467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,4,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -539,14 +549,12 @@ define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00224466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,4,4,6,6]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00224466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,4,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -556,14 +564,12 @@ define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10325476:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,3,2,5,4,7,6]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10325476:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,5,0,4,0,7,0,6,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -573,14 +579,12 @@ define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_11335577:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,3,3,5,5,7,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_11335577:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,5,0,5,0,7,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x double> %shuffle
@@ -590,14 +594,12 @@ define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10235467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,3,5,4,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10235467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,3,0,5,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -607,14 +609,12 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10225466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,2,5,4,6,6]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10225466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,2,0,5,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -777,14 +777,12 @@ define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10324567:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10324567:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -794,14 +792,12 @@ define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_11334567:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_11334567:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,5,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -811,14 +807,12 @@ define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_01235467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01235467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -828,14 +822,12 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_01235466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01235466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -879,14 +871,12 @@ define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_103245uu:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u>
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_103245uu:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u>
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -896,14 +886,12 @@ define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_1133uu67:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7>
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_1133uu67:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0>
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -913,14 +901,12 @@ define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_0uu354uu:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u>
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_0uu354uu:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u>
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -930,14 +916,12 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_uuu3uu66:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,3,u,u,6,6>
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_uuu3uu66:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,u,u,3,0,u,u,u,u,6,0,6,0>
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -994,6 +978,38 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
ret <8 x i64> %shuffle
}
+define <8 x i64> @shuffle_v8i64_44444444(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_44444444:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_44444444:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_66666666(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_66666666:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_66666666:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+ ret <8 x i64> %shuffle
+}
+
define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00000010:
@@ -1389,14 +1405,12 @@ define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00014445:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,4,4,4,5]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00014445:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,4,0,4,0,4,0,5,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x i64> %shuffle
@@ -1406,14 +1420,12 @@ define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00204464:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,4,6,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00204464:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,4,0,6,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x i64> %shuffle
@@ -1423,14 +1435,12 @@ define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_03004744:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,7,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_03004744:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,7,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1440,14 +1450,12 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10005444:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,5,4,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10005444:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,5,0,4,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1457,14 +1465,12 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_22006644:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,6,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_22006644:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,6,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1474,14 +1480,12 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_33307774:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,7,7,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_33307774:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,7,0,7,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -1491,14 +1495,12 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_32107654:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,7,6,5,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_32107654:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -1508,14 +1510,12 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00234467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,4,4,6,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00234467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,4,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1525,14 +1525,12 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00224466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,4,4,6,6]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00224466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,4,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1542,14 +1540,12 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10325476:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,3,2,5,4,7,6]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10325476:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,5,0,4,0,7,0,6,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -1559,14 +1555,12 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_11335577:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,3,3,5,5,7,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_11335577:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,5,0,5,0,7,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i64> %shuffle
@@ -1576,14 +1570,12 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10235467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,3,5,4,6,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10235467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,3,0,5,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1593,14 +1585,12 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10225466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,2,5,4,6,6]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10225466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,2,0,5,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -2096,15 +2086,15 @@ define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1
; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_maskz:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
-; AVX512F-32-NEXT: vpsllvq .LCPI122_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2
; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-32-NEXT: retl
%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
%res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
@@ -2117,15 +2107,15 @@ define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1>
; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshufi64x2_512_mask:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
-; AVX512F-32-NEXT: vpsllvq .LCPI123_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2
; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-32-NEXT: retl
%y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
%res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
@@ -2154,16 +2144,16 @@ define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double>
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
-; AVX512F-32-NEXT: vpsllvq .LCPI125_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-32-NEXT: retl
%x1 = load <8 x double>,<8 x double> *%ptr,align 1
%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
@@ -2177,16 +2167,16 @@ define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double>
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
-; AVX512F-32-NEXT: vpsllvq .LCPI126_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-32-NEXT: retl
%x1 = load <8 x double>,<8 x double> *%ptr,align 1
%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
@@ -2207,3 +2197,59 @@ define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) noun
%res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
ret <16 x float> %res
}
+
+define <8 x double> @shuffle_v8f64_23014567(<8 x double> %a0, <8 x double> %a1) {
+; AVX512F-LABEL: shuffle_v8f64_23014567:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,4,5,6,7]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_23014567:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,4,5,6,7]
+; AVX512F-32-NEXT: retl
+ %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %1
+}
+
+define <8 x double> @shuffle_v8f64_2301uu67(<8 x double> %a0, <8 x double> %a1) {
+; AVX512F-LABEL: shuffle_v8f64_2301uu67:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,0,1,6,7]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_2301uu67:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,0,1,6,7]
+; AVX512F-32-NEXT: retl
+ %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 undef, i32 undef, i32 6, i32 7>
+ ret <8 x double> %1
+}
+
+define <8 x double> @shuffle_v8f64_2301uuuu(<8 x double> %a0, <8 x double> %a1) {
+; AVX512F-LABEL: shuffle_v8f64_2301uuuu:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_2301uuuu:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5]
+; AVX512F-32-NEXT: retl
+ %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x double> %1
+}
+
+define <8 x double> @shuffle_v8f64_uuu2301(<8 x double> %a0, <8 x double> %a1) {
+; AVX512F-LABEL: shuffle_v8f64_uuu2301:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_uuu2301:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 0, i32 1>
+ ret <8 x double> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
new file mode 100644
index 000000000000..ac18bba166f1
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
+;
+; Combine tests involving AVX target shuffles
+
+declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8)
+declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8)
+declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8)
+declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8)
+
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
+
+declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8)
+declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8)
+declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8)
+
+define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_identity:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_movddup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
+ ret <4 x float> %1
+}
+define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_movddup_load:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; ALL-NEXT: retq
+ %1 = load <4 x float>, <4 x float> *%a0
+ %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_movshdup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
+ ret <4 x float> %1
+}
+
+define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_movsldup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
+ ret <4 x float> %1
+}
+
+define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_unpckh:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
+ ret <4 x float> %1
+}
+
+define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_unpckl:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ ret <4 x float> %1
+}
+
+define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_identity:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
+ ret <8 x float> %2
+}
+
+define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_10326u4u:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
+ ret <8 x float> %2
+}
+
+define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+ %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ ret <8 x float> %3
+}
+
+define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
+ %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ ret <8 x float> %3
+}
+
+define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
+; ALL-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
+; ALL: # BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+ %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
+ %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
+ ret <4 x double> %3
+}
+
+define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_movddup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
+ ret <8 x float> %1
+}
+define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_movddup_load:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
+; ALL-NEXT: retq
+ %1 = load <8 x float>, <8 x float> *%a0
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
+ ret <8 x float> %2
+}
+
+define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_movshdup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
+ ret <8 x float> %1
+}
+
+define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_movsldup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
+ ret <8 x float> %1
+}
+
+define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
+; ALL-LABEL: combine_vpermilvar_2f64_identity:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 2, i64 0>)
+ %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> <i64 2, i64 0>)
+ ret <2 x double> %2
+}
+
+define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
+; ALL-LABEL: combine_vpermilvar_2f64_movddup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; ALL-NEXT: retq
+ %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 0, i64 0>)
+ ret <2 x double> %1
+}
+
+define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f64_identity:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
+ %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
+ ret <4 x double> %2
+}
+
+define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f64_movddup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: retq
+ %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
+ ret <4 x double> %1
+}
+
+define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_4stage:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
+ %3 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>)
+ %4 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %3, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ ret <4 x float> %4
+}
+
+define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_4stage:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
+ %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>)
+ %4 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %3, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ ret <8 x float> %4
+}
+
+define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_as_insertps:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
+; ALL-NEXT: retq
+ %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
+ ret <4 x float> %2
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
new file mode 100644
index 000000000000..a10ba6ccc41e
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -0,0 +1,324 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s
+
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
+
+define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_pslldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+ %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <32 x i8> %2
+}
+
+define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_psrldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ ret <32 x i8> %2
+}
+
+define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
+; CHECK-LABEL: combine_pshufb_vpermd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
+; CHECK-NEXT: retq
+ %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
+ %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
+ ret <32 x i8> %tmp2
+}
+
+define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
+; CHECK-LABEL: combine_pshufb_vpermps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
+; CHECK-NEXT: retq
+ %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
+ %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
+ ret <32 x i8> %tmp2
+}
+
+define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
+; CHECK-LABEL: combine_permq_pshufb_as_vperm2i128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
+ %4 = bitcast <32 x i8> %3 to <4 x i64>
+ %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
+ ret <4 x i64> %5
+}
+
+define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
+; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
+ ret <32 x i8> %3
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
+ ret <16 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
+ %4 = bitcast <32 x i8> %3 to <8 x i32>
+ %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
+ %6 = bitcast <8 x i32> %5 to <32 x i8>
+ ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ ret <16 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ %4 = bitcast <32 x i8> %3 to <8 x i32>
+ %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
+ %6 = bitcast <8 x i32> %5 to <32 x i8>
+ ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastd128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0
+; CHECK-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+ %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
+ ret <16 x i8> %2
+}
+
+define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastd256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0
+; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
+ %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %3
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+ ret <16 x i8> %1
+}
+
+define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastq256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+ %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %3
+}
+
+define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = bitcast <4 x float> %a to <16 x i8>
+ %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ ret <4 x float> %3
+}
+
+define <8 x float> @combine_permd_as_vpbroadcastss256(<4 x float> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastss256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
+ ret <8 x float> %2
+}
+
+define <4 x double> @combine_permd_as_vpbroadcastsd256(<2 x double> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastsd256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <4 x double> %1 to <8 x float>
+ %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+ %4 = bitcast <8 x float> %3 to <4 x double>
+ ret <4 x double> %4
+}
+
+define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
+; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
+ %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
+ ret <16 x i8> %2
+}
+
+define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
+; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
+ %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
+ ret <32 x i8> %2
+}
+
+define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
+; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = bitcast <4 x float> %1 to <16 x i8>
+ %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+ %4 = bitcast <16 x i8> %3 to <4 x float>
+ ret <4 x float> %4
+}
+
+define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
+; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
+ ret <8 x float> %2
+}
+
+define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
+; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
+ %2 = bitcast <4 x double> %1 to <8 x float>
+ %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+ %4 = bitcast <8 x float> %3 to <4 x double>
+ ret <4 x double> %4
+}
+
+define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_permq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,1]
+; CHECK-NEXT: retq
+ %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
+ ret <8 x i32> %1
+}
+
+define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
+; CHECK-LABEL: combine_permps_as_permpd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
+; CHECK-NEXT: retq
+ %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
+ ret <8 x float> %1
+}
+
+define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pslldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
+ ret <32 x i8> %res0
+}
+
+define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_psrldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ ret <32 x i8> %res0
+}
+
+define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pshuflw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ ret <32 x i8> %res0
+}
+
+define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pshufhw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
+ ret <32 x i8> %res0
+}
+
+define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_not_as_pshufw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
+ ret <32 x i8> %res1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
new file mode 100644
index 000000000000..baf1054170ba
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s
+
+declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
+declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
+; CHECK-LABEL: combine_permvar_8f64_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
+ ret <8 x double> %res1
+}
+define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
+; CHECK-LABEL: combine_permvar_8f64_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
+; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 %m)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 %m)
+ ret <8 x double> %res1
+}
+
+define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
+; CHECK-LABEL: combine_permvar_8i64_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
+ ret <8 x i64> %res1
+}
+define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
+; CHECK-LABEL: combine_permvar_8i64_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
+; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 %m)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 %m)
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
+; CHECK-LABEL: combine_vpermt2var_8f64_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
+ ret <8 x double> %res1
+}
+define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
+; CHECK-LABEL: combine_vpermt2var_8f64_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; CHECK-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
+ ret <8 x double> %res1
+}
+
+define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
+; CHECK-LABEL: combine_vpermt2var_8f64_movddup:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
+ ret <8 x double> %res0
+}
+define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
+; CHECK-LABEL: combine_vpermt2var_8f64_movddup_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %x0 = load <8 x double>, <8 x double> *%p0
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
+ ret <8 x double> %res0
+}
+define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
+; CHECK-LABEL: combine_vpermt2var_8f64_movddup_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
+ ret <8 x double> %res0
+}
+
+define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
+; CHECK-LABEL: combine_vpermt2var_8i64_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
+ ret <8 x i64> %res1
+}
+define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
+; CHECK-LABEL: combine_vpermt2var_8i64_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; CHECK-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
+ ret <8 x i64> %res1
+}
+
+define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
+ ret <16 x float> %res1
+}
+define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; CHECK-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
+ ret <16 x float> %res1
+}
+
+define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+
+define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+
+define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+
+define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+
+define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16i32_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
+ ret <16 x i32> %res1
+}
+define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16i32_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; CHECK-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
+ %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
+ ret <16 x i32> %res1
+}
+
+define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
+; CHECK-LABEL: combine_vpermt2var_32i16_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 -1)
+ ret <32 x i16> %res1
+}
+define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) {
+; CHECK-LABEL: combine_vpermt2var_32i16_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; CHECK-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
+ %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)
+ ret <32 x i16> %res1
+}
+
+define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
+; CHECK-LABEL: combine_pshufb_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
+ %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
+ ret <64 x i8> %res1
+}
+define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
+; CHECK-LABEL: combine_pshufb_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
+; CHECK-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
+ %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
+ ret <64 x i8> %res1
+}
+
+define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastw512:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %1
+}
+
+define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer, <16 x i32> undef, i16 -1)
+ ret <16 x i32> %1
+}
+
+define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastq512:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer, <8 x i64> undef, i8 -1)
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
+; CHECK-LABEL: combine_permvar_8i64_as_permq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %1
+}
+define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
+; CHECK-LABEL: combine_permvar_8i64_as_permq_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x i64> %x1, i8 %m)
+ ret <8 x i64> %1
+}
+
+define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) {
+; CHECK-LABEL: combine_permvar_8f64_as_permpd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: retq
+ %1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1)
+ ret <8 x double> %1
+}
+define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
+; CHECK-LABEL: combine_permvar_8f64_as_permpd_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 %m)
+ ret <8 x double> %1
+}
+
+define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
+; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
+ ret <16 x float> %res1
+}
+
+define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pslldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; CHECK-NEXT: retq
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> undef, i64 -1)
+ ret <64 x i8> %res0
+}
+define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
+; CHECK-LABEL: combine_pshufb_as_pslldq_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; CHECK-NEXT: retq
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> zeroinitializer, i64 %m)
+ ret <64 x i8> %res0
+}
+
+define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_psrldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> undef, i64 -1)
+ ret <64 x i8> %res0
+}
+define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
+; CHECK-LABEL: combine_pshufb_as_psrldq_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> zeroinitializer, i64 %m)
+ ret <64 x i8> %res0
+}
+
+define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
+; CHECK-LABEL: combine_permvar_as_pshuflw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %res0
+}
+
+define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pshufhw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %res0
+}
+
+define <32 x i16> @combine_pshufb_as_pshufw(<32 x i16> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pshufw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %res0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %res1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
new file mode 100644
index 000000000000..85e1071a35aa
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
+;
+; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @combine_vpshufb_zero(<16 x i8> %a0) {
+; SSE-LABEL: combine_vpshufb_zero:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_vpshufb_zero:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ ret <16 x i8> %res2
+}
+
+define <16 x i8> @combine_vpshufb_movq(<16 x i8> %a0) {
+; SSE-LABEL: combine_vpshufb_movq:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_vpshufb_movq:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>)
+ ret <16 x i8> %res1
+}
+
+define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
+; SSE-LABEL: combine_pshufb_movddup:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_movddup:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a0 to <16 x i8>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x float> %4
+}
+
+define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
+; SSE-LABEL: combine_pshufb_movshdup:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_movshdup:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a0 to <16 x i8>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x float> %4
+}
+
+define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
+; SSE-LABEL: combine_pshufb_movsldup:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_movsldup:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a0 to <16 x i8>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x float> %4
+}
+
+define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: combine_pshufb_palignr:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_palignr:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_pslldq:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_pslldq:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_psrldq:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_psrldq:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_pslldq:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_pslldq:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_psrldq:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_psrldq:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_pshuflw:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_pshuflw:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_pshufhw:
+; SSE: # BB#0:
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_pshufhw:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_not_as_pshufw:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_not_as_pshufw:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
+ ret <16 x i8> %res1
+}
+
+define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: combine_unpckl_arg0_pshufb:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_unpckl_arg0_pshufb:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
+; AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: combine_unpckl_arg1_pshufb:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_unpckl_arg1_pshufb:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
+ ret <16 x i8> %2
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
new file mode 100644
index 000000000000..76226065fd7c
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s
+
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
+
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: combine_vpermil2pd_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> <i64 2, i64 0>, i8 0)
+ %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x i64> <i64 2, i64 0>, i8 0)
+ ret <2 x double> %res1
+}
+
+define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: combine_vpermil2pd256_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
+ %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
+ ret <4 x double> %res1
+}
+
+define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: combine_vpermil2ps_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
+ %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
+ ret <4 x float> %res1
+}
+
+define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: combine_vpermil2ps256_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
+ %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
+ ret <8 x float> %res1
+}
+
+define <8 x float> @combine_vpermil2ps256_zero(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: combine_vpermil2ps256_zero:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>, i8 2)
+ ret <8 x float> %res0
+}
+
+define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: combine_vpermil2ps_blend_with_zero:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: retq
+ %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 8, i32 1, i32 2, i32 3>, i8 2)
+ ret <4 x float> %res0
+}
+
+define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
+ %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+ ret <16 x i8> %res1
+}
+
+define <16 x i8> @combine_vpperm_zero(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_zero:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res1, <16 x i8> undef, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ ret <16 x i8> %res2
+}
+
+define <16 x i8> @combine_vpperm_identity_bitcast(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_identity_bitcast:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %mask = bitcast <2 x i64> <i64 1084818905618843912, i64 506097522914230528> to <16 x i8>
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %mask)
+ %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> %mask)
+ %res2 = bitcast <16 x i8> %res1 to <2 x i64>
+ %res3 = add <2 x i64> %res2, <i64 1084818905618843912, i64 506097522914230528>
+ %res4 = bitcast <2 x i64> %res3 to <16 x i8>
+ ret <16 x i8> %res4
+}
+
+define <16 x i8> @combine_vpperm_as_blend_with_zero(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_as_blend_with_zero:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 0, i8 1, i8 128, i8 129, i8 4, i8 5, i8 6, i8 7, i8 130, i8 131, i8 132, i8 133, i8 134, i8 135, i8 136, i8 137>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_vpperm_as_unary_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_as_unary_unpckhwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_vpperm_as_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_as_unpckhwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
+ ret <16 x i8> %res0
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 75ce9753525b..266a3658eda9 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -96,10 +96,15 @@ define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_pshufd6:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_pshufd6:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_pshufd6:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
+; AVX2-NEXT: retq
entry:
%b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
%c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
@@ -1783,13 +1788,13 @@ define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movhpd (%rsi), %xmm0
+; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test22:
; AVX: # BB#0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
; Current AVX2 lowering of this is still awful, not adding a test case.
%1 = load <2 x float>, <2 x float>* %a, align 8
@@ -1798,6 +1803,29 @@ define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
ret <8 x float> %3
}
+; PR22359
+define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
+; SSE-LABEL: combine_test23:
+; SSE: # BB#0:
+; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test23:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1
+ %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
+ %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
+ store <2 x float> %shuffle0, <2 x float>* %ptr, align 8
+ store <2 x float> %shuffle1, <2 x float>* %idx2, align 8
+ ret void
+}
+
; Check some negative cases.
; FIXME: Do any of these really make sense? Are they redundant with the above tests?
@@ -2412,7 +2440,7 @@ define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
;
; AVX-LABEL: combine_undef_input_test9:
; AVX: # BB#0:
-; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@@ -2603,7 +2631,7 @@ define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
;
; AVX-LABEL: combine_undef_input_test19:
; AVX: # BB#0:
-; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
%2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -2636,15 +2664,16 @@ define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
%b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
@@ -2795,6 +2824,50 @@ define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %d
}
+; FIXME: Failed to recognise that the VMOVSD has already zero'd the upper element
+define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
+; SSE2-LABEL: combine_scalar_load_with_blend_with_zero:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: movaps %xmm0, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_scalar_load_with_blend_with_zero:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: movaps %xmm0, (%rsi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_scalar_load_with_blend_with_zero:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: xorpd %xmm1, %xmm1
+; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE41-NEXT: movapd %xmm1, (%rsi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovapd %xmm0, (%rsi)
+; AVX-NEXT: retq
+ %1 = load double, double* %a0, align 8
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
+ %4 = bitcast <2 x double> %3 to <4 x float>
+ %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+ store <4 x float> %5, <4 x float>* %a1, align 16
+ ret void
+}
+
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
; SSE: # BB#0: # %entry
@@ -2898,8 +2971,8 @@ define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; AVX2-LABEL: PR22412:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; AVX2-NEXT: retq
entry:
%s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
index 548de4ce6ea3..fc4652eca55d 100644
--- a/test/CodeGen/X86/vector-shuffle-sse1.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -91,6 +91,22 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0145:
+; SSE1: # BB#0:
+; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_6723:
+; SSE1: # BB#0:
+; SSE1-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
; SSE1-LABEL: shuffle_v4f32_4zzz:
@@ -194,7 +210,7 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE1-NEXT: xorps %xmm2, %xmm2
-; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE1-NEXT: movaps %xmm1, %xmm0
; SSE1-NEXT: retq
@@ -215,8 +231,8 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE1-NEXT: xorps %xmm2, %xmm2
-; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE1-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -234,3 +250,21 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
%shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
+; SSE1-LABEL: shuffle_mem_v4f32_0145:
+; SSE1: # BB#0:
+; SSE1-NEXT: movhps (%rdi), %xmm0
+; SSE1-NEXT: retq
+ %b = load <4 x float>, <4 x float>* %pb, align 16
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) {
+; SSE1-LABEL: shuffle_mem_v4f32_6723:
+; SSE1: # BB#0:
+; SSE1-NEXT: movlps 8(%rdi), %xmm0
+; SSE1-NEXT: retq
+ %b = load <4 x float>, <4 x float>* %pb, align 16
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-sse41.ll b/test/CodeGen/X86/vector-shuffle-sse41.ll
new file mode 100644
index 000000000000..be9a4b950778
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-sse41.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
+define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; SSE41-LABEL: blend_packusdw:
+; SSE41: # BB#0:
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packusdw:
+; AVX: # BB#0:
+; AVX-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+ %p1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
+ %s0 = shufflevector <8 x i16> %p0, <8 x i16> %p1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %s0
+}
+
+define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE41-LABEL: blend_packuswb:
+; SSE41: # BB#0:
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packuswb:
+; AVX: # BB#0:
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %p0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
+ %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
+ %s0 = shufflevector <16 x i8> %p0, <16 x i8> %p1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i8> %s0
+}
+
+define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE41-LABEL: blend_packusdw_packuswb:
+; SSE41: # BB#0:
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packusdw_packuswb:
+; AVX: # BB#0:
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+ %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
+ %b1 = bitcast <16 x i8> %p1 to <8 x i16>
+ %s0 = shufflevector <8 x i16> %p0, <8 x i16> %b1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %s0
+}
+
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index a387f894a067..1c128645ad14 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -13,11 +13,11 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
@@ -35,14 +35,14 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: movb $1, %al
; VL_BW_DQ-NEXT: kmovb %eax, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
-; VL_BW_DQ-NEXT: vpalignr $8, %xmm0, %xmm1, %xmm0
+; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
@@ -59,11 +59,11 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -74,13 +74,13 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -91,7 +91,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <8 x i64> %a, %a1
@@ -105,14 +105,14 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0
-; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -125,7 +125,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
; VL_BW_DQ-NEXT: vpslld $31, %zmm1, %zmm0
-; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <16 x i32> %a, %a1
@@ -162,15 +162,14 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
-; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -178,10 +177,10 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
-; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm0
+; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@@ -192,15 +191,16 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
@@ -211,8 +211,9 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
@@ -223,23 +224,25 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
+; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -250,15 +253,16 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
@@ -269,8 +273,9 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
@@ -281,15 +286,16 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
@@ -300,8 +306,9 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
@@ -312,19 +319,18 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: movb $51, %al
-; AVX512F-NEXT: movzbl %al, %eax
; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
@@ -337,8 +343,9 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
@@ -352,13 +359,14 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
@@ -367,11 +375,12 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; VL_BW_DQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
%c1 = bitcast <8 x i1>%c to i8
@@ -383,11 +392,13 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
@@ -396,8 +407,9 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovw %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i16 %a to <16 x i1>
%c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
@@ -417,8 +429,22 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: .Ltmp2:
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: movb $0, (%rsp)
+; AVX512F-NEXT: subq $96, %rsp
+; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %ecx
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: shlq $32, %rax
@@ -429,11 +455,15 @@ define i64 @shuf64i1_zero(i64 %a) {
;
; VL_BW_DQ-LABEL: shuf64i1_zero:
; VL_BW_DQ: # BB#0:
-; VL_BW_DQ-NEXT: kxorq %k0, %k0, %k0
+; VL_BW_DQ-NEXT: kmovq %rdi, %k0
+; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0
+; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
+; VL_BW_DQ-NEXT: vpsllw $7, %zmm0, %zmm0
+; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovq %k0, %rax
; VL_BW_DQ-NEXT: retq
%b = bitcast i64 %a to <64 x i1>
- %c = shufflevector < 64 x i1> zeroinitializer, <64 x i1> undef, <64 x i32> zeroinitializer
+ %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
%d = bitcast <64 x i1> %c to i64
ret i64 %d
}
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
new file mode 100644
index 000000000000..d130e7ff00b2
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -0,0 +1,1321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Unary shuffle indices from registers
+;
+
+define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind {
+; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: retq
+ %x0 = extractelement <2 x double> %x, i64 %i0
+ %x1 = extractelement <2 x double> %x, i64 %i1
+ %r0 = insertelement <2 x double> undef, double %x0, i32 0
+ %r1 = insertelement <2 x double> %r0, double %x1, i32 1
+ ret <2 x double> %r1
+}
+
+define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind {
+; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
+; SSE: # BB#0:
+; SSE-NEXT: movslq %edi, %rax
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movslq %esi, %rcx
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
+; AVX: # BB#0:
+; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %esi, %rcx
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %x0 = extractelement <2 x i64> %x, i32 %i0
+ %x1 = extractelement <2 x i64> %x, i32 %i1
+ %r0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <2 x i64> %r0, i64 %x1, i32 1
+ ret <2 x i64> %r1
+}
+
+define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
+; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movslq %edi, %rax
+; SSE2-NEXT: movslq %esi, %rsi
+; SSE2-NEXT: movslq %edx, %rdx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movslq %ecx, %rcx
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movslq %edi, %rax
+; SSSE3-NEXT: movslq %esi, %rsi
+; SSSE3-NEXT: movslq %edx, %rdx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movslq %ecx, %rcx
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movslq %edi, %rax
+; SSE41-NEXT: movslq %esi, %rsi
+; SSE41-NEXT: movslq %edx, %rdx
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movslq %ecx, %rcx
+; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: movslq %esi, %rsi
+; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT: retq
+ %x0 = extractelement <4 x float> %x, i32 %i0
+ %x1 = extractelement <4 x float> %x, i32 %i1
+ %x2 = extractelement <4 x float> %x, i32 %i2
+ %x3 = extractelement <4 x float> %x, i32 %i3
+ %r0 = insertelement <4 x float> undef, float %x0, i32 0
+ %r1 = insertelement <4 x float> %r0, float %x1, i32 1
+ %r2 = insertelement <4 x float> %r1, float %x2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %x3, i32 3
+ ret <4 x float> %r3
+}
+
+define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
+; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movslq %edi, %rax
+; SSE2-NEXT: movslq %esi, %rsi
+; SSE2-NEXT: movslq %edx, %rdx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movslq %ecx, %rcx
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movslq %edi, %rax
+; SSSE3-NEXT: movslq %esi, %rsi
+; SSSE3-NEXT: movslq %edx, %rdx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movslq %ecx, %rcx
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movslq %edi, %rax
+; SSE41-NEXT: movslq %esi, %rsi
+; SSE41-NEXT: movslq %edx, %rdx
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movslq %ecx, %rcx
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, -24(%rsp,%rsi,4), %xmm0
+; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0
+; SSE41-NEXT: pinsrd $3, -24(%rsp,%rcx,4), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: movslq %esi, %rsi
+; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x0 = extractelement <4 x i32> %x, i32 %i0
+ %x1 = extractelement <4 x i32> %x, i32 %i1
+ %x2 = extractelement <4 x i32> %x, i32 %i2
+ %x3 = extractelement <4 x i32> %x, i32 %i3
+ %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+ %r1 = insertelement <4 x i32> %r0, i32 %x1, i32 1
+ %r2 = insertelement <4 x i32> %r1, i32 %x2, i32 2
+ %r3 = insertelement <4 x i32> %r2, i32 %x3, i32 3
+ ret <4 x i32> %r3
+}
+
+define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
+; SSE2-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: movswq %di, %rax
+; SSE2-NEXT: movswq %si, %rsi
+; SSE2-NEXT: movswq %dx, %rdx
+; SSE2-NEXT: movswq %cx, %r10
+; SSE2-NEXT: movswq %r8w, %r11
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movswq %r9w, %r8
+; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%r11,2), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: movd %edi, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movd %esi, %xmm1
+; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: movswq %di, %rax
+; SSSE3-NEXT: movswq %si, %rsi
+; SSSE3-NEXT: movswq %dx, %rdx
+; SSSE3-NEXT: movswq %cx, %r10
+; SSSE3-NEXT: movswq %r8w, %r11
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movswq %r9w, %r8
+; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rdi
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%r11,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: movd %edi, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: movd %esi, %xmm1
+; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: pushq %rbx
+; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: movswq %di, %rax
+; SSE41-NEXT: movswq %si, %rbx
+; SSE41-NEXT: movswq %dx, %r11
+; SSE41-NEXT: movswq %cx, %r10
+; SSE41-NEXT: movswq %r8w, %rdi
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movswq %r9w, %rcx
+; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rsi
+; SSE41-NEXT: movzwl -16(%rsp,%rdx,2), %edx
+; SSE41-NEXT: movzwl -16(%rsp,%rsi,2), %esi
+; SSE41-NEXT: movzwl -16(%rsp,%rax,2), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pinsrw $1, -16(%rsp,%rbx,2), %xmm0
+; SSE41-NEXT: pinsrw $2, -16(%rsp,%r11,2), %xmm0
+; SSE41-NEXT: pinsrw $3, -16(%rsp,%r10,2), %xmm0
+; SSE41-NEXT: pinsrw $4, -16(%rsp,%rdi,2), %xmm0
+; SSE41-NEXT: pinsrw $5, -16(%rsp,%rcx,2), %xmm0
+; SSE41-NEXT: pinsrw $6, %edx, %xmm0
+; SSE41-NEXT: pinsrw $7, %esi, %xmm0
+; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: movswq %di, %r10
+; AVX-NEXT: movswq %si, %r11
+; AVX-NEXT: movswq %dx, %r14
+; AVX-NEXT: movswq %cx, %rcx
+; AVX-NEXT: movswq %r8w, %rdi
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movswq %r9w, %rax
+; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; AVX-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; AVX-NEXT: movzwl -24(%rsp,%r10,2), %ebx
+; AVX-NEXT: vmovd %ebx, %xmm0
+; AVX-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, -24(%rsp,%r14,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $4, -24(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r14
+; AVX-NEXT: retq
+ %x0 = extractelement <8 x i16> %x, i16 %i0
+ %x1 = extractelement <8 x i16> %x, i16 %i1
+ %x2 = extractelement <8 x i16> %x, i16 %i2
+ %x3 = extractelement <8 x i16> %x, i16 %i3
+ %x4 = extractelement <8 x i16> %x, i16 %i4
+ %x5 = extractelement <8 x i16> %x, i16 %i5
+ %x6 = extractelement <8 x i16> %x, i16 %i6
+ %x7 = extractelement <8 x i16> %x, i16 %i7
+ %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+ %r1 = insertelement <8 x i16> %r0, i16 %x1, i32 1
+ %r2 = insertelement <8 x i16> %r1, i16 %x2, i32 2
+ %r3 = insertelement <8 x i16> %r2, i16 %x3, i32 3
+ %r4 = insertelement <8 x i16> %r3, i16 %x4, i32 4
+ %r5 = insertelement <8 x i16> %r4, i16 %x5, i32 5
+ %r6 = insertelement <8 x i16> %r5, i16 %x6, i32 6
+ %r7 = insertelement <8 x i16> %r6, i16 %x7, i32 7
+ ret <8 x i16> %r7
+}
+
+define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %i0, i8 %i1, i8 %i2, i8 %i3, i8 %i4, i8 %i5, i8 %i6, i8 %i7, i8 %i8, i8 %i9, i8 %i10, i8 %i11, i8 %i12, i8 %i13, i8 %i14, i8 %i15) nounwind {
+; SSE2-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT: movzbl (%r10,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: movsbq %dl, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movsbq %dil, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm11
+; SSE2-NEXT: movsbq %r8b, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm7
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: movsbq %cl, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: movsbq %sil, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movsbq %r9b, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r11
+; SSSE3-NEXT: movzbl (%r10,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm8
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm9
+; SSSE3-NEXT: movsbq %dl, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm10
+; SSSE3-NEXT: movsbq %dil, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm11
+; SSSE3-NEXT: movsbq %r8b, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm7
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm12
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm13
+; SSSE3-NEXT: movsbq %cl, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm6
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: movsbq %sil, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm5
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movsbq %r9b, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pushq %rbp
+; SSE41-NEXT: pushq %r15
+; SSE41-NEXT: pushq %r14
+; SSE41-NEXT: pushq %r13
+; SSE41-NEXT: pushq %r12
+; SSE41-NEXT: pushq %rbx
+; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: movsbq %dil, %r15
+; SSE41-NEXT: movsbq %sil, %r14
+; SSE41-NEXT: movsbq %dl, %r11
+; SSE41-NEXT: movsbq %cl, %r10
+; SSE41-NEXT: movsbq %r8b, %r8
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movsbq %r9b, %r9
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r12
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r13
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbx
+; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SSE41-NEXT: movzbl (%r15,%rax), %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r15
+; SSE41-NEXT: pinsrb $1, (%r14,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r14
+; SSE41-NEXT: pinsrb $2, (%r11,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r11
+; SSE41-NEXT: pinsrb $3, (%r10,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: movzbl (%r12,%rax), %esi
+; SSE41-NEXT: movzbl (%r13,%rax), %edi
+; SSE41-NEXT: movzbl (%rbp,%rax), %ebp
+; SSE41-NEXT: movzbl (%rbx,%rax), %ebx
+; SSE41-NEXT: movzbl (%r15,%rax), %r8d
+; SSE41-NEXT: movzbl (%r14,%rax), %r9d
+; SSE41-NEXT: movzbl (%r11,%rax), %r11d
+; SSE41-NEXT: movzbl (%r10,%rax), %r10d
+; SSE41-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE41-NEXT: movzbl (%rdx,%rax), %eax
+; SSE41-NEXT: pinsrb $6, %esi, %xmm0
+; SSE41-NEXT: pinsrb $7, %edi, %xmm0
+; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT: pinsrb $9, %ebx, %xmm0
+; SSE41-NEXT: pinsrb $10, %r8d, %xmm0
+; SSE41-NEXT: pinsrb $11, %r9d, %xmm0
+; SSE41-NEXT: pinsrb $12, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $13, %r10d, %xmm0
+; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
+; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: popq %r12
+; SSE41-NEXT: popq %r13
+; SSE41-NEXT: popq %r14
+; SSE41-NEXT: popq %r15
+; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %r13
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: movsbq %dil, %r10
+; AVX-NEXT: movsbq %sil, %r11
+; AVX-NEXT: movsbq %dl, %r14
+; AVX-NEXT: movsbq %cl, %r15
+; AVX-NEXT: movsbq %r8b, %r8
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movsbq %r9b, %r9
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r12
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r13
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rdi
+; AVX-NEXT: movzbl (%r10,%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; AVX-NEXT: vpinsrb $1, (%r11,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r11
+; AVX-NEXT: vpinsrb $2, (%r14,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r14
+; AVX-NEXT: vpinsrb $3, (%r15,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r15
+; AVX-NEXT: vpinsrb $4, (%r8,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r8
+; AVX-NEXT: vpinsrb $5, (%r9,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT: movzbl (%r12,%rdi), %edx
+; AVX-NEXT: movzbl (%r13,%rdi), %ebx
+; AVX-NEXT: movzbl (%rbp,%rdi), %ebp
+; AVX-NEXT: movzbl (%rcx,%rdi), %ecx
+; AVX-NEXT: movzbl (%r10,%rdi), %eax
+; AVX-NEXT: movzbl (%r11,%rdi), %r9d
+; AVX-NEXT: movzbl (%r14,%rdi), %r10d
+; AVX-NEXT: movzbl (%r15,%rdi), %r11d
+; AVX-NEXT: movzbl (%r8,%rdi), %r8d
+; AVX-NEXT: movzbl (%rsi,%rdi), %esi
+; AVX-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %esi, %xmm0, %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r12
+; AVX-NEXT: popq %r13
+; AVX-NEXT: popq %r14
+; AVX-NEXT: popq %r15
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: retq
+ %x0 = extractelement <16 x i8> %x, i8 %i0
+ %x1 = extractelement <16 x i8> %x, i8 %i1
+ %x2 = extractelement <16 x i8> %x, i8 %i2
+ %x3 = extractelement <16 x i8> %x, i8 %i3
+ %x4 = extractelement <16 x i8> %x, i8 %i4
+ %x5 = extractelement <16 x i8> %x, i8 %i5
+ %x6 = extractelement <16 x i8> %x, i8 %i6
+ %x7 = extractelement <16 x i8> %x, i8 %i7
+ %x8 = extractelement <16 x i8> %x, i8 %i8
+ %x9 = extractelement <16 x i8> %x, i8 %i9
+ %x10 = extractelement <16 x i8> %x, i8 %i10
+ %x11 = extractelement <16 x i8> %x, i8 %i11
+ %x12 = extractelement <16 x i8> %x, i8 %i12
+ %x13 = extractelement <16 x i8> %x, i8 %i13
+ %x14 = extractelement <16 x i8> %x, i8 %i14
+ %x15 = extractelement <16 x i8> %x, i8 %i15
+ %r0 = insertelement <16 x i8> undef, i8 %x0 , i32 0
+ %r1 = insertelement <16 x i8> %r0 , i8 %x1 , i32 1
+ %r2 = insertelement <16 x i8> %r1 , i8 %x2 , i32 2
+ %r3 = insertelement <16 x i8> %r2 , i8 %x3 , i32 3
+ %r4 = insertelement <16 x i8> %r3 , i8 %x4 , i32 4
+ %r5 = insertelement <16 x i8> %r4 , i8 %x5 , i32 5
+ %r6 = insertelement <16 x i8> %r5 , i8 %x6 , i32 6
+ %r7 = insertelement <16 x i8> %r6 , i8 %x7 , i32 7
+ %r8 = insertelement <16 x i8> %r7 , i8 %x8 , i32 8
+ %r9 = insertelement <16 x i8> %r8 , i8 %x9 , i32 9
+ %r10 = insertelement <16 x i8> %r9 , i8 %x10, i32 10
+ %r11 = insertelement <16 x i8> %r10, i8 %x11, i32 11
+ %r12 = insertelement <16 x i8> %r11, i8 %x12, i32 12
+ %r13 = insertelement <16 x i8> %r12, i8 %x13, i32 13
+ %r14 = insertelement <16 x i8> %r13, i8 %x14, i32 14
+ %r15 = insertelement <16 x i8> %r14, i8 %x15, i32 15
+ ret <16 x i8> %r15
+}
+
+;
+; Unary shuffle indices from memory
+;
+
+define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind {
+; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movslq (%rdi), %rax
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movslq 4(%rdi), %rcx
+; SSE2-NEXT: movslq 8(%rdi), %rdx
+; SSE2-NEXT: movslq 12(%rdi), %rsi
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movslq (%rdi), %rax
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movslq 4(%rdi), %rcx
+; SSSE3-NEXT: movslq 8(%rdi), %rdx
+; SSSE3-NEXT: movslq 12(%rdi), %rsi
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movslq (%rdi), %rax
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movslq 4(%rdi), %rcx
+; SSE41-NEXT: movslq 8(%rdi), %rdx
+; SSE41-NEXT: movslq 12(%rdi), %rsi
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, -24(%rsp,%rcx,4), %xmm0
+; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0
+; SSE41-NEXT: pinsrd $3, -24(%rsp,%rsi,4), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movslq (%rdi), %rax
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq 4(%rdi), %rcx
+; AVX-NEXT: movslq 8(%rdi), %rdx
+; AVX-NEXT: movslq 12(%rdi), %rsi
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %p0 = getelementptr inbounds i32, i32* %i, i64 0
+ %p1 = getelementptr inbounds i32, i32* %i, i64 1
+ %p2 = getelementptr inbounds i32, i32* %i, i64 2
+ %p3 = getelementptr inbounds i32, i32* %i, i64 3
+ %i0 = load i32, i32* %p0, align 4
+ %i1 = load i32, i32* %p1, align 4
+ %i2 = load i32, i32* %p2, align 4
+ %i3 = load i32, i32* %p3, align 4
+ %x0 = extractelement <4 x i32> %x, i32 %i0
+ %x1 = extractelement <4 x i32> %x, i32 %i1
+ %x2 = extractelement <4 x i32> %x, i32 %i2
+ %x3 = extractelement <4 x i32> %x, i32 %i3
+ %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+ %r1 = insertelement <4 x i32> %r0, i32 %x1, i32 1
+ %r2 = insertelement <4 x i32> %r1, i32 %x2, i32 2
+ %r3 = insertelement <4 x i32> %r2, i32 %x3, i32 3
+ ret <4 x i32> %r3
+}
+
+define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind {
+; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsbq (%rdi), %rcx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movsbq 8(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm8
+; SSE2-NEXT: movsbq 12(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm9
+; SSE2-NEXT: movsbq 4(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: movsbq 14(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm10
+; SSE2-NEXT: movsbq 6(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm5
+; SSE2-NEXT: movsbq 10(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm11
+; SSE2-NEXT: movsbq 2(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm7
+; SSE2-NEXT: movsbq 15(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm12
+; SSE2-NEXT: movsbq 7(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: movsbq 11(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm13
+; SSE2-NEXT: movsbq 3(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm6
+; SSE2-NEXT: movsbq 13(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm14
+; SSE2-NEXT: movsbq 5(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm4
+; SSE2-NEXT: movsbq 9(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm15
+; SSE2-NEXT: movsbq 1(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsbq (%rdi), %rcx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: movsbq 8(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm8
+; SSSE3-NEXT: movsbq 12(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm9
+; SSSE3-NEXT: movsbq 4(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: movsbq 14(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm10
+; SSSE3-NEXT: movsbq 6(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm5
+; SSSE3-NEXT: movsbq 10(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm11
+; SSSE3-NEXT: movsbq 2(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm7
+; SSSE3-NEXT: movsbq 15(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm12
+; SSSE3-NEXT: movsbq 7(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: movsbq 11(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm13
+; SSSE3-NEXT: movsbq 3(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm6
+; SSSE3-NEXT: movsbq 13(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm14
+; SSSE3-NEXT: movsbq 5(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm4
+; SSSE3-NEXT: movsbq 9(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm15
+; SSSE3-NEXT: movsbq 1(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pushq %rbp
+; SSE41-NEXT: pushq %r15
+; SSE41-NEXT: pushq %r14
+; SSE41-NEXT: pushq %r13
+; SSE41-NEXT: pushq %r12
+; SSE41-NEXT: pushq %rbx
+; SSE41-NEXT: movsbq (%rdi), %rax
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movsbq 1(%rdi), %r15
+; SSE41-NEXT: movsbq 2(%rdi), %r8
+; SSE41-NEXT: movsbq 3(%rdi), %r9
+; SSE41-NEXT: movsbq 4(%rdi), %r10
+; SSE41-NEXT: movsbq 5(%rdi), %r11
+; SSE41-NEXT: movsbq 6(%rdi), %r14
+; SSE41-NEXT: movsbq 7(%rdi), %r12
+; SSE41-NEXT: movsbq 8(%rdi), %r13
+; SSE41-NEXT: movsbq 9(%rdi), %rdx
+; SSE41-NEXT: movsbq 10(%rdi), %rcx
+; SSE41-NEXT: movsbq 11(%rdi), %rsi
+; SSE41-NEXT: movsbq 12(%rdi), %rbx
+; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
+; SSE41-NEXT: movzbl (%rax,%rbp), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movsbq 13(%rdi), %rax
+; SSE41-NEXT: pinsrb $1, (%r15,%rbp), %xmm0
+; SSE41-NEXT: movsbq 14(%rdi), %r15
+; SSE41-NEXT: movsbq 15(%rdi), %rdi
+; SSE41-NEXT: movzbl (%rdi,%rbp), %edi
+; SSE41-NEXT: movzbl (%r15,%rbp), %r15d
+; SSE41-NEXT: movzbl (%rax,%rbp), %eax
+; SSE41-NEXT: movzbl (%rbx,%rbp), %ebx
+; SSE41-NEXT: movzbl (%rsi,%rbp), %esi
+; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx
+; SSE41-NEXT: movzbl (%rdx,%rbp), %edx
+; SSE41-NEXT: movzbl (%r13,%rbp), %r13d
+; SSE41-NEXT: movzbl (%r12,%rbp), %r12d
+; SSE41-NEXT: movzbl (%r14,%rbp), %r14d
+; SSE41-NEXT: movzbl (%r11,%rbp), %r11d
+; SSE41-NEXT: movzbl (%r10,%rbp), %r10d
+; SSE41-NEXT: movzbl (%r9,%rbp), %r9d
+; SSE41-NEXT: movzbl (%r8,%rbp), %ebp
+; SSE41-NEXT: pinsrb $2, %ebp, %xmm0
+; SSE41-NEXT: pinsrb $3, %r9d, %xmm0
+; SSE41-NEXT: pinsrb $4, %r10d, %xmm0
+; SSE41-NEXT: pinsrb $5, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $6, %r14d, %xmm0
+; SSE41-NEXT: pinsrb $7, %r12d, %xmm0
+; SSE41-NEXT: pinsrb $8, %r13d, %xmm0
+; SSE41-NEXT: pinsrb $9, %edx, %xmm0
+; SSE41-NEXT: pinsrb $10, %ecx, %xmm0
+; SSE41-NEXT: pinsrb $11, %esi, %xmm0
+; SSE41-NEXT: pinsrb $12, %ebx, %xmm0
+; SSE41-NEXT: pinsrb $13, %eax, %xmm0
+; SSE41-NEXT: pinsrb $14, %r15d, %xmm0
+; SSE41-NEXT: pinsrb $15, %edi, %xmm0
+; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: popq %r12
+; SSE41-NEXT: popq %r13
+; SSE41-NEXT: popq %r14
+; SSE41-NEXT: popq %r15
+; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %r13
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: movsbq (%rdi), %rsi
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movsbq 1(%rdi), %r15
+; AVX-NEXT: movsbq 2(%rdi), %r8
+; AVX-NEXT: movsbq 3(%rdi), %r9
+; AVX-NEXT: movsbq 4(%rdi), %r10
+; AVX-NEXT: movsbq 5(%rdi), %r11
+; AVX-NEXT: movsbq 6(%rdi), %r14
+; AVX-NEXT: movsbq 7(%rdi), %r12
+; AVX-NEXT: movsbq 8(%rdi), %r13
+; AVX-NEXT: movsbq 9(%rdi), %rdx
+; AVX-NEXT: movsbq 10(%rdi), %rax
+; AVX-NEXT: movsbq 11(%rdi), %rcx
+; AVX-NEXT: movsbq 12(%rdi), %rbx
+; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT: movzbl (%rsi,%rbp), %esi
+; AVX-NEXT: vmovd %esi, %xmm0
+; AVX-NEXT: movsbq 13(%rdi), %rsi
+; AVX-NEXT: vpinsrb $1, (%r15,%rbp), %xmm0, %xmm0
+; AVX-NEXT: movsbq 14(%rdi), %r15
+; AVX-NEXT: movsbq 15(%rdi), %rdi
+; AVX-NEXT: movzbl (%rdi,%rbp), %edi
+; AVX-NEXT: movzbl (%r15,%rbp), %r15d
+; AVX-NEXT: movzbl (%rsi,%rbp), %esi
+; AVX-NEXT: movzbl (%rbx,%rbp), %ebx
+; AVX-NEXT: movzbl (%rcx,%rbp), %ecx
+; AVX-NEXT: movzbl (%rax,%rbp), %eax
+; AVX-NEXT: movzbl (%rdx,%rbp), %edx
+; AVX-NEXT: movzbl (%r13,%rbp), %r13d
+; AVX-NEXT: movzbl (%r12,%rbp), %r12d
+; AVX-NEXT: movzbl (%r14,%rbp), %r14d
+; AVX-NEXT: movzbl (%r11,%rbp), %r11d
+; AVX-NEXT: movzbl (%r10,%rbp), %r10d
+; AVX-NEXT: movzbl (%r9,%rbp), %r9d
+; AVX-NEXT: movzbl (%r8,%rbp), %ebp
+; AVX-NEXT: vpinsrb $2, %ebp, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, %r13d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %r15d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r12
+; AVX-NEXT: popq %r13
+; AVX-NEXT: popq %r14
+; AVX-NEXT: popq %r15
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: retq
+ %p0 = getelementptr inbounds i8, i8* %i, i64 0
+ %p1 = getelementptr inbounds i8, i8* %i, i64 1
+ %p2 = getelementptr inbounds i8, i8* %i, i64 2
+ %p3 = getelementptr inbounds i8, i8* %i, i64 3
+ %p4 = getelementptr inbounds i8, i8* %i, i64 4
+ %p5 = getelementptr inbounds i8, i8* %i, i64 5
+ %p6 = getelementptr inbounds i8, i8* %i, i64 6
+ %p7 = getelementptr inbounds i8, i8* %i, i64 7
+ %p8 = getelementptr inbounds i8, i8* %i, i64 8
+ %p9 = getelementptr inbounds i8, i8* %i, i64 9
+ %p10 = getelementptr inbounds i8, i8* %i, i64 10
+ %p11 = getelementptr inbounds i8, i8* %i, i64 11
+ %p12 = getelementptr inbounds i8, i8* %i, i64 12
+ %p13 = getelementptr inbounds i8, i8* %i, i64 13
+ %p14 = getelementptr inbounds i8, i8* %i, i64 14
+ %p15 = getelementptr inbounds i8, i8* %i, i64 15
+ %i0 = load i8, i8* %p0 , align 4
+ %i1 = load i8, i8* %p1 , align 4
+ %i2 = load i8, i8* %p2 , align 4
+ %i3 = load i8, i8* %p3 , align 4
+ %i4 = load i8, i8* %p4 , align 4
+ %i5 = load i8, i8* %p5 , align 4
+ %i6 = load i8, i8* %p6 , align 4
+ %i7 = load i8, i8* %p7 , align 4
+ %i8 = load i8, i8* %p8 , align 4
+ %i9 = load i8, i8* %p9 , align 4
+ %i10 = load i8, i8* %p10, align 4
+ %i11 = load i8, i8* %p11, align 4
+ %i12 = load i8, i8* %p12, align 4
+ %i13 = load i8, i8* %p13, align 4
+ %i14 = load i8, i8* %p14, align 4
+ %i15 = load i8, i8* %p15, align 4
+ %x0 = extractelement <16 x i8> %x, i8 %i0
+ %x1 = extractelement <16 x i8> %x, i8 %i1
+ %x2 = extractelement <16 x i8> %x, i8 %i2
+ %x3 = extractelement <16 x i8> %x, i8 %i3
+ %x4 = extractelement <16 x i8> %x, i8 %i4
+ %x5 = extractelement <16 x i8> %x, i8 %i5
+ %x6 = extractelement <16 x i8> %x, i8 %i6
+ %x7 = extractelement <16 x i8> %x, i8 %i7
+ %x8 = extractelement <16 x i8> %x, i8 %i8
+ %x9 = extractelement <16 x i8> %x, i8 %i9
+ %x10 = extractelement <16 x i8> %x, i8 %i10
+ %x11 = extractelement <16 x i8> %x, i8 %i11
+ %x12 = extractelement <16 x i8> %x, i8 %i12
+ %x13 = extractelement <16 x i8> %x, i8 %i13
+ %x14 = extractelement <16 x i8> %x, i8 %i14
+ %x15 = extractelement <16 x i8> %x, i8 %i15
+ %r0 = insertelement <16 x i8> undef, i8 %x0 , i32 0
+ %r1 = insertelement <16 x i8> %r0 , i8 %x1 , i32 1
+ %r2 = insertelement <16 x i8> %r1 , i8 %x2 , i32 2
+ %r3 = insertelement <16 x i8> %r2 , i8 %x3 , i32 3
+ %r4 = insertelement <16 x i8> %r3 , i8 %x4 , i32 4
+ %r5 = insertelement <16 x i8> %r4 , i8 %x5 , i32 5
+ %r6 = insertelement <16 x i8> %r5 , i8 %x6 , i32 6
+ %r7 = insertelement <16 x i8> %r6 , i8 %x7 , i32 7
+ %r8 = insertelement <16 x i8> %r7 , i8 %x8 , i32 8
+ %r9 = insertelement <16 x i8> %r8 , i8 %x9 , i32 9
+ %r10 = insertelement <16 x i8> %r9 , i8 %x10, i32 10
+ %r11 = insertelement <16 x i8> %r10, i8 %x11, i32 11
+ %r12 = insertelement <16 x i8> %r11, i8 %x12, i32 12
+ %r13 = insertelement <16 x i8> %r12, i8 %x13, i32 13
+ %r14 = insertelement <16 x i8> %r13, i8 %x14, i32 14
+ %r15 = insertelement <16 x i8> %r14, i8 %x15, i32 15
+ ret <16 x i8> %r15
+}
+
+;
+; Binary shuffle indices from registers
+;
+
+define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
+; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
+; SSE: # BB#0:
+; SSE-NEXT: movslq %edi, %rax
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movslq %edx, %rdx
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movslq %ecx, %rcx
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %x0 = extractelement <4 x float> %x, i32 %i0
+ %x1 = extractelement <4 x float> %x, i32 %i1
+ %y2 = extractelement <4 x float> %y, i32 %i2
+ %x3 = extractelement <4 x float> %x, i32 %i3
+ %r0 = insertelement <4 x float> undef, float %x0, i32 0
+ %r1 = insertelement <4 x float> %r0, float 0.0, i32 1
+ %r2 = insertelement <4 x float> %r1, float %y2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %x3, i32 3
+ ret <4 x float> %r3
+}
+
+define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %y, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
+; SSE2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: movswq %di, %r10
+; SSE2-NEXT: movswq %si, %rsi
+; SSE2-NEXT: movswq %dx, %r11
+; SSE2-NEXT: movswq %cx, %rcx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movswq %r8w, %rdi
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movswq %r9w, %rax
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSE2-NEXT: xorl %edx, %edx
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd %esi, %xmm2
+; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movzwl -40(%rsp,%r10,2), %eax
+; SSE2-NEXT: movzwl -40(%rsp,%r11,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: movswq %di, %r10
+; SSSE3-NEXT: movswq %si, %rsi
+; SSSE3-NEXT: movswq %dx, %r11
+; SSSE3-NEXT: movswq %cx, %rcx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movswq %r8w, %rdi
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movswq %r9w, %rax
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSSE3-NEXT: xorl %edx, %edx
+; SSSE3-NEXT: movd %edx, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd %esi, %xmm2
+; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: movzwl -40(%rsp,%r10,2), %eax
+; SSSE3-NEXT: movzwl -40(%rsp,%r11,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: movswq %di, %rax
+; SSE41-NEXT: movswq %si, %rsi
+; SSE41-NEXT: movswq %dx, %rdx
+; SSE41-NEXT: movswq %cx, %r10
+; SSE41-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movswq %r8w, %rdi
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movswq %r9w, %rcx
+; SSE41-NEXT: movzwl -40(%rsp,%rax,2), %eax
+; SSE41-NEXT: movd %eax, %xmm1
+; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm1
+; SSE41-NEXT: pinsrw $2, -40(%rsp,%rdx,2), %xmm1
+; SSE41-NEXT: pinsrw $3, -24(%rsp,%r10,2), %xmm1
+; SSE41-NEXT: pinsrw $4, -40(%rsp,%rdi,2), %xmm1
+; SSE41-NEXT: pinsrw $5, -24(%rsp,%rcx,2), %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX1-NEXT: movswq %di, %r10
+; AVX1-NEXT: movswq %si, %r11
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: movswq %cx, %rcx
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movswq %r8w, %rdi
+; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movswq %r9w, %rax
+; AVX1-NEXT: movzwl -40(%rsp,%r10,2), %esi
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX2-NEXT: movswq %di, %r10
+; AVX2-NEXT: movswq %si, %r11
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: movswq %cx, %rcx
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movswq %r8w, %rdi
+; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movswq %r9w, %rax
+; AVX2-NEXT: movzwl -40(%rsp,%r10,2), %esi
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-NEXT: retq
+ %x0 = extractelement <8 x i16> %x, i16 %i0
+ %y1 = extractelement <8 x i16> %y, i16 %i1
+ %x2 = extractelement <8 x i16> %x, i16 %i2
+ %y3 = extractelement <8 x i16> %y, i16 %i3
+ %x4 = extractelement <8 x i16> %x, i16 %i4
+ %y5 = extractelement <8 x i16> %y, i16 %i5
+ %x6 = extractelement <8 x i16> %x, i16 %i6
+ %x7 = extractelement <8 x i16> %x, i16 %i7
+ %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+ %r1 = insertelement <8 x i16> %r0, i16 %y1, i32 1
+ %r2 = insertelement <8 x i16> %r1, i16 %x2, i32 2
+ %r3 = insertelement <8 x i16> %r2, i16 %y3, i32 3
+ %r4 = insertelement <8 x i16> %r3, i16 %x4, i32 4
+ %r5 = insertelement <8 x i16> %r4, i16 %y5, i32 5
+ %r6 = insertelement <8 x i16> %r5, i16 0, i32 6
+ %r7 = insertelement <8 x i16> %r6, i16 0, i32 7
+ ret <8 x i16> %r7
+}
diff --git a/test/CodeGen/X86/vector-shuffle-variable-256.ll b/test/CodeGen/X86/vector-shuffle-variable-256.ll
new file mode 100644
index 000000000000..e8d9aa20491b
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -0,0 +1,720 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Unary shuffle indices from registers
+;
+
+define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: movq %rsp, %rbp
+; ALL-NEXT: andq $-32, %rsp
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: vmovaps %ymm0, (%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: movq %rbp, %rsp
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
+ %x0 = extractelement <4 x double> %x, i64 %i0
+ %x1 = extractelement <4 x double> %x, i64 %i1
+ %x2 = extractelement <4 x double> %x, i64 %i2
+ %x3 = extractelement <4 x double> %x, i64 %i3
+ %r0 = insertelement <4 x double> undef, double %x0, i32 0
+ %r1 = insertelement <4 x double> %r0, double %x1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %x2, i32 2
+ %r3 = insertelement <4 x double> %r2, double %x3, i32 3
+ ret <4 x double> %r3
+}
+
+define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: movq %rsp, %rbp
+; ALL-NEXT: andq $-32, %rsp
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: vmovaps %ymm0, (%rsp)
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: movq %rbp, %rsp
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
+ %x0 = extractelement <4 x double> %x, i64 %i0
+ %x1 = extractelement <4 x double> %x, i64 %i1
+ %x2 = extractelement <4 x double> %x, i64 %i2
+ %x3 = extractelement <4 x double> %x, i64 %i3
+ %r0 = insertelement <4 x double> undef, double undef, i32 0
+ %r1 = insertelement <4 x double> %r0, double %x1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %x2, i32 2
+ %r3 = insertelement <4 x double> %r2, double 0.0, i32 3
+ ret <4 x double> %r3
+}
+
+define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %x0 = extractelement <2 x double> %x, i64 %i0
+ %x1 = extractelement <2 x double> %x, i64 %i1
+ %x2 = extractelement <2 x double> %x, i64 %i2
+ %x3 = extractelement <2 x double> %x, i64 %i3
+ %r0 = insertelement <4 x double> undef, double %x0, i32 0
+ %r1 = insertelement <4 x double> %r0, double %x1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %x2, i32 2
+ %r3 = insertelement <4 x double> %r2, double %x3, i32 3
+ ret <4 x double> %r3
+}
+
+define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+ %x0 = extractelement <4 x i64> %x, i64 %i0
+ %x1 = extractelement <4 x i64> %x, i64 %i1
+ %x2 = extractelement <4 x i64> %x, i64 %i2
+ %x3 = extractelement <4 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
+ ret <4 x i64> %r3
+}
+
+define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+ %x0 = extractelement <4 x i64> %x, i64 %i0
+ %x1 = extractelement <4 x i64> %x, i64 %i1
+ %x2 = extractelement <4 x i64> %x, i64 %i2
+ %x3 = extractelement <4 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 0, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 0, i32 3
+ ret <4 x i64> %r3
+}
+
+define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %x0 = extractelement <2 x i64> %x, i64 %i0
+ %x1 = extractelement <2 x i64> %x, i64 %i1
+ %x2 = extractelement <2 x i64> %x, i64 %i2
+ %x3 = extractelement <2 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
+ ret <4 x i64> %r3
+}
+
+define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
+; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: movslq %edi, %rax
+; AVX1-NEXT: movslq %esi, %rsi
+; AVX1-NEXT: movslq %edx, %rdx
+; AVX1-NEXT: movslq %ecx, %r11
+; AVX1-NEXT: movslq %r8d, %r10
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: movslq %r9d, %r8
+; AVX1-NEXT: movslq 16(%rbp), %rdi
+; AVX1-NEXT: movslq 24(%rbp), %rcx
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vmovd %esi, %xmm2
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm2
+; AVX2-NEXT: vmovd %edx, %xmm3
+; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3
+; AVX2-NEXT: vmovd %ecx, %xmm4
+; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4
+; AVX2-NEXT: vmovd %r8d, %xmm5
+; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm5
+; AVX2-NEXT: vmovd %r9d, %xmm6
+; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm6
+; AVX2-NEXT: vmovd {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpermps %ymm0, %ymm7, %ymm7
+; AVX2-NEXT: vmovd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpermps %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %x0 = extractelement <8 x float> %x, i32 %i0
+ %x1 = extractelement <8 x float> %x, i32 %i1
+ %x2 = extractelement <8 x float> %x, i32 %i2
+ %x3 = extractelement <8 x float> %x, i32 %i3
+ %x4 = extractelement <8 x float> %x, i32 %i4
+ %x5 = extractelement <8 x float> %x, i32 %i5
+ %x6 = extractelement <8 x float> %x, i32 %i6
+ %x7 = extractelement <8 x float> %x, i32 %i7
+ %r0 = insertelement <8 x float> undef, float %x0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %x1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %x2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %x3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %x4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %x5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %x6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %x7, i32 7
+ ret <8 x float> %r7
+}
+
+define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
+; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
+; ALL: # BB#0:
+; ALL-NEXT: movslq %edi, %rax
+; ALL-NEXT: movslq %esi, %rsi
+; ALL-NEXT: movslq %edx, %rdx
+; ALL-NEXT: movslq %ecx, %r11
+; ALL-NEXT: movslq %r8d, %r10
+; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: movslq %r9d, %r8
+; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rdi
+; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rcx
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; ALL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; ALL-NEXT: retq
+ %x0 = extractelement <4 x float> %x, i32 %i0
+ %x1 = extractelement <4 x float> %x, i32 %i1
+ %x2 = extractelement <4 x float> %x, i32 %i2
+ %x3 = extractelement <4 x float> %x, i32 %i3
+ %x4 = extractelement <4 x float> %x, i32 %i4
+ %x5 = extractelement <4 x float> %x, i32 %i5
+ %x6 = extractelement <4 x float> %x, i32 %i6
+ %x7 = extractelement <4 x float> %x, i32 %i7
+ %r0 = insertelement <8 x float> undef, float %x0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %x1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %x2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %x3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %x4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %x5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %x6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %x7, i32 7
+ ret <8 x float> %r7
+}
+
+define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
+; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: movslq 32(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: movslq 40(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 48(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 56(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 64(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 72(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 80(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 88(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq %edi, %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: movslq %esi, %rax
+; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %edx, %rax
+; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %ecx, %rax
+; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %r8d, %rax
+; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %r9d, %rax
+; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq 16(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movslq 24(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: movslq 32(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: movslq 40(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 48(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 56(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 64(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 72(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 80(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 88(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq %edi, %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: movslq %esi, %rax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %edx, %rax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %ecx, %rax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %r8d, %rax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %r9d, %rax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq 16(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movslq 24(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+ %x0 = extractelement <16 x i16> %x, i32 %i0
+ %x1 = extractelement <16 x i16> %x, i32 %i1
+ %x2 = extractelement <16 x i16> %x, i32 %i2
+ %x3 = extractelement <16 x i16> %x, i32 %i3
+ %x4 = extractelement <16 x i16> %x, i32 %i4
+ %x5 = extractelement <16 x i16> %x, i32 %i5
+ %x6 = extractelement <16 x i16> %x, i32 %i6
+ %x7 = extractelement <16 x i16> %x, i32 %i7
+ %x8 = extractelement <16 x i16> %x, i32 %i8
+ %x9 = extractelement <16 x i16> %x, i32 %i9
+ %x10 = extractelement <16 x i16> %x, i32 %i10
+ %x11 = extractelement <16 x i16> %x, i32 %i11
+ %x12 = extractelement <16 x i16> %x, i32 %i12
+ %x13 = extractelement <16 x i16> %x, i32 %i13
+ %x14 = extractelement <16 x i16> %x, i32 %i14
+ %x15 = extractelement <16 x i16> %x, i32 %i15
+ %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
+ %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
+ %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
+ %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
+ %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
+ %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
+ %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
+ %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
+ %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
+ %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
+ %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
+ %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
+ %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
+ %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
+ %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
+ %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
+ ret <16 x i16> %r15
+}
+
+define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
+; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq %edi, %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: movslq %esi, %rax
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %edx, %rax
+; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %ecx, %rax
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %r8d, %rax
+; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %r9d, %rax
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq %edi, %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: movslq %esi, %rax
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %edx, %rax
+; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %ecx, %rax
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %r8d, %rax
+; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %r9d, %rax
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %x0 = extractelement <8 x i16> %x, i32 %i0
+ %x1 = extractelement <8 x i16> %x, i32 %i1
+ %x2 = extractelement <8 x i16> %x, i32 %i2
+ %x3 = extractelement <8 x i16> %x, i32 %i3
+ %x4 = extractelement <8 x i16> %x, i32 %i4
+ %x5 = extractelement <8 x i16> %x, i32 %i5
+ %x6 = extractelement <8 x i16> %x, i32 %i6
+ %x7 = extractelement <8 x i16> %x, i32 %i7
+ %x8 = extractelement <8 x i16> %x, i32 %i8
+ %x9 = extractelement <8 x i16> %x, i32 %i9
+ %x10 = extractelement <8 x i16> %x, i32 %i10
+ %x11 = extractelement <8 x i16> %x, i32 %i11
+ %x12 = extractelement <8 x i16> %x, i32 %i12
+ %x13 = extractelement <8 x i16> %x, i32 %i13
+ %x14 = extractelement <8 x i16> %x, i32 %i14
+ %x15 = extractelement <8 x i16> %x, i32 %i15
+ %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
+ %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
+ %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
+ %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
+ %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
+ %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
+ %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
+ %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
+ %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
+ %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
+ %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
+ %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
+ %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
+ %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
+ %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
+ %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
+ ret <16 x i16> %r15
+}
+
+;
+; Unary shuffle indices from memory
+;
+
+define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
+; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: movq 16(%rdi), %rdx
+; AVX1-NEXT: movq 24(%rdi), %rsi
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+ %p0 = getelementptr inbounds i64, i64* %i, i32 0
+ %p1 = getelementptr inbounds i64, i64* %i, i32 1
+ %p2 = getelementptr inbounds i64, i64* %i, i32 2
+ %p3 = getelementptr inbounds i64, i64* %i, i32 3
+ %i0 = load i64, i64* %p0, align 4
+ %i1 = load i64, i64* %p1, align 4
+ %i2 = load i64, i64* %p2, align 4
+ %i3 = load i64, i64* %p3, align 4
+ %x0 = extractelement <4 x i64> %x, i64 %i0
+ %x1 = extractelement <4 x i64> %x, i64 %i1
+ %x2 = extractelement <4 x i64> %x, i64 %i2
+ %x3 = extractelement <4 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
+ ret <4 x i64> %r3
+}
+
+define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
+; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: movq 16(%rdi), %rdx
+; AVX1-NEXT: movq 24(%rdi), %rsi
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %p0 = getelementptr inbounds i64, i64* %i, i32 0
+ %p1 = getelementptr inbounds i64, i64* %i, i32 1
+ %p2 = getelementptr inbounds i64, i64* %i, i32 2
+ %p3 = getelementptr inbounds i64, i64* %i, i32 3
+ %i0 = load i64, i64* %p0, align 4
+ %i1 = load i64, i64* %p1, align 4
+ %i2 = load i64, i64* %p2, align 4
+ %i3 = load i64, i64* %p3, align 4
+ %x0 = extractelement <2 x i64> %x, i64 %i0
+ %x1 = extractelement <2 x i64> %x, i64 %i1
+ %x2 = extractelement <2 x i64> %x, i64 %i2
+ %x3 = extractelement <2 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
+ ret <4 x i64> %r3
+}
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
new file mode 100644
index 000000000000..a7794afba3d1
--- /dev/null
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -0,0 +1,5315 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; add
+;
+
+define <4 x i32> @trunc_add_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = add <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_add_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm6, %xmm2
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: paddq %xmm7, %xmm3
+; SSE-NEXT: paddq %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_add_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_add_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = add <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm4, %xmm0
+; SSE-NEXT: paddd %xmm5, %xmm1
+; SSE-NEXT: paddd %xmm6, %xmm2
+; SSE-NEXT: paddd %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: paddw %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_add_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = add <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; add to constant
+;
+
+define <4 x i32> @trunc_add_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_add_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_add_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: paddq %xmm8, %xmm0
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm4
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm5
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm6
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm2
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; sub
+;
+
+define <4 x i32> @trunc_sub_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: psubq %xmm2, %xmm0
+; SSE-NEXT: psubq %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sub <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_sub_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psubq %xmm6, %xmm2
+; SSE-NEXT: psubq %xmm4, %xmm0
+; SSE-NEXT: psubq %xmm7, %xmm3
+; SSE-NEXT: psubq %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_sub_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psubd %xmm2, %xmm0
+; SSE-NEXT: psubd %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sub <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubd %xmm4, %xmm0
+; SSE-NEXT: psubd %xmm5, %xmm1
+; SSE-NEXT: psubd %xmm6, %xmm2
+; SSE-NEXT: psubd %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubw %xmm2, %xmm0
+; SSE-NEXT: psubw %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = sub <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; sub to constant
+;
+
+define <4 x i32> @trunc_sub_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: psubq %xmm2, %xmm0
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_sub_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm4
+; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; SSE-NEXT: psubq %xmm4, %xmm0
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_sub_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: psubq %xmm8, %xmm0
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm4
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm5
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm6
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psubw {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; mul
+;
+
+define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: psrlq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm0, %xmm5
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm0
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: pmuludq %xmm1, %xmm4
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm4, %xmm1
+; SSE-NEXT: paddq %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
+; AVX1-NEXT: vpmuludq %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = mul <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: pmuludq %xmm6, %xmm8
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: psrlq $32, %xmm9
+; SSE-NEXT: pmuludq %xmm2, %xmm9
+; SSE-NEXT: psllq $32, %xmm9
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm6, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm9, %xmm2
+; SSE-NEXT: paddq %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pmuludq %xmm4, %xmm8
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm0, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm6, %xmm0
+; SSE-NEXT: paddq %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pmuludq %xmm7, %xmm4
+; SSE-NEXT: movdqa %xmm7, %xmm6
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm3, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm7, %xmm3
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: paddq %xmm6, %xmm3
+; SSE-NEXT: paddq %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm1, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm5, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm6, %xmm1
+; SSE-NEXT: paddq %xmm4, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
+; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
+; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6
+; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
+; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
+; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
+; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
+; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_mul_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = mul <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm0, %xmm10
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm8, %xmm0
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm10, %xmm0
+; SSE-NEXT: paddq %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm1, %xmm10
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm8, %xmm1
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm10, %xmm1
+; SSE-NEXT: paddq %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm2, %xmm10
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm8, %xmm2
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm10, %xmm2
+; SSE-NEXT: paddq %xmm9, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm3, %xmm10
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm8, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: paddq %xmm10, %xmm3
+; SSE-NEXT: paddq %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm4, %xmm10
+; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: pmuludq %xmm8, %xmm4
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: paddq %xmm10, %xmm4
+; SSE-NEXT: paddq %xmm9, %xmm4
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm5, %xmm10
+; SSE-NEXT: psrlq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm8, %xmm5
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: paddq %xmm10, %xmm5
+; SSE-NEXT: paddq %xmm9, %xmm5
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm6, %xmm10
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm8, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: paddq %xmm10, %xmm6
+; SSE-NEXT: paddq %xmm9, %xmm6
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm7, %xmm10
+; SSE-NEXT: psrlq $32, %xmm7
+; SSE-NEXT: pmuludq %xmm8, %xmm7
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm7
+; SSE-NEXT: paddq %xmm10, %xmm7
+; SSE-NEXT: paddq %xmm9, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
+; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9
+; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm10
+; AVX1-NEXT: vpmuludq %xmm4, %xmm10, %xmm10
+; AVX1-NEXT: vpsllq $32, %xmm10, %xmm10
+; AVX1-NEXT: vpaddq %xmm10, %xmm9, %xmm9
+; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm10
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm9
+; AVX1-NEXT: vpsrlq $32, %xmm10, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm9, %xmm9
+; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm10
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0
+; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm10, %xmm10
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm1
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
+; AVX1-NEXT: vpmuludq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7
+; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7
+; AVX1-NEXT: vpsllq $32, %xmm7, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm8
+; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9
+; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9
+; AVX2-NEXT: vpsllq $32, %ymm9, %ymm9
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm9, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm8, %ymm1
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8
+; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8
+; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm4
+; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5
+; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5
+; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm4
+; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5
+; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5
+; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
+; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm5
+; AVX512-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
+; AVX512-NEXT: vpsllq $32, %zmm5, %zmm5
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1
+; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1
+; AVX512-NEXT: vpaddq %zmm1, %zmm5, %zmm1
+; AVX512-NEXT: vpaddq %zmm1, %zmm4, %zmm1
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm2, %zmm4
+; AVX512-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
+; AVX512-NEXT: vpsllq $32, %zmm4, %zmm4
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm8, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm6, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm7, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = mul <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; mul to constant
+;
+
+define <4 x i32> @trunc_mul_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm2, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_mul_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5]
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm5, %xmm2
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm4
+; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: paddq %xmm5, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm4, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
+; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
+; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7]
+; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_mul_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm8, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm9, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm8, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm9, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5]
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm8, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm9, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm8, %xmm3
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: paddq %xmm9, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9]
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: pmuludq %xmm8, %xmm4
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: paddq %xmm9, %xmm4
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11]
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm8, %xmm5
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: paddq %xmm9, %xmm5
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13]
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm8, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: paddq %xmm9, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15]
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm7
+; SSE-NEXT: pmuludq %xmm8, %xmm7
+; SSE-NEXT: psllq $32, %xmm7
+; SSE-NEXT: paddq %xmm9, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
+; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3]
+; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5]
+; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7
+; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
+; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm0
+; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [14,15]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm2
+; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1
+; AVX512-NEXT: vpmuludq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1
+; AVX512-NEXT: vpaddq %zmm1, %zmm3, %zmm1
+; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; and
+;
+
+define <4 x i32> @trunc_and_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = and <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_and_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm6, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_and_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_and_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = and <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm3, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm1, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm5, %xmm0
+; SSE-NEXT: packuswb %xmm6, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_and_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = and <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; and to constant
+;
+
+define <4 x i32> @trunc_and_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_and_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_and_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE-NEXT: pand {{.*}}(%rip), %xmm5
+; SSE-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE-NEXT: pand {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm9, %xmm7
+; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: pand %xmm9, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm9, %xmm3
+; SSE-NEXT: pand %xmm9, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: pand %xmm9, %xmm8
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; xor
+;
+
+define <4 x i32> @trunc_xor_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = xor <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_xor_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_xor_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = xor <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
+; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = xor <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; xor to constant
+;
+
+define <4 x i32> @trunc_xor_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm0, %xmm2
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_xor_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_xor_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm8, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm4
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm5
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm6
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; or
+;
+
+define <4 x i32> @trunc_or_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = or <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_or_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_or_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_or_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = or <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
+; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_or_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = or <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; or to constant
+;
+
+define <4 x i32> @trunc_or_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_or_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm2
+; SSE-NEXT: por {{.*}}(%rip), %xmm3
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_or_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: por %xmm8, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: por {{.*}}(%rip), %xmm2
+; SSE-NEXT: por {{.*}}(%rip), %xmm3
+; SSE-NEXT: por {{.*}}(%rip), %xmm4
+; SSE-NEXT: por {{.*}}(%rip), %xmm5
+; SSE-NEXT: por {{.*}}(%rip), %xmm6
+; SSE-NEXT: por {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: por {{.*}}(%rip), %xmm2
+; SSE-NEXT: por {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; complex patterns - often created by vectorizer
+;
+
+define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
+; SSE-LABEL: mul_add_v4i64_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pmuludq %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: psrlq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm0, %xmm5
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm0
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm4, %xmm2
+; SSE-NEXT: paddq %xmm1, %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: mul_add_v4i64_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mul_add_v4i64_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_add_v4i64_v4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sext <4 x i32> %a0 to <4 x i64>
+ %2 = sext <4 x i32> %a1 to <4 x i64>
+ %3 = mul <4 x i64> %1, %2
+ %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
+ %5 = trunc <4 x i64> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index 8c02c5a5433f..cfeb41e891d6 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -52,9 +52,10 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
;
; AVX2-LABEL: trunc8i64_8i32:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -135,12 +136,14 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
;
; AVX2-LABEL: trunc8i64_8i16:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -187,9 +190,10 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -248,12 +252,15 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i16>
@@ -311,6 +318,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
@@ -422,14 +430,17 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: trunc2x4i64_8i32:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -517,9 +528,10 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: trunc2x4i64_8i16:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -529,6 +541,8 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index f1714d4845de..c9ad6e40d1c2 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -5,6 +5,11 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
@@ -82,6 +87,32 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: testv2i64:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pushl %esi
+; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
+; X32-SSE-NEXT: bsfl %eax, %eax
+; X32-SSE-NEXT: movl $32, %ecx
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: addl $32, %eax
+; X32-SSE-NEXT: pextrd $2, %xmm0, %edx
+; X32-SSE-NEXT: bsfl %edx, %esi
+; X32-SSE-NEXT: testl %edx, %edx
+; X32-SSE-NEXT: cmovel %eax, %esi
+; X32-SSE-NEXT: movd %esi, %xmm1
+; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
+; X32-SSE-NEXT: bsfl %eax, %eax
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: addl $32, %eax
+; X32-SSE-NEXT: movd %xmm0, %ecx
+; X32-SSE-NEXT: bsfl %ecx, %edx
+; X32-SSE-NEXT: testl %ecx, %ecx
+; X32-SSE-NEXT: cmovel %eax, %edx
+; X32-SSE-NEXT: movd %edx, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
ret <2 x i64> %out
}
@@ -137,16 +168,68 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv2i64u:
-; AVX: # BB#0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: bsfq %rax, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: bsfq %rax, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: testv2i64u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: bsfq %rax, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: bsfq %rax, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv2i64u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: bsfq %rax, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: bsfq %rax, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv2i64u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [63,63]
+; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv2i64u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv2i64u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
+; X32-SSE-NEXT: bsfl %eax, %ecx
+; X32-SSE-NEXT: pextrd $3, %xmm0, %edx
+; X32-SSE-NEXT: bsfl %edx, %edx
+; X32-SSE-NEXT: addl $32, %edx
+; X32-SSE-NEXT: testl %eax, %eax
+; X32-SSE-NEXT: cmovnel %ecx, %edx
+; X32-SSE-NEXT: movd %edx, %xmm1
+; X32-SSE-NEXT: movd %xmm0, %eax
+; X32-SSE-NEXT: bsfl %eax, %ecx
+; X32-SSE-NEXT: pextrd $1, %xmm0, %edx
+; X32-SSE-NEXT: bsfl %edx, %edx
+; X32-SSE-NEXT: addl $32, %edx
+; X32-SSE-NEXT: testl %eax, %eax
+; X32-SSE-NEXT: cmovnel %ecx, %edx
+; X32-SSE-NEXT: movd %edx, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
ret <2 x i64> %out
}
@@ -302,6 +385,74 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv4i32:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm2
+; AVX512CDVL-NEXT: vpandd %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm2, %xmm0, %xmm3
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv4i32:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm2
+; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX512CD-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX512CD-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv4i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: psubd %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pand %xmm3, %xmm4
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm5
+; X32-SSE-NEXT: pshufb %xmm4, %xmm5
+; X32-SSE-NEXT: psrlw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm3, %xmm2
+; X32-SSE-NEXT: pshufb %xmm2, %xmm0
+; X32-SSE-NEXT: paddb %xmm5, %xmm0
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm2
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
ret <4 x i32> %out
}
@@ -457,6 +608,51 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv4i32u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandd %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv4i32u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv4i32u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: psubd %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pand %xmm3, %xmm4
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm5
+; X32-SSE-NEXT: pshufb %xmm4, %xmm5
+; X32-SSE-NEXT: psrlw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm3, %xmm2
+; X32-SSE-NEXT: pshufb %xmm2, %xmm0
+; X32-SSE-NEXT: paddb %xmm5, %xmm0
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm2
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
ret <4 x i32> %out
}
@@ -558,24 +754,103 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i16:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i16:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv8i16:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: psubw %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: psubw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm2, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: paddb %xmm4, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm0
+; X32-SSE-NEXT: psllw $8, %xmm0
+; X32-SSE-NEXT: paddb %xmm3, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0)
ret <8 x i16> %out
}
@@ -677,24 +952,103 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv8i16u:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i16u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i16u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv8i16u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: psubw %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: psubw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm2, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: paddb %xmm4, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm0
+; X32-SSE-NEXT: psllw $8, %xmm0
+; X32-SSE-NEXT: paddb %xmm3, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1)
ret <8 x i16> %out
}
@@ -780,21 +1134,87 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: paddb %xmm4, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i8:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i8:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv16i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: psubb %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: psubb {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pshufb %xmm1, %xmm0
+; X32-SSE-NEXT: paddb %xmm4, %xmm0
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
ret <16 x i8> %out
}
@@ -880,21 +1300,87 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE41-NEXT: paddb %xmm4, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv16i8u:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i8u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i8u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv16i8u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: psubb %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: psubb {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pshufb %xmm1, %xmm0
+; X32-SSE-NEXT: paddb %xmm4, %xmm0
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
ret <16 x i8> %out
}
@@ -911,6 +1397,12 @@ define <2 x i64> @foldv2i64() nounwind {
; AVX-NEXT: movl $8, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: foldv2i64:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl $8, %eax
+; X32-SSE-NEXT: movd %eax, %xmm0
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
ret <2 x i64> %out
}
@@ -927,6 +1419,12 @@ define <2 x i64> @foldv2i64u() nounwind {
; AVX-NEXT: movl $8, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: foldv2i64u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl $8, %eax
+; X32-SSE-NEXT: movd %eax, %xmm0
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
ret <2 x i64> %out
}
@@ -937,10 +1435,30 @@ define <4 x i32> @foldv4i32() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv4i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv4i32:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv4i32:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv4i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
ret <4 x i32> %out
}
@@ -951,10 +1469,30 @@ define <4 x i32> @foldv4i32u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv4i32u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv4i32u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv4i32u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv4i32u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv4i32u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv4i32u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
ret <4 x i32> %out
}
@@ -965,10 +1503,30 @@ define <8 x i16> @foldv8i16() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv8i16:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv8i16:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv8i16:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
ret <8 x i16> %out
}
@@ -979,10 +1537,30 @@ define <8 x i16> @foldv8i16u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv8i16u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv8i16u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv8i16u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv8i16u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv8i16u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv8i16u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
ret <8 x i16> %out
}
@@ -993,10 +1571,30 @@ define <16 x i8> @foldv16i8() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv16i8:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv16i8:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv16i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
ret <16 x i8> %out
}
@@ -1007,10 +1605,30 @@ define <16 x i8> @foldv16i8u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv16i8u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv16i8u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv16i8u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv16i8u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv16i8u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv16i8u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
ret <16 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index a9ee27cc51f0..286bc50ec723 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -1,6 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
@@ -51,6 +53,41 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv4i64:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm3
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv4i64:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX512CD-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
ret <4 x i64> %out
}
@@ -104,6 +141,26 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv4i64u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv4i64u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
ret <4 x i64> %out
}
@@ -169,6 +226,49 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i32:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; AVX512CDVL-NEXT: vpandd %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm3
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i32:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX512CD-NEXT: vpsubd %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
ret <8 x i32> %out
}
@@ -234,6 +334,26 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i32u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandd %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i32u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
ret <8 x i32> %out
}
@@ -292,6 +412,44 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i16:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i16:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
ret <16 x i16> %out
}
@@ -350,6 +508,44 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i16u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i16u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
ret <16 x i16> %out
}
@@ -399,6 +595,38 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv32i8:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv32i8:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
ret <32 x i8> %out
}
@@ -448,78 +676,230 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv32i8u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv32i8u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
ret <32 x i8> %out
}
define <4 x i64> @foldv4i64() nounwind {
-; ALL-LABEL: foldv4i64:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv4i64:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv4i64:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
ret <4 x i64> %out
}
define <4 x i64> @foldv4i64u() nounwind {
-; ALL-LABEL: foldv4i64u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv4i64u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv4i64u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv4i64u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv4i64u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
ret <4 x i64> %out
}
define <8 x i32> @foldv8i32() nounwind {
-; ALL-LABEL: foldv8i32:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv8i32:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv8i32:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
ret <8 x i32> %out
}
define <8 x i32> @foldv8i32u() nounwind {
-; ALL-LABEL: foldv8i32u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv8i32u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv8i32u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv8i32u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv8i32u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
ret <8 x i32> %out
}
define <16 x i16> @foldv16i16() nounwind {
-; ALL-LABEL: foldv16i16:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv16i16:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv16i16:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CD-NEXT: retq
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
ret <16 x i16> %out
}
define <16 x i16> @foldv16i16u() nounwind {
-; ALL-LABEL: foldv16i16u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv16i16u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv16i16u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv16i16u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv16i16u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CD-NEXT: retq
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
ret <16 x i16> %out
}
define <32 x i8> @foldv32i8() nounwind {
-; ALL-LABEL: foldv32i8:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv32i8:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv32i8:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX512CD-NEXT: retq
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
ret <32 x i8> %out
}
define <32 x i8> @foldv32i8u() nounwind {
-; ALL-LABEL: foldv32i8u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv32i8u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv32i8u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv32i8u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv32i8u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX512CD-NEXT: retq
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
ret <32 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 9265fad0176c..81bfd8189b8f 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -1,266 +1,509 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
-; ALL-LABEL: testv8i64:
-; ALL: ## BB#0:
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ALL-NEXT: vpextrq $1, %xmm1, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vmovq %xmm1, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm1
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; ALL-NEXT: vpextrq $1, %xmm2, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm2, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; ALL-NEXT: vpextrq $1, %xmm2, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm2, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vpextrq $1, %xmm0, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm0, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm0
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv8i64:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv8i64:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv8i64:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
ret <8 x i64> %out
}
define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
-; ALL-LABEL: testv8i64u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: vpsubq %zmm0, %zmm1, %zmm1
-; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vplzcntq %zmm0, %zmm0
-; ALL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpsubq %zmm0, %zmm1, %zmm0
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv8i64u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
+; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv8i64u:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
+; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv8i64u:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
ret <8 x i64> %out
}
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
-; ALL-LABEL: testv16i32:
-; ALL: ## BB#0:
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ALL-NEXT: vpextrd $1, %xmm1, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm1, %ecx
-; ALL-NEXT: tzcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm2
-; ALL-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; ALL-NEXT: vpextrd $2, %xmm1, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; ALL-NEXT: vpextrd $3, %xmm1, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm2, %ecx
-; ALL-NEXT: tzcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm2, %ecx
-; ALL-NEXT: tzcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm0, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm0, %ecx
-; ALL-NEXT: tzcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm0, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm0, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv16i32:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
+; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
+; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm2, %ymm2
+; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv16i32:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
+; AVX512CDBW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; AVX512CDBW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv16i32:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0)
ret <16 x i32> %out
}
define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
-; ALL-LABEL: testv16i32u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; ALL-NEXT: vpandd %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpsubd %zmm0, %zmm1, %zmm0
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv16i32u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
+; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1
+; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv16i32u:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1
+; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv16i32u:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1)
ret <16 x i32> %out
}
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; ALL-NEXT: vpsubw %ymm0, %ymm2, %ymm3
-; ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; ALL-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm5
-; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; ALL-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; ALL-NEXT: vpsllw $8, %ymm0, %ymm5
-; ALL-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; ALL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; ALL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsubw %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm2
-; ALL-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; ALL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsllw $8, %ymm1, %ymm2
-; ALL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; ALL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv32i16:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512CD-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm5
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv32i16:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512CDBW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512CDBW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv32i16:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
ret <32 x i16> %out
}
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; ALL-NEXT: vpsubw %ymm0, %ymm2, %ymm3
-; ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; ALL-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm5
-; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; ALL-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; ALL-NEXT: vpsllw $8, %ymm0, %ymm5
-; ALL-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; ALL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; ALL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsubw %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm2
-; ALL-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; ALL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsllw $8, %ymm1, %ymm2
-; ALL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; ALL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv32i16u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512CD-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm5
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv32i16u:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512CDBW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512CDBW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv32i16u:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
ret <32 x i16> %out
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; ALL-NEXT: vpsubb %ymm0, %ymm2, %ymm3
-; ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; ALL-NEXT: vpsubb %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm5
-; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; ALL-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; ALL-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsubb %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm2
-; ALL-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; ALL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv64i8:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512CD-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv64i8:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv64i8:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0)
ret <64 x i8> %out
}
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; ALL-NEXT: vpsubb %ymm0, %ymm2, %ymm3
-; ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; ALL-NEXT: vpsubb %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm5
-; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; ALL-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; ALL-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsubb %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm2
-; ALL-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; ALL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv64i8u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512CD-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv64i8u:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv64i8u:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1)
ret <64 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index b8024203ab2f..a71e3b7b712d 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -4,6 +4,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i16:
@@ -71,6 +72,11 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_16i8_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT: retq
entry:
%B = zext <16 x i8> %A to <16 x i16>
ret <16 x i16> %B
@@ -137,20 +143,21 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX1-LABEL: zext_16i8_to_8i32:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_16i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%C = zext <8 x i8> %B to <8 x i32>
@@ -215,20 +222,21 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX1-LABEL: zext_16i8_to_4i64:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_16i8_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%C = zext <4 x i8> %B to <4 x i64>
@@ -300,6 +308,11 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%B = zext <8 x i16> %A to <8 x i32>
ret <8 x i32>%B
@@ -366,21 +379,21 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
;
; AVX1-LABEL: zext_8i16_to_4i64:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i16_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_8i16_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%C = zext <4 x i16> %B to <4 x i64>
@@ -452,6 +465,11 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
entry:
%B = zext <4 x i32> %A to <4 x i64>
ret <4 x i64>%B
@@ -526,23 +544,20 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_zext_4i8_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i8_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,zero,zero,zero,zero,xmm1[12],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i8_to_4i64:
@@ -562,6 +577,11 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_4i8_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: retq
entry:
%X = load <4 x i8>, <4 x i8>* %ptr
%Y = zext <4 x i8> %X to <4 x i64>
@@ -602,22 +622,21 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[6],zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[14],zero,zero,zero
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i32:
@@ -637,12 +656,137 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_8i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%X = load <8 x i8>, <8 x i8>* %ptr
%Y = zext <8 x i8> %X to <8 x i32>
ret <8 x i32> %Y
}
+define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_16i8_to_8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_zext_16i8_to_8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa (%rdi), %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_zext_16i8_to_8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: load_zext_16i8_to_8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_zext_16i8_to_8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_16i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX512-NEXT: retq
+entry:
+ %X = load <16 x i8>, <16 x i8>* %ptr
+ %Y = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %Z = zext <8 x i8> %Y to <8 x i32>
+ ret <8 x i32> %Z
+}
+
+define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_8i8_to_8i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_zext_8i8_to_8i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pshufb %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_zext_8i8_to_8i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: load_zext_8i8_to_8i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_zext_8i8_to_8i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_8i8_to_8i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: retq
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = zext <8 x i8> %X to <8 x i64>
+ ret <8 x i64> %Y
+}
+
define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: load_zext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
@@ -679,6 +823,11 @@ define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_16i8_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512-NEXT: retq
entry:
%X = load <16 x i8>, <16 x i8>* %ptr
%Y = zext <16 x i8> %X to <16 x i16>
@@ -751,21 +900,21 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_zext_4i16_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i16_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9],zero,zero,zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i16_to_4i64:
@@ -785,6 +934,11 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_4i16_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%X = load <4 x i16>, <4 x i16>* %ptr
%Y = zext <4 x i16> %X to <4 x i64>
@@ -827,6 +981,11 @@ define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX512-NEXT: retq
entry:
%X = load <8 x i16>, <8 x i16>* %ptr
%Y = zext <8 x i16> %X to <8 x i32>
@@ -899,6 +1058,11 @@ define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX512-NEXT: retq
entry:
%X = load <4 x i32>, <4 x i32>* %ptr
%Y = zext <4 x i32> %X to <4 x i64>
@@ -949,6 +1113,12 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_8i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%t = zext <8 x i8> %z to <8 x i32>
ret <8 x i32> %t
@@ -991,6 +1161,11 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
%Z = bitcast <16 x i16> %B to <8 x i32>
@@ -1035,6 +1210,11 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
%Z = bitcast <8 x i32> %B to <4 x i64>
@@ -1057,9 +1237,8 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[8],zero,xmm1[10],zero,xmm1[12],zero,xmm1[14],zero
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1088,6 +1267,12 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
%Z = bitcast <32 x i8> %B to <8 x i32>
@@ -1170,6 +1355,12 @@ define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_16i8_to_4i64_offset11:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%Z = bitcast <32 x i8> %B to <4 x i64>
@@ -1187,10 +1378,7 @@ define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable
;
; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6:
@@ -1254,6 +1442,12 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
%Z = bitcast <16 x i16> %B to <4 x i64>
@@ -1322,6 +1516,12 @@ define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8>
%Z = bitcast <16 x i16> %B to <8 x i32>
@@ -1369,6 +1569,12 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16>
%Z = bitcast <16 x i16> %B to <8 x i32>
@@ -1431,6 +1637,12 @@ define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_4i32_to_4i64_offset1:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4>
%Z = bitcast <8 x i32> %B to <4 x i64>
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index fe528fd4ea24..80930a72aa8a 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -1,270 +1,573 @@
-; RUN: llc < %s -march=x86-64 -mattr=sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86-64 -mattr=ssse3 | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -march=x86-64 -mattr=avx2 | FileCheck %s -check-prefix=AVX2
-; RUN: llc < %s -march=x86-64 -mattr=avx512f | FileCheck %s -check-prefix=AVX512
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
define <4 x i32> @test1(<4 x i32> %a) nounwind {
; SSE2-LABEL: test1:
-; SSE2: movdqa
-; SSE2: psrad $31
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test1:
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test1:
-; AVX2: vpabsd
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test1:
-; AVX512: vpabsd
-; AVX512-NEXT: ret
- %tmp1neg = sub <4 x i32> zeroinitializer, %a
- %b = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
- %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
- ret <4 x i32> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test1:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <4 x i32> zeroinitializer, %a
+ %b = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
+ ret <4 x i32> %abs
}
define <4 x i32> @test2(<4 x i32> %a) nounwind {
; SSE2-LABEL: test2:
-; SSE2: movdqa
-; SSE2: psrad $31
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test2:
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test2:
-; AVX2: vpabsd
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test2:
-; AVX512: vpabsd
-; AVX512-NEXT: ret
- %tmp1neg = sub <4 x i32> zeroinitializer, %a
- %b = icmp sge <4 x i32> %a, zeroinitializer
- %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
- ret <4 x i32> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test2:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <4 x i32> zeroinitializer, %a
+ %b = icmp sge <4 x i32> %a, zeroinitializer
+ %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
+ ret <4 x i32> %abs
}
define <8 x i16> @test3(<8 x i16> %a) nounwind {
; SSE2-LABEL: test3:
-; SSE2: movdqa
-; SSE2: psraw $15
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test3:
-; SSSE3: pabsw
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test3:
-; AVX2: vpabsw
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test3:
-; AVX512: vpabsw
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i16> zeroinitializer, %a
- %b = icmp sgt <8 x i16> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
- ret <8 x i16> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsw %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test3:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsw %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <8 x i16> zeroinitializer, %a
+ %b = icmp sgt <8 x i16> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
+ ret <8 x i16> %abs
}
define <16 x i8> @test4(<16 x i8> %a) nounwind {
; SSE2-LABEL: test4:
-; SSE2: pxor
-; SSE2: pcmpgtb
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test4:
-; SSSE3: pabsb
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test4:
-; AVX2: vpabsb
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test4:
-; AVX512: vpabsb
-; AVX512-NEXT: ret
- %tmp1neg = sub <16 x i8> zeroinitializer, %a
- %b = icmp slt <16 x i8> %a, zeroinitializer
- %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
- ret <16 x i8> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsb %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test4:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsb %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <16 x i8> zeroinitializer, %a
+ %b = icmp slt <16 x i8> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
+ ret <16 x i8> %abs
}
define <4 x i32> @test5(<4 x i32> %a) nounwind {
; SSE2-LABEL: test5:
-; SSE2: movdqa
-; SSE2: psrad $31
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test5:
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test5:
-; AVX2: vpabsd
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test5:
-; AVX512: vpabsd
-; AVX512-NEXT: ret
- %tmp1neg = sub <4 x i32> zeroinitializer, %a
- %b = icmp sle <4 x i32> %a, zeroinitializer
- %abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a
- ret <4 x i32> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test5:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <4 x i32> zeroinitializer, %a
+ %b = icmp sle <4 x i32> %a, zeroinitializer
+ %abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a
+ ret <4 x i32> %abs
}
define <8 x i32> @test6(<8 x i32> %a) nounwind {
+; SSE2-LABEL: test6:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test6:
-; SSSE3: pabsd
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: pabsd %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test6:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test6:
-; AVX2: vpabsd {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test6:
-; AVX512: vpabsd {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i32> zeroinitializer, %a
- %b = icmp sgt <8 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
- %abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
- ret <8 x i32> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <8 x i32> zeroinitializer, %a
+ %b = icmp sgt <8 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
+ ret <8 x i32> %abs
}
define <8 x i32> @test7(<8 x i32> %a) nounwind {
+; SSE2-LABEL: test7:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test7:
-; SSSE3: pabsd
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: pabsd %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test7:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test7:
-; AVX2: vpabsd {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test7:
-; AVX512: vpabsd {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i32> zeroinitializer, %a
- %b = icmp sge <8 x i32> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
- ret <8 x i32> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <8 x i32> zeroinitializer, %a
+ %b = icmp sge <8 x i32> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
+ ret <8 x i32> %abs
}
define <16 x i16> @test8(<16 x i16> %a) nounwind {
+; SSE2-LABEL: test8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test8:
-; SSSE3: pabsw
-; SSSE3: pabsw
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsw %xmm0, %xmm0
+; SSSE3-NEXT: pabsw %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test8:
-; AVX2: vpabsw {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsw %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test8:
-; AVX512: vpabsw {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <16 x i16> zeroinitializer, %a
- %b = icmp sgt <16 x i16> %a, zeroinitializer
- %abs = select <16 x i1> %b, <16 x i16> %a, <16 x i16> %tmp1neg
- ret <16 x i16> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsw %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <16 x i16> zeroinitializer, %a
+ %b = icmp sgt <16 x i16> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i16> %a, <16 x i16> %tmp1neg
+ ret <16 x i16> %abs
}
define <32 x i8> @test9(<32 x i8> %a) nounwind {
+; SSE2-LABEL: test9:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
+; SSE2-NEXT: paddb %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test9:
-; SSSE3: pabsb
-; SSSE3: pabsb
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsb %xmm0, %xmm0
+; SSSE3-NEXT: pabsb %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test9:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test9:
-; AVX2: vpabsb {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsb %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test9:
-; AVX512: vpabsb {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <32 x i8> zeroinitializer, %a
- %b = icmp slt <32 x i8> %a, zeroinitializer
- %abs = select <32 x i1> %b, <32 x i8> %tmp1neg, <32 x i8> %a
- ret <32 x i8> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsb %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <32 x i8> zeroinitializer, %a
+ %b = icmp slt <32 x i8> %a, zeroinitializer
+ %abs = select <32 x i1> %b, <32 x i8> %tmp1neg, <32 x i8> %a
+ ret <32 x i8> %abs
}
define <8 x i32> @test10(<8 x i32> %a) nounwind {
+; SSE2-LABEL: test10:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test10:
-; SSSE3: pabsd
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: pabsd %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test10:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test10:
-; AVX2: vpabsd {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test10:
-; AVX512: vpabsd {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i32> zeroinitializer, %a
- %b = icmp sle <8 x i32> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i32> %tmp1neg, <8 x i32> %a
- ret <8 x i32> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <8 x i32> zeroinitializer, %a
+ %b = icmp sle <8 x i32> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i32> %tmp1neg, <8 x i32> %a
+ ret <8 x i32> %abs
}
define <16 x i32> @test11(<16 x i32> %a) nounwind {
+; SSE2-LABEL: test11:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test11:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: pabsd %xmm1, %xmm1
+; SSSE3-NEXT: pabsd %xmm2, %xmm2
+; SSSE3-NEXT: pabsd %xmm3, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test11:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test11:
-; AVX2: vpabsd
-; AVX2: vpabsd
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: vpabsd %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test11:
-; AVX512: vpabsd {{.*}}%zmm
-; AVX512-NEXT: ret
- %tmp1neg = sub <16 x i32> zeroinitializer, %a
- %b = icmp sle <16 x i32> %a, zeroinitializer
- %abs = select <16 x i1> %b, <16 x i32> %tmp1neg, <16 x i32> %a
- ret <16 x i32> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsd %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <16 x i32> zeroinitializer, %a
+ %b = icmp sle <16 x i32> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i32> %tmp1neg, <16 x i32> %a
+ ret <16 x i32> %abs
}
define <8 x i64> @test12(<8 x i64> %a) nounwind {
+; SSE-LABEL: test12:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm3
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test12:
-; AVX2: vpxor
-; AVX2: vpxor
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test12:
-; AVX512: vpabsq {{.*}}%zmm
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i64> zeroinitializer, %a
- %b = icmp sle <8 x i64> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
- ret <8 x i64> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsq %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <8 x i64> zeroinitializer, %a
+ %b = icmp sle <8 x i64> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
+ ret <8 x i64> %abs
}
define <8 x i64> @test13(<8 x i64>* %a.ptr) nounwind {
+; SSE-LABEL: test13:
+; SSE: # BB#0:
+; SSE-NEXT: movdqu (%rdi), %xmm0
+; SSE-NEXT: movdqu 16(%rdi), %xmm1
+; SSE-NEXT: movdqu 32(%rdi), %xmm2
+; SSE-NEXT: movdqu 48(%rdi), %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm3
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test13:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test13:
-; AVX2: vpxor
-; AVX2: vpxor
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test13:
-; AVX512: vpabsq (%
-; AVX512-NEXT: ret
- %a = load <8 x i64>, <8 x i64>* %a.ptr, align 8
- %tmp1neg = sub <8 x i64> zeroinitializer, %a
- %b = icmp sle <8 x i64> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
- ret <8 x i64> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsq (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %a = load <8 x i64>, <8 x i64>* %a.ptr, align 8
+ %tmp1neg = sub <8 x i64> zeroinitializer, %a
+ %b = icmp sle <8 x i64> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
+ ret <8 x i64> %abs
}
diff --git a/test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll b/test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll
index 2ff8c3a9028f..0eb17fb6c14d 100644
--- a/test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll
+++ b/test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -o /dev/null -stop-after machine-scheduler %s | FileCheck %s --check-prefix=PRE-RA
-; RUN: llc -mtriple=x86_64-unknown-unknown -o /dev/null -stop-after prologepilog %s | FileCheck %s --check-prefix=POST-RA
+; RUN: llc -mtriple=x86_64-unknown-unknown -o - -stop-after machine-scheduler %s | FileCheck %s --check-prefix=PRE-RA
+; RUN: llc -mtriple=x86_64-unknown-unknown -o - -stop-after prologepilog %s | FileCheck %s --check-prefix=POST-RA
; This test verifies that the virtual register references in machine function's
; liveins are cleared after register allocation.
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
index 002561042688..d9f783756d1e 100644
--- a/test/CodeGen/X86/vselect-avx.ll
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -1,23 +1,29 @@
-; RUN: llc %s -o - -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx"
; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true>
; mask into <i32 2147483648, i32 0, i32 0, i32 2147483648> because we thought
; we would lower that into a blend where only the high bit is relevant.
; However, since the whole mask is constant, this is simplified incorrectly
; by the generic code, because it was expecting -1 in place of 2147483648.
-;
+;
; The problem does not occur without AVX, because vselect of v4i32 is not legal
; nor custom.
;
; <rdar://problem/18675020>
-; CHECK-LABEL: test:
-; CHECK: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
-; CHECK: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
-; CHECK: ret
define void @test(<4 x i16>* %a, <4 x i16>* %b) {
+; CHECK-LABEL: test:
+; CHECK: ## BB#0: ## %body
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
+; CHECK-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vmovq %xmm1, (%rdi)
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
+; CHECK-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovq %xmm0, (%rsi)
+; CHECK-NEXT: retq
body:
%predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>
%predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
@@ -31,17 +37,22 @@ body:
; When shrinking the condition used into the select to match a blend, this
; test case exercises the path where the modified node is not the root
; of the condition.
-;
-; CHECK-LABEL: test2:
-; CHECK: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
-; CHECK: vblendvpd [[MASK]]
-; CHECK: retq
+
define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0: ## %bb
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: movq (%rdi,%rsi,8), %rax
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01]
+; CHECK-NEXT: vblendvpd %ymm0, {{.*}}(%rip), %ymm1, %ymm0
+; CHECK-NEXT: vmovupd %ymm0, (%rax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
bb:
%arrayidx1928 = getelementptr inbounds double*, double** %call1559, i64 %indvars.iv4198
%tmp1888 = load double*, double** %arrayidx1928, align 8
@@ -57,22 +68,32 @@ bb:
; to be optimized into a and. In that case, the conditional mask was wrong.
;
; Make sure that the and is fed by the original mask.
-;
+;
; <rdar://problem/18819506>
-; CHECK-LABEL: test3:
-; Compute the mask.
-; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[MASK:%xmm[0-9]+]]
-; Do not shrink the bit of the mask.
-; CHECK-NOT: vpslld $31, [[MASK]], {{%xmm[0-9]+}}
-; Use the mask in the blend.
-; CHECK-NEXT: vblendvps [[MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
-; Shuffle mask to truncate.
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK: vpshufb %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
-; CHECK: vpshufb %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
-; CHECK: retq
define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; CHECK-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; CHECK-NEXT: vpsrld $31, %xmm3, %xmm4
+; CHECK-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
+; CHECK-NEXT: vpsubd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmovq %xmm0, (%rdi)
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vmovq %xmm0, (%rsi)
+; CHECK-NEXT: retq
%tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
%tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
%predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
@@ -85,11 +106,24 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
; We shouldn't try to lower this directly using VSELECT because we don't have
; vpblendvb in AVX1, only in AVX2. Instead, it should be expanded.
-;
-; CHECK-LABEL: PR22706:
-; CHECK: vpcmpgtb
-; CHECK: vpcmpgtb
+
define <32 x i8> @PR22706(<32 x i1> %x) {
+; CHECK-LABEL: PR22706:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm1
+; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
ret <32 x i8> %tmp
}
diff --git a/test/CodeGen/X86/vselect-minmax.ll b/test/CodeGen/X86/vselect-minmax.ll
index edf2a442918a..8e9f1d980913 100644
--- a/test/CodeGen/X86/vselect-minmax.ll
+++ b/test/CodeGen/X86/vselect-minmax.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
diff --git a/test/CodeGen/X86/vzero-excess.ll b/test/CodeGen/X86/vzero-excess.ll
new file mode 100644
index 000000000000..0ed90741b61e
--- /dev/null
+++ b/test/CodeGen/X86/vzero-excess.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+
+; In the following 4 tests, the existing call to VZU/VZA ensures clean state before
+; the call to the unknown, so we don't need to insert a second VZU at that point.
+
+define <4 x float> @zeroupper_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
+; CHECK-LABEL: zeroupper_v4f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $48, %rsp
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq the_unknown
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: addq $48, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx.vzeroupper()
+ call void @the_unknown()
+ %loadx = load <8 x float>, <8 x float> *%x, align 32
+ %sum = fadd <8 x float> %loadx, %y
+ %lo = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %hi = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = fadd <4 x float> %lo, %hi
+ ret <4 x float> %res
+}
+
+define <8 x float> @zeroupper_v8f32(<8 x float> %x) nounwind {
+; CHECK-LABEL: zeroupper_v8f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq the_unknown
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx.vzeroupper()
+ call void @the_unknown()
+ ret <8 x float> %x
+}
+
+define <4 x float> @zeroall_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
+; CHECK-LABEL: zeroall_v4f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $48, %rsp
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: vzeroall
+; CHECK-NEXT: callq the_unknown
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: addq $48, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx.vzeroall()
+ call void @the_unknown()
+ %loadx = load <8 x float>, <8 x float> *%x, align 32
+ %sum = fadd <8 x float> %loadx, %y
+ %lo = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %hi = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = fadd <4 x float> %lo, %hi
+ ret <4 x float> %res
+}
+
+define <8 x float> @zeroall_v8f32(<8 x float> %x) nounwind {
+; CHECK-LABEL: zeroall_v8f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vzeroall
+; CHECK-NEXT: callq the_unknown
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx.vzeroall()
+ call void @the_unknown()
+ ret <8 x float> %x
+}
+
+declare void @llvm.x86.avx.vzeroupper() nounwind readnone
+declare void @llvm.x86.avx.vzeroall() nounwind readnone
+declare void @the_unknown() nounwind
+
diff --git a/test/CodeGen/X86/warn-stack.ll b/test/CodeGen/X86/warn-stack.ll
index aa09ad8066fe..7353d073e630 100644
--- a/test/CodeGen/X86/warn-stack.ll
+++ b/test/CodeGen/X86/warn-stack.ll
@@ -12,7 +12,7 @@ entry:
ret void
}
-; CHECK: warning: stack size limit exceeded (104) in warn
+; CHECK: warning: stack size limit exceeded (88) in warn
define void @warn() nounwind ssp {
entry:
%buffer = alloca [80 x i8], align 1
diff --git a/test/CodeGen/X86/weak_def_can_be_hidden.ll b/test/CodeGen/X86/weak_def_can_be_hidden.ll
index 8e6d34c89d88..516bc02cc2f8 100644
--- a/test/CodeGen/X86/weak_def_can_be_hidden.ll
+++ b/test/CodeGen/X86/weak_def_can_be_hidden.ll
@@ -4,7 +4,7 @@
; RUN: llc -mtriple=i686-apple-darwin9 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
; RUN: llc -mtriple=i686-apple-darwin8 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
-@v1 = linkonce_odr constant i32 32
+@v1 = linkonce_odr local_unnamed_addr constant i32 32
; CHECK: .globl _v1
; CHECK: .weak_def_can_be_hidden _v1
@@ -27,7 +27,7 @@ define i32* @f2() {
ret i32* @v2
}
-@v3 = linkonce_odr unnamed_addr global i32 32
+@v3 = linkonce_odr unnamed_addr constant i32 32
; CHECK: .globl _v3
; CHECK: .weak_def_can_be_hidden _v3
@@ -38,9 +38,9 @@ define i32* @f3() {
ret i32* @v3
}
-@v4 = linkonce_odr global i32 32
+@v4 = linkonce_odr unnamed_addr global i32 32
; CHECK: .globl _v4
-; CHECK: .weak_definition _v4
+; CHECK: .weak_def_can_be_hidden _v4
; CHECK-D89: .globl _v4
; CHECK-D89: .weak_definition _v4
diff --git a/test/CodeGen/X86/widen_bitops-0.ll b/test/CodeGen/X86/widen_bitops-0.ll
new file mode 100644
index 000000000000..f8316d0e1ea2
--- /dev/null
+++ b/test/CodeGen/X86/widen_bitops-0.ll
@@ -0,0 +1,307 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE42
+
+;
+; AND/XOR/OR i24 as v3i8
+;
+
+define i24 @and_i24_as_v3i8(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: and_i24_as_v3i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_i24_as_v3i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <3 x i8>
+ %2 = bitcast i24 %b to <3 x i8>
+ %3 = and <3 x i8> %1, %2
+ %4 = bitcast <3 x i8> %3 to i24
+ ret i24 %4
+}
+
+define i24 @xor_i24_as_v3i8(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: xor_i24_as_v3i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_i24_as_v3i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <3 x i8>
+ %2 = bitcast i24 %b to <3 x i8>
+ %3 = xor <3 x i8> %1, %2
+ %4 = bitcast <3 x i8> %3 to i24
+ ret i24 %4
+}
+
+define i24 @or_i24_as_v3i8(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: or_i24_as_v3i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_i24_as_v3i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <3 x i8>
+ %2 = bitcast i24 %b to <3 x i8>
+ %3 = or <3 x i8> %1, %2
+ %4 = bitcast <3 x i8> %3 to i24
+ ret i24 %4
+}
+
+;
+; AND/XOR/OR i24 as v8i3
+;
+
+define i24 @and_i24_as_v8i3(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: and_i24_as_v8i3:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_i24_as_v8i3:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <8 x i3>
+ %2 = bitcast i24 %b to <8 x i3>
+ %3 = and <8 x i3> %1, %2
+ %4 = bitcast <8 x i3> %3 to i24
+ ret i24 %4
+}
+
+define i24 @xor_i24_as_v8i3(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: xor_i24_as_v8i3:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_i24_as_v8i3:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <8 x i3>
+ %2 = bitcast i24 %b to <8 x i3>
+ %3 = xor <8 x i3> %1, %2
+ %4 = bitcast <8 x i3> %3 to i24
+ ret i24 %4
+}
+
+define i24 @or_i24_as_v8i3(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: or_i24_as_v8i3:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_i24_as_v8i3:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <8 x i3>
+ %2 = bitcast i24 %b to <8 x i3>
+ %3 = or <8 x i3> %1, %2
+ %4 = bitcast <8 x i3> %3 to i24
+ ret i24 %4
+}
+
+;
+; AND/XOR/OR v3i8 as i24
+;
+
+define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
+; X32-SSE-LABEL: and_v3i8_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_v3i8_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: movd %edi, %xmm1
+; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pand %xmm0, %xmm1
+; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: retq
+ %1 = bitcast <3 x i8> %a to i24
+ %2 = bitcast <3 x i8> %b to i24
+ %3 = and i24 %1, %2
+ %4 = bitcast i24 %3 to <3 x i8>
+ ret <3 x i8> %4
+}
+
+define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
+; X32-SSE-LABEL: xor_v3i8_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pxor %xmm0, %xmm1
+; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_v3i8_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: movd %edi, %xmm1
+; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pxor %xmm0, %xmm1
+; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: retq
+ %1 = bitcast <3 x i8> %a to i24
+ %2 = bitcast <3 x i8> %b to i24
+ %3 = xor i24 %1, %2
+ %4 = bitcast i24 %3 to <3 x i8>
+ ret <3 x i8> %4
+}
+
+define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
+; X32-SSE-LABEL: or_v3i8_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: por %xmm0, %xmm1
+; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_v3i8_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: movd %edi, %xmm1
+; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: por %xmm0, %xmm1
+; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: retq
+ %1 = bitcast <3 x i8> %a to i24
+ %2 = bitcast <3 x i8> %b to i24
+ %3 = or i24 %1, %2
+ %4 = bitcast i24 %3 to <3 x i8>
+ ret <3 x i8> %4
+}
+
+;
+; AND/XOR/OR v8i3 as i24
+;
+
+define <8 x i3> @and_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
+; X32-SSE-LABEL: and_v8i3_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: andps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_v8i3_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i3> %a to i24
+ %2 = bitcast <8 x i3> %b to i24
+ %3 = and i24 %1, %2
+ %4 = bitcast i24 %3 to <8 x i3>
+ ret <8 x i3> %4
+}
+
+define <8 x i3> @xor_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
+; X32-SSE-LABEL: xor_v8i3_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_v8i3_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i3> %a to i24
+ %2 = bitcast <8 x i3> %b to i24
+ %3 = xor i24 %1, %2
+ %4 = bitcast i24 %3 to <8 x i3>
+ ret <8 x i3> %4
+}
+
+define <8 x i3> @or_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
+; X32-SSE-LABEL: or_v8i3_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: orps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_v8i3_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i3> %a to i24
+ %2 = bitcast <8 x i3> %b to i24
+ %3 = or i24 %1, %2
+ %4 = bitcast i24 %3 to <8 x i3>
+ ret <8 x i3> %4
+}
diff --git a/test/CodeGen/X86/widen_bitops-1.ll b/test/CodeGen/X86/widen_bitops-1.ll
new file mode 100644
index 000000000000..f2a6b22c2af4
--- /dev/null
+++ b/test/CodeGen/X86/widen_bitops-1.ll
@@ -0,0 +1,235 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE42
+
+;
+; AND/XOR/OR i32 as v4i8
+;
+
+define i32 @and_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: and_i32_as_v4i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_i32_as_v4i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <4 x i8>
+ %2 = bitcast i32 %b to <4 x i8>
+ %3 = and <4 x i8> %1, %2
+ %4 = bitcast <4 x i8> %3 to i32
+ ret i32 %4
+}
+
+define i32 @xor_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: xor_i32_as_v4i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_i32_as_v4i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <4 x i8>
+ %2 = bitcast i32 %b to <4 x i8>
+ %3 = xor <4 x i8> %1, %2
+ %4 = bitcast <4 x i8> %3 to i32
+ ret i32 %4
+}
+
+define i32 @or_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: or_i32_as_v4i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_i32_as_v4i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <4 x i8>
+ %2 = bitcast i32 %b to <4 x i8>
+ %3 = or <4 x i8> %1, %2
+ %4 = bitcast <4 x i8> %3 to i32
+ ret i32 %4
+}
+
+;
+; AND/XOR/OR i32 as v8i4
+;
+
+define i32 @and_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: and_i32_as_v8i4:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_i32_as_v8i4:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <8 x i4>
+ %2 = bitcast i32 %b to <8 x i4>
+ %3 = and <8 x i4> %1, %2
+ %4 = bitcast <8 x i4> %3 to i32
+ ret i32 %4
+}
+
+define i32 @xor_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: xor_i32_as_v8i4:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_i32_as_v8i4:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <8 x i4>
+ %2 = bitcast i32 %b to <8 x i4>
+ %3 = xor <8 x i4> %1, %2
+ %4 = bitcast <8 x i4> %3 to i32
+ ret i32 %4
+}
+
+define i32 @or_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: or_i32_as_v8i4:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_i32_as_v8i4:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <8 x i4>
+ %2 = bitcast i32 %b to <8 x i4>
+ %3 = or <8 x i4> %1, %2
+ %4 = bitcast <8 x i4> %3 to i32
+ ret i32 %4
+}
+
+;
+; AND/XOR/OR v4i8 as i32
+;
+
+define <4 x i8> @and_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: and_v4i8_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: andps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_v4i8_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <4 x i8> %a to i32
+ %2 = bitcast <4 x i8> %b to i32
+ %3 = and i32 %1, %2
+ %4 = bitcast i32 %3 to <4 x i8>
+ ret <4 x i8> %4
+}
+
+define <4 x i8> @xor_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: xor_v4i8_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_v4i8_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <4 x i8> %a to i32
+ %2 = bitcast <4 x i8> %b to i32
+ %3 = xor i32 %1, %2
+ %4 = bitcast i32 %3 to <4 x i8>
+ ret <4 x i8> %4
+}
+
+define <4 x i8> @or_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: or_v4i8_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: orps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_v4i8_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <4 x i8> %a to i32
+ %2 = bitcast <4 x i8> %b to i32
+ %3 = or i32 %1, %2
+ %4 = bitcast i32 %3 to <4 x i8>
+ ret <4 x i8> %4
+}
+
+;
+; AND/XOR/OR v8i4 as i32
+;
+
+define <8 x i4> @and_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
+; X32-SSE-LABEL: and_v8i4_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: andps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_v8i4_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i4> %a to i32
+ %2 = bitcast <8 x i4> %b to i32
+ %3 = and i32 %1, %2
+ %4 = bitcast i32 %3 to <8 x i4>
+ ret <8 x i4> %4
+}
+
+define <8 x i4> @xor_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
+; X32-SSE-LABEL: xor_v8i4_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_v8i4_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i4> %a to i32
+ %2 = bitcast <8 x i4> %b to i32
+ %3 = xor i32 %1, %2
+ %4 = bitcast i32 %3 to <8 x i4>
+ ret <8 x i4> %4
+}
+
+define <8 x i4> @or_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
+; X32-SSE-LABEL: or_v8i4_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: orps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_v8i4_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i4> %a to i32
+ %2 = bitcast <8 x i4> %b to i32
+ %3 = or i32 %1, %2
+ %4 = bitcast i32 %3 to <8 x i4>
+ ret <8 x i4> %4
+}
diff --git a/test/CodeGen/X86/widen_compare-1.ll b/test/CodeGen/X86/widen_compare-1.ll
new file mode 100644
index 000000000000..8ea0db53a391
--- /dev/null
+++ b/test/CodeGen/X86/widen_compare-1.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
+
+; compare v2i16
+
+define <2 x i16> @compare_v2i64_to_v2i16(<2 x i16>* %src) nounwind {
+; X86-LABEL: compare_v2i64_to_v2i16:
+; X86: # BB#0:
+; X86-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0]
+; X86-NEXT: retl
+;
+; X64-LABEL: compare_v2i64_to_v2i16:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [65535,65535]
+; X64-NEXT: retq
+ %val = load <2 x i16>, <2 x i16>* %src, align 4
+ %cmp = icmp uge <2 x i16> %val, %val
+ %sel = select <2 x i1> %cmp, <2 x i16> <i16 -1, i16 -1>, <2 x i16> zeroinitializer
+ ret <2 x i16> %sel
+}
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 3f54ab694c07..cf5a8abda18c 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -1,12 +1,101 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: paddd
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
; truncate v2i64 to v2i32
-define void @convert(<2 x i32>* %dst.addr, <2 x i64> %src) nounwind {
+define void @convert_v2i64_to_v2i32(<2 x i32>* %dst.addr, <2 x i64> %src) nounwind {
+; X86-LABEL: convert_v2i64_to_v2i32:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: paddd .LCPI0_0, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: convert_v2i64_to_v2i32:
+; X64: # BB#0: # %entry
+; X64-NEXT: paddd {{.*}}(%rip), %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
%val = trunc <2 x i64> %src to <2 x i32>
%add = add <2 x i32> %val, < i32 1, i32 1 >
store <2 x i32> %add, <2 x i32>* %dst.addr
ret void
}
+
+; truncate v3i32 to v3i8
+
+define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) nounwind {
+; X86-LABEL: convert_v3i32_to_v3i8:
+; X86: # BB#0: # %entry
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movdqa (%ecx), %xmm0
+; X86-NEXT: paddd .LCPI1_0, %xmm0
+; X86-NEXT: pextrb $8, %xmm0, 2(%eax)
+; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: movw %cx, (%eax)
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: convert_v3i32_to_v3i8:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa (%rsi), %xmm0
+; X64-NEXT: paddd {{.*}}(%rip), %xmm0
+; X64-NEXT: pextrb $8, %xmm0, 2(%rdi)
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: retq
+entry:
+ %load = load <3 x i32>, <3 x i32>* %src.addr
+ %val = trunc <3 x i32> %load to <3 x i8>
+ %add = add <3 x i8> %val, < i8 1, i8 1, i8 1 >
+ store <3 x i8> %add, <3 x i8>* %dst.addr
+ ret void
+}
+
+; truncate v5i16 to v5i8
+
+define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind {
+; X86-LABEL: convert_v5i16_to_v5i8:
+; X86: # BB#0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movdqa (%ecx), %xmm0
+; X86-NEXT: paddw .LCPI2_0, %xmm0
+; X86-NEXT: pextrb $8, %xmm0, 4(%eax)
+; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-NEXT: movd %xmm0, (%eax)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: convert_v5i16_to_v5i8:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa (%rsi), %xmm0
+; X64-NEXT: paddw {{.*}}(%rip), %xmm0
+; X64-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-NEXT: movd %xmm0, (%rdi)
+; X64-NEXT: retq
+entry:
+ %load = load <5 x i16>, <5 x i16>* %src.addr
+ %val = trunc <5 x i16> %load to <5 x i8>
+ %add = add <5 x i8> %val, < i8 1, i8 1, i8 1, i8 1, i8 1 >
+ store <5 x i8> %add, <5 x i8>* %dst.addr
+ ret void
+}
diff --git a/test/CodeGen/X86/widen_conv-2.ll b/test/CodeGen/X86/widen_conv-2.ll
index c8646c6489a1..015b0faa9827 100644
--- a/test/CodeGen/X86/widen_conv-2.ll
+++ b/test/CodeGen/X86/widen_conv-2.ll
@@ -1,11 +1,26 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: psllq $48, %xmm0
-; CHECK: psrad $16, %xmm0
-; CHECK: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
; sign extension v2i16 to v2i32
-define void @convert(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind {
+define void @convert_v2i16_v2i32(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind {
+; X86-LABEL: convert_v2i16_v2i32:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: psllq $48, %xmm0
+; X86-NEXT: psrad $16, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: convert_v2i16_v2i32:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllq $48, %xmm0
+; X64-NEXT: psrad $16, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: movq %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
 %signext = sext <2 x i16> %src to <2 x i32> ; <<2 x i32>> [#uses=1]
store <2 x i32> %signext, <2 x i32>* %dst.addr
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index 0a6eea049d37..e8fa1043e9f0 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -1,11 +1,150 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: cvtdq2ps
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE42
; sign to float v2i16 to v2f32
-define void @convert(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
+define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
+; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
+; X86-SSE2: # BB#0: # %entry
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: psllq $48, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
+; X86-SSE42: # BB#0: # %entry
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: psllq $48, %xmm0
+; X86-SSE42-NEXT: psrad $16, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
+; X86-SSE42-NEXT: movss %xmm0, (%eax)
+; X86-SSE42-NEXT: retl
+;
+; X64-LABEL: convert_v2i16_to_v2f32:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllq $48, %xmm0
+; X64-NEXT: psrad $16, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-NEXT: movlps %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
%val = sitofp <2 x i16> %src to <2 x float>
- store <2 x float> %val, <2 x float>* %dst.addr
+ store <2 x float> %val, <2 x float>* %dst.addr, align 4
+ ret void
+}
+
+; sign to float v3i8 to v3f32
+
+define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
+; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
+; X86-SSE2: # BB#0: # %entry
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $32, %esp
+; X86-SSE2-NEXT: movl 8(%ebp), %eax
+; X86-SSE2-NEXT: movl 12(%ebp), %ecx
+; X86-SSE2-NEXT: movzwl (%ecx), %edx
+; X86-SSE2-NEXT: movd %edx, %xmm0
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT: shll $8, %edx
+; X86-SSE2-NEXT: movzbl (%esp), %esi
+; X86-SSE2-NEXT: orl %edx, %esi
+; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0
+; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
+; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: psrad $24, %xmm0
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: movaps %xmm0, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
+; X86-SSE2-NEXT: leal -4(%ebp), %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
+; X86-SSE42: # BB#0: # %entry
+; X86-SSE42-NEXT: pushl %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movzbl 2(%ecx), %edx
+; X86-SSE42-NEXT: movzwl (%ecx), %ecx
+; X86-SSE42-NEXT: movd %ecx, %xmm0
+; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0
+; X86-SSE42-NEXT: pslld $24, %xmm0
+; X86-SSE42-NEXT: psrad $24, %xmm0
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax)
+; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
+; X86-SSE42-NEXT: movss %xmm0, (%eax)
+; X86-SSE42-NEXT: popl %eax
+; X86-SSE42-NEXT: retl
+;
+; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
+; X64-SSE2: # BB#0: # %entry
+; X64-SSE2-NEXT: movzwl (%rsi), %eax
+; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE2-NEXT: shll $8, %eax
+; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-SSE2-NEXT: orl %eax, %ecx
+; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
+; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
+; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: psrad $24, %xmm0
+; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
+; X64-SSE42: # BB#0: # %entry
+; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
+; X64-SSE42-NEXT: movzwl (%rsi), %ecx
+; X64-SSE42-NEXT: movd %rcx, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
+; X64-SSE42-NEXT: pslld $24, %xmm0
+; X64-SSE42-NEXT: psrad $24, %xmm0
+; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
+; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE42-NEXT: retq
+entry:
+ %load = load <3 x i8>, <3 x i8>* %src.addr, align 1
+ %cvt = sitofp <3 x i8> %load to <3 x float>
+ store <3 x float> %cvt, <3 x float>* %dst.addr, align 4
ret void
}
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index f633592f2ef8..71b7976ab8bd 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -1,11 +1,174 @@
-; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse4.2 | FileCheck %s
-; CHECK-NOT: cvtsi2ss
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE42
; unsigned to float v7i16 to v7f32
-define void @convert(<7 x float>* %dst.addr, <7 x i16> %src) nounwind {
+define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwind {
+; X86-SSE2-LABEL: convert_v7i16_v7f32:
+; X86-SSE2: # BB#0: # %entry
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movups %xmm0, (%eax)
+; X86-SSE2-NEXT: movss %xmm2, 16(%eax)
+; X86-SSE2-NEXT: movaps %xmm2, %xmm0
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X86-SSE2-NEXT: movss %xmm0, 24(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movss %xmm2, 20(%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v7i16_v7f32:
+; X86-SSE42: # BB#0: # %entry
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: pxor %xmm1, %xmm1
+; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1
+; X86-SSE42-NEXT: extractps $2, %xmm0, 24(%eax)
+; X86-SSE42-NEXT: extractps $1, %xmm0, 20(%eax)
+; X86-SSE42-NEXT: movups %xmm1, (%eax)
+; X86-SSE42-NEXT: movss %xmm0, 16(%eax)
+; X86-SSE42-NEXT: retl
+;
+; X64-SSE2-LABEL: convert_v7i16_v7f32:
+; X64-SSE2: # BB#0: # %entry
+; X64-SSE2-NEXT: pxor %xmm1, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
+; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movlps %xmm0, 16(%rdi)
+; X64-SSE2-NEXT: movups %xmm2, (%rdi)
+; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movss %xmm0, 24(%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: convert_v7i16_v7f32:
+; X64-SSE42: # BB#0: # %entry
+; X64-SSE42-NEXT: pxor %xmm1, %xmm1
+; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1
+; X64-SSE42-NEXT: extractps $2, %xmm0, 24(%rdi)
+; X64-SSE42-NEXT: movlps %xmm0, 16(%rdi)
+; X64-SSE42-NEXT: movups %xmm1, (%rdi)
+; X64-SSE42-NEXT: retq
entry:
- %val = sitofp <7 x i16> %src to <7 x float>
- store <7 x float> %val, <7 x float>* %dst.addr
+ %val = uitofp <7 x i16> %src to <7 x float>
+ store <7 x float> %val, <7 x float>* %dst.addr, align 4
+ ret void
+}
+
+; unsigned to float v3i8 to v3f32
+
+define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
+; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
+; X86-SSE2: # BB#0: # %entry
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $32, %esp
+; X86-SSE2-NEXT: movl 8(%ebp), %eax
+; X86-SSE2-NEXT: movl 12(%ebp), %ecx
+; X86-SSE2-NEXT: movzwl (%ecx), %edx
+; X86-SSE2-NEXT: movd %edx, %xmm0
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT: shll $8, %edx
+; X86-SSE2-NEXT: movzbl (%esp), %esi
+; X86-SSE2-NEXT: orl %edx, %esi
+; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0
+; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
+; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: movaps %xmm0, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
+; X86-SSE2-NEXT: leal -4(%ebp), %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
+; X86-SSE42: # BB#0: # %entry
+; X86-SSE42-NEXT: pushl %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movzbl 2(%ecx), %edx
+; X86-SSE42-NEXT: movzwl (%ecx), %ecx
+; X86-SSE42-NEXT: movd %ecx, %xmm0
+; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0
+; X86-SSE42-NEXT: pand .LCPI1_0, %xmm0
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax)
+; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
+; X86-SSE42-NEXT: movss %xmm0, (%eax)
+; X86-SSE42-NEXT: popl %eax
+; X86-SSE42-NEXT: retl
+;
+; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
+; X64-SSE2: # BB#0: # %entry
+; X64-SSE2-NEXT: movzwl (%rsi), %eax
+; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE2-NEXT: shll $8, %eax
+; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-SSE2-NEXT: orl %eax, %ecx
+; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
+; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
+; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
+; X64-SSE2-NEXT: pxor %xmm1, %xmm1
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
+; X64-SSE42: # BB#0: # %entry
+; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
+; X64-SSE42-NEXT: movzwl (%rsi), %ecx
+; X64-SSE42-NEXT: movd %rcx, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
+; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0
+; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
+; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE42-NEXT: retq
+entry:
+ %load = load <3 x i8>, <3 x i8>* %src.addr, align 1
+ %cvt = uitofp <3 x i8> %load to <3 x float>
+ store <3 x float> %cvt, <3 x float>* %dst.addr, align 4
ret void
}
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index c670b45df747..810e409c175c 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,5 +1,5 @@
-; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
-; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
+; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
; PR4891
; PR5626
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index fad1fa32559a..00aeb009b638 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse4.2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s
; Test based on pr5626 to load/store
;
@@ -6,10 +7,13 @@
%i32vec3 = type <3 x i32>
define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
-; CHECK-NEXT: pextrd $2, %[[R0]], 8(%{{.*}})
-; CHECK-NEXT: movq %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: paddd (%rdx), %xmm0
+; CHECK-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; CHECK-NEXT: movq %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i32vec3, %i32vec3* %ap, align 16
%b = load %i32vec3, %i32vec3* %bp, align 16
%x = add %i32vec3 %a, %b
@@ -19,13 +23,16 @@ define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32_2:
-; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R0]]
-; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R1]]
-; CHECK-NEXT: paddd %[[R0]], %[[R1]]
-; CHECK-NEXT: pextrd $2, %[[R1]], 8(%{{.*}})
-; CHECK-NEXT: movq %[[R1]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: pinsrd $2, 8(%rsi), %xmm0
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pinsrd $2, 8(%rdx), %xmm1
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pextrd $2, %xmm1, 8(%rdi)
+; CHECK-NEXT: movq %xmm1, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i32vec3, %i32vec3* %ap, align 8
%b = load %i32vec3, %i32vec3* %bp, align 8
%x = add %i32vec3 %a, %b
@@ -36,13 +43,16 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
%i32vec7 = type <7 x i32>
define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; CHECK-LABEL: add7i32:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
-; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: paddd (%rdx), %xmm0
+; CHECK-NEXT: paddd 16(%rdx), %xmm1
+; CHECK-NEXT: pextrd $2, %xmm1, 24(%rdi)
+; CHECK-NEXT: movq %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i32vec7, %i32vec7* %ap, align 16
%b = load %i32vec7, %i32vec7* %bp, align 16
%x = add %i32vec7 %a, %b
@@ -53,15 +63,18 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
%i32vec12 = type <12 x i32>
define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; CHECK-LABEL: add12i32:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
-; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: paddd 32(%{{.*}}), %[[R2]]
-; CHECK-NEXT: movdqa %[[R2]], 32(%{{.*}})
-; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: movdqa 32(%rsi), %xmm2
+; CHECK-NEXT: paddd (%rdx), %xmm0
+; CHECK-NEXT: paddd 16(%rdx), %xmm1
+; CHECK-NEXT: paddd 32(%rdx), %xmm2
+; CHECK-NEXT: movdqa %xmm2, 32(%rdi)
+; CHECK-NEXT: movdqa %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i32vec12, %i32vec12* %ap, align 16
%b = load %i32vec12, %i32vec12* %bp, align 16
%x = add %i32vec12 %a, %b
@@ -73,13 +86,16 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
%i16vec3 = type <3 x i16>
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; CHECK-LABEL: add3i16:
-; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddd %[[R0]], %[[R1]]
-; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}})
-; CHECK-NEXT: pshufb {{.*}}, %[[R1]]
-; CHECK-NEXT: pmovzxdq %[[R1]], %[[R0]]
-; CHECK-NEXT: movd %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pextrw $4, %xmm1, 4(%rdi)
+; CHECK-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; CHECK-NEXT: movd %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i16vec3, %i16vec3* %ap, align 16
%b = load %i16vec3, %i16vec3* %bp, align 16
%x = add %i16vec3 %a, %b
@@ -90,10 +106,13 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
%i16vec4 = type <4 x i16>
define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; CHECK-LABEL: add4i16:
-; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddw %[[R0]], %[[R1]]
-; CHECK-NEXT: movq %[[R1]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: paddw %xmm0, %xmm1
+; CHECK-NEXT: movq %xmm1, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i16vec4, %i16vec4* %ap, align 16
%b = load %i16vec4, %i16vec4* %bp, align 16
%x = add %i16vec4 %a, %b
@@ -104,12 +123,15 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp
%i16vec12 = type <12 x i16>
define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; CHECK-LABEL: add12i16:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: paddw (%rdx), %xmm0
+; CHECK-NEXT: paddw 16(%rdx), %xmm1
+; CHECK-NEXT: movq %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i16vec12, %i16vec12* %ap, align 16
%b = load %i16vec12, %i16vec12* %bp, align 16
%x = add %i16vec12 %a, %b
@@ -120,15 +142,18 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12*
%i16vec18 = type <18 x i16>
define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; CHECK-LABEL: add18i16:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
-; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: paddw 32(%{{.*}}), %[[R2]]
-; CHECK-NEXT: movd %[[R2]], 32(%{{.*}})
-; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: movdqa 32(%rsi), %xmm2
+; CHECK-NEXT: paddw (%rdx), %xmm0
+; CHECK-NEXT: paddw 16(%rdx), %xmm1
+; CHECK-NEXT: paddw 32(%rdx), %xmm2
+; CHECK-NEXT: movd %xmm2, 32(%rdi)
+; CHECK-NEXT: movdqa %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i16vec18, %i16vec18* %ap, align 16
%b = load %i16vec18, %i16vec18* %bp, align 16
%x = add %i16vec18 %a, %b
@@ -140,14 +165,17 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18*
%i8vec3 = type <3 x i8>
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; CHECK-LABEL: add3i8:
-; CHECK: pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddd %[[R0]], %[[R1]]
-; CHECK-NEXT: pextrb $8, %[[R1]], 2(%{{.*}})
-; CHECK-NEXT: pshufb {{.*}}, %[[R1]]
-; CHECK-NEXT: pmovzxwq %[[R1]], %[[R0]]
-; CHECK-NEXT: movd %[[R0]], %e[[R2:[abcd]]]x
-; CHECK-NEXT: movw %[[R2]]x, (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pextrb $8, %xmm1, 2(%rdi)
+; CHECK-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movw %ax, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i8vec3, %i8vec3* %ap, align 16
%b = load %i8vec3, %i8vec3* %bp, align 16
%x = add %i8vec3 %a, %b
@@ -158,15 +186,18 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
%i8vec31 = type <31 x i8>
define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; CHECK-LABEL: add31i8:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddb (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddb 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: pextrb $14, %[[R1]], 30(%{{.*}})
-; CHECK-NEXT: pextrw $6, %[[R1]], 28(%{{.*}})
-; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
-; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: paddb (%rdx), %xmm0
+; CHECK-NEXT: paddb 16(%rdx), %xmm1
+; CHECK-NEXT: pextrb $14, %xmm1, 30(%rdi)
+; CHECK-NEXT: pextrw $6, %xmm1, 28(%rdi)
+; CHECK-NEXT: pextrd $2, %xmm1, 24(%rdi)
+; CHECK-NEXT: movq %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i8vec31, %i8vec31* %ap, align 16
%b = load %i8vec31, %i8vec31* %bp, align 16
%x = add %i8vec31 %a, %b
@@ -178,29 +209,31 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
%i8vec3pack = type { <3 x i8>, i8 }
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; CHECK-LABEL: rot:
-; CHECK: movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
-; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]]
-; CHECK-NEXT: pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
-; CHECK-NEXT: movd %[[CONSTANT0]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: movw %[[R0]]x, (%[[PTR0:.*]])
-; CHECK-NEXT: movb $-98, 2(%[[PTR0]])
-; CHECK-NEXT: movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
-; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]]
-; CHECK-NEXT: pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
-; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
-; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
-; CHECK-NEXT: movb $1, 2(%[[PTR1]])
-; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa %[[X0]], %[[X1:xmm[0-9]+]]
-; CHECK-NEXT: psrld $1, %[[X1]]
-; CHECK-NEXT: pblendw $192, %[[X0]], %[[X1]]
-; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
-; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X1]]
-; CHECK-NEXT: pmovzxwq %[[X1]], %[[X3:xmm[0-9]+]]
-; CHECK-NEXT: movd %[[X3]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: movw %[[R0]]x, (%{{.*}})
-
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u>
+; CHECK-NEXT: pshufb %xmm0, %xmm1
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm1, %eax
+; CHECK-NEXT: movw %ax, (%rsi)
+; CHECK-NEXT: movb $-98, 2(%rsi)
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u>
+; CHECK-NEXT: pshufb %xmm0, %xmm1
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movw %ax, (%rdx)
+; CHECK-NEXT: movb $1, 2(%rdx)
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $1, %xmm1
+; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: pextrb $8, %xmm1, 2(%rdi)
+; CHECK-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movw %ax, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
entry:
%storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
diff --git a/test/CodeGen/X86/win-alloca-expander.ll b/test/CodeGen/X86/win-alloca-expander.ll
new file mode 100644
index 000000000000..45ca3b214ab8
--- /dev/null
+++ b/test/CodeGen/X86/win-alloca-expander.ll
@@ -0,0 +1,154 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-win32 -O0
+
+%struct.S = type { [1024 x i8] }
+%struct.T = type { [3000 x i8] }
+%struct.U = type { [10000 x i8] }
+
+define void @basics() {
+; CHECK-LABEL: basics:
+entry:
+ br label %bb1
+
+; Allocation move sizes should have been removed.
+; CHECK-NOT: movl $1024
+; CHECK-NOT: movl $3000
+
+bb1:
+ %p0 = alloca %struct.S
+; The allocation is small enough not to require stack probing, but the %esp
+; offset after the prologue is not known, so the stack must be touched before
+; the pointer is adjusted.
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+
+ %saved_stack = tail call i8* @llvm.stacksave()
+
+ %p1 = alloca %struct.S
+; We know the %esp offset from above, so there is no need to touch the stack
+; before adjusting it.
+; CHECK: subl $1024, %esp
+
+ %p2 = alloca %struct.T
+; The offset is now 2048 bytes, so allocating a T must touch the stack again.
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+
+ call void @f(%struct.S* %p0)
+; CHECK: calll
+
+ %p3 = alloca %struct.T
+; The call above touched the stack, so there is room for a T object.
+; CHECK: subl $3000, %esp
+
+ %p4 = alloca %struct.U
+; The U object is large enough to require stack probing.
+; CHECK: movl $10000, %eax
+; CHECK: calll __chkstk
+
+ %p5 = alloca %struct.T
+; The stack probing above touched the tip of the stack, so there's room for a T.
+; CHECK: subl $3000, %esp
+
+ call void @llvm.stackrestore(i8* %saved_stack)
+ %p6 = alloca %struct.S
+; The stack restore means we lose track of the stack pointer and must probe.
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+
+; Use the pointers so they're not optimized away.
+ call void @f(%struct.S* %p1)
+ call void @g(%struct.T* %p2)
+ call void @g(%struct.T* %p3)
+ call void @h(%struct.U* %p4)
+ call void @g(%struct.T* %p5)
+ ret void
+}
+
+define void @loop() {
+; CHECK-LABEL: loop:
+entry:
+ br label %bb1
+
+bb1:
+ %p1 = alloca %struct.S
+; The entry offset is unknown; touch-and-sub.
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+ br label %loop1
+
+loop1:
+ %i1 = phi i32 [ 10, %bb1 ], [ %dec1, %loop1 ]
+ %p2 = alloca %struct.S
+; We know the incoming offset from bb1, but from the back-edge, we assume the
+; worst, and therefore touch-and-sub to allocate.
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+ %dec1 = sub i32 %i1, 1
+ %cmp1 = icmp sgt i32 %i1, 0
+ br i1 %cmp1, label %loop1, label %end
+; CHECK: decl
+; CHECK: jg
+
+end:
+ call void @f(%struct.S* %p1)
+ call void @f(%struct.S* %p2)
+ ret void
+}
+
+define void @probe_size_attribute() "stack-probe-size"="512" {
+; CHECK-LABEL: probe_size_attribute:
+entry:
+ br label %bb1
+
+bb1:
+ %p0 = alloca %struct.S
+; The allocation would be small enough not to require probing, if it wasn't
+; for the stack-probe-size attribute.
+; CHECK: movl $1024, %eax
+; CHECK: calll __chkstk
+ call void @f(%struct.S* %p0)
+ ret void
+}
+
+define void @cfg(i1 %x, i1 %y) {
+; Test that the blocks are analyzed in the correct order.
+; CHECK-LABEL: cfg:
+entry:
+ br i1 %x, label %bb1, label %bb2
+
+bb1:
+ %p1 = alloca %struct.S
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+ br label %bb3
+bb2:
+ %p2 = alloca %struct.T
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+ br label %bb3
+
+bb3:
+ br i1 %y, label %bb4, label %bb5
+
+bb4:
+ %p4 = alloca %struct.S
+; CHECK: subl $1024, %esp
+ call void @f(%struct.S* %p4)
+ ret void
+
+bb5:
+ %p5 = alloca %struct.T
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+ call void @g(%struct.T* %p5)
+ ret void
+}
+
+
+declare void @f(%struct.S*)
+declare void @g(%struct.T*)
+declare void @h(%struct.U*)
+
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
diff --git a/test/CodeGen/X86/win-catchpad-csrs.ll b/test/CodeGen/X86/win-catchpad-csrs.ll
index 327ee45b4326..64c7a9747df9 100644
--- a/test/CodeGen/X86/win-catchpad-csrs.ll
+++ b/test/CodeGen/X86/win-catchpad-csrs.ll
@@ -51,7 +51,7 @@ handler1:
; X86: calll _getint
; X86: calll _useints
; X86: movl $0, -{{[0-9]+}}(%ebp)
-; X86: movl $1, (%esp)
+; X86: pushl $1
; X86: calll _f
; X86: [[contbb:LBB0_[0-9]+]]: # %try.cont
; X86: popl %esi
@@ -71,7 +71,7 @@ handler1:
; X86: subl $16, %esp
; X86: addl $12, %ebp
; X86: movl $1, -{{[0-9]+}}(%ebp)
-; X86: movl $2, (%esp)
+; X86: pushl $2
; X86: calll _f
; X86: movl $[[restorebb]], %eax
; X86-NEXT: addl $16, %esp
diff --git a/test/CodeGen/X86/win-catchpad-varargs.ll b/test/CodeGen/X86/win-catchpad-varargs.ll
index 6508f3bd7d64..a31b3d72c56c 100644
--- a/test/CodeGen/X86/win-catchpad-varargs.ll
+++ b/test/CodeGen/X86/win-catchpad-varargs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
+; RUN: llc -stack-symbol-ordering=0 -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
declare void @llvm.va_start(i8*)
declare void @llvm.va_end(i8*)
diff --git a/test/CodeGen/X86/win-catchpad.ll b/test/CodeGen/X86/win-catchpad.ll
index 836c53bda8e6..48866490c16c 100644
--- a/test/CodeGen/X86/win-catchpad.ll
+++ b/test/CodeGen/X86/win-catchpad.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
+; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
+; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
; Loosely based on IR for this C++ source code:
; void f(int p);
@@ -57,23 +57,23 @@ try.cont:
; X86: movl %esp, -[[sp_offset:[0-9]+]](%ebp)
; X86: movl $0, -{{[0-9]+}}(%ebp)
; X86: leal -[[local_offs:[0-9]+]](%ebp), %[[addr_reg:[a-z]+]]
-; X86-DAG: movl %[[addr_reg]], 4(%esp)
-; X86-DAG: movl $1, (%esp)
+; X86-DAG: pushl %[[addr_reg]]
+; X86-DAG: pushl $1
; X86: calll _f
; X86: [[contbb:LBB0_[0-9]+]]: # %try.cont
; X86: retl
-; X86: [[restorebb1:LBB0_[0-9]+]]: # Block address taken
-; X86-NEXT: # %handler1
-; X86-NEXT: addl $12, %ebp
-; X86: jmp [[contbb]]
-
; FIXME: These should be de-duplicated.
; X86: [[restorebb2:LBB0_[0-9]+]]: # Block address taken
; X86-NEXT: # %handler2
; X86-NEXT: addl $12, %ebp
; X86: jmp [[contbb]]
+; X86: [[restorebb1:LBB0_[0-9]+]]: # Block address taken
+; X86-NEXT: # %handler1
+; X86-NEXT: addl $12, %ebp
+; X86: jmp [[contbb]]
+
; X86: "?catch$[[catch1bb:[0-9]+]]@?0?try_catch_catch@4HA":
; X86: LBB0_[[catch1bb]]: # %handler1{{$}}
; X86: pushl %ebp
@@ -83,13 +83,14 @@ try.cont:
; X86-DAG: movl -32(%ebp), %[[e_reg:[a-z]+]]
; X86-DAG: leal -[[local_offs]](%ebp), %[[addr_reg:[a-z]+]]
; X86-DAG: movl $1, -{{[0-9]+}}(%ebp)
-; X86-DAG: movl %[[addr_reg]], 4(%esp)
-; X86-DAG: movl %[[e_reg]], (%esp)
+; X86: pushl %[[addr_reg]]
+; X86: pushl %[[e_reg]]
; X86: calll _f
-; X86-NEXT: movl $[[restorebb1]], %eax
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X86: addl $8, %esp
+; X86: movl $[[restorebb1]], %eax
+; X86: addl $8, %esp
+; X86: popl %ebp
+; X86: retl
; X86: "?catch$[[catch2bb:[0-9]+]]@?0?try_catch_catch@4HA":
; X86: LBB0_[[catch2bb]]: # %handler2{{$}}
@@ -99,13 +100,14 @@ try.cont:
; X86: movl %esp, -[[sp_offset]](%ebp)
; X86-DAG: leal -[[local_offs]](%ebp), %[[addr_reg:[a-z]+]]
; X86-DAG: movl $1, -{{[0-9]+}}(%ebp)
-; X86-DAG: movl %[[addr_reg]], 4(%esp)
-; X86-DAG: movl $3, (%esp)
+; X86: pushl %[[addr_reg]]
+; X86: pushl $3
; X86: calll _f
-; X86-NEXT: movl $[[restorebb2]], %eax
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X86: addl $8, %esp
+; X86: movl $[[restorebb2]], %eax
+; X86: addl $8, %esp
+; X86: popl %ebp
+; X86: retl
; X86: L__ehtable$try_catch_catch:
; X86: $handlerMap$0$try_catch_catch:
@@ -122,19 +124,19 @@ try.cont:
; X64: Lfunc_begin0:
; X64: pushq %rbp
; X64: .seh_pushreg 5
-; X64: subq $48, %rsp
-; X64: .seh_stackalloc 48
-; X64: leaq 48(%rsp), %rbp
-; X64: .seh_setframe 5, 48
+; X64: subq $[[STCK_ALLOC:.*]], %rsp
+; X64: .seh_stackalloc [[STCK_ALLOC]]
+; X64: leaq [[STCK_ALLOC]](%rsp), %rbp
+; X64: .seh_setframe 5, [[STCK_ALLOC]]
; X64: .seh_endprologue
-; X64: movq $-2, -8(%rbp)
+; X64: movq $-2, -16(%rbp)
; X64: .Ltmp0
; X64-DAG: leaq -[[local_offs:[0-9]+]](%rbp), %rdx
; X64-DAG: movl $1, %ecx
; X64: callq f
; X64: [[contbb:\.LBB0_[0-9]+]]: # Block address taken
; X64-NEXT: # %try.cont
-; X64: addq $48, %rsp
+; X64: addq $[[STCK_ALLOC]], %rsp
; X64: popq %rbp
; X64: retq
@@ -145,10 +147,10 @@ try.cont:
; X64: .seh_pushreg 5
; X64: subq $32, %rsp
; X64: .seh_stackalloc 32
-; X64: leaq 48(%rdx), %rbp
+; X64: leaq [[STCK_ALLOC]](%rdx), %rbp
; X64: .seh_endprologue
; X64-DAG: leaq -[[local_offs]](%rbp), %rdx
-; X64-DAG: movl -12(%rbp), %ecx
+; X64-DAG: movl -4(%rbp), %ecx
; X64: callq f
; X64: leaq [[contbb]](%rip), %rax
; X64-NEXT: addq $32, %rsp
@@ -162,7 +164,7 @@ try.cont:
; X64: .seh_pushreg 5
; X64: subq $32, %rsp
; X64: .seh_stackalloc 32
-; X64: leaq 48(%rdx), %rbp
+; X64: leaq [[STCK_ALLOC]](%rdx), %rbp
; X64: .seh_endprologue
; X64-DAG: leaq -[[local_offs]](%rbp), %rdx
; X64-DAG: movl $3, %ecx
@@ -180,7 +182,7 @@ try.cont:
; X64-NEXT: .long ($tryMap$try_catch_catch)@IMGREL
; X64-NEXT: .long 5
; X64-NEXT: .long ($ip2state$try_catch_catch)@IMGREL
-; X64-NEXT: .long 40
+; X64-NEXT: .long 48
; X64-NEXT: .long 0
; X64-NEXT: .long 1
@@ -194,7 +196,7 @@ try.cont:
; X64: $handlerMap$0$try_catch_catch:
; X64-NEXT: .long 0
; X64-NEXT: .long "??_R0H@8"@IMGREL
-; X64-NEXT: .long 36
+; X64-NEXT: .long 60
; X64-NEXT: .long "?catch$[[catch1bb]]@?0?try_catch_catch@4HA"@IMGREL
; X64-NEXT: .long 56
; X64-NEXT: .long 64
@@ -255,8 +257,8 @@ try.cont:
; X86: pushl %ebp
; X86: subl $8, %esp
; X86: addl $12, %ebp
-; X86: LBB1_[[loopbb:[0-9]+]]: # %loop
; X86: movl $1, -16(%ebp)
+; X86: LBB1_[[loopbb:[0-9]+]]: # %loop
; X86: calll _getbool
; X86: testb $1, %al
; X86: jne LBB1_[[loopbb]]
diff --git a/test/CodeGen/X86/win-cleanuppad.ll b/test/CodeGen/X86/win-cleanuppad.ll
index 4b0a543a876a..4b3af8c063bf 100644
--- a/test/CodeGen/X86/win-cleanuppad.ll
+++ b/test/CodeGen/X86/win-cleanuppad.ll
@@ -88,11 +88,11 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
}
; X86-LABEL: _nested_cleanup:
-; X86: movl $1, (%esp)
+; X86: pushl $1
; X86: calll _f
-; X86: movl $2, (%esp)
+; X86: pushl $2
; X86: calll _f
-; X86: movl $3, (%esp)
+; X86: pushl $3
; X86: calll _f
; X86: "?dtor$[[cleanup_inner:[0-9]+]]@?0?nested_cleanup@4HA":
@@ -163,7 +163,7 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
; X64: retq
; X64: .section .xdata,"dr"
-; X64-NEXT: .align 4
+; X64-NEXT: .p2align 2
; X64: $cppxdata$nested_cleanup:
; X64-NEXT: .long 429065506
; X64-NEXT: .long 2
diff --git a/test/CodeGen/X86/win32-eh-states.ll b/test/CodeGen/X86/win32-eh-states.ll
index 2777d6644e6a..634653dc2f97 100644
--- a/test/CodeGen/X86/win32-eh-states.ll
+++ b/test/CodeGen/X86/win32-eh-states.ll
@@ -68,19 +68,19 @@ catch.7:
; X86: movl $___ehhandler$f, {{.*}}
;
; X86: movl $0, [[state]](%ebp)
-; X86: movl $1, (%esp)
+; X86: pushl $1
; X86: calll _may_throw
;
; X86: movl $1, [[state]](%ebp)
-; X86: movl $2, (%esp)
+; X86: pushl $2
; X86: calll _may_throw
;
; X86: movl $2, [[state]](%ebp)
-; X86: movl $3, (%esp)
+; X86: pushl $3
; X86: calll _may_throw
;
; X86: movl $3, [[state]](%ebp)
-; X86: movl $4, (%esp)
+; X86: pushl $4
; X86: calll _may_throw
@@ -172,19 +172,19 @@ unreachable: ; preds = %entry
; X86: movl $___ehhandler$g, {{.*}}
;
; X86: movl $1, [[state]](%ebp)
-; X86: movl $-1, (%esp)
+; X86: pushl $-1
; X86: calll _may_throw
;
; X86: movl $2, [[state]](%ebp)
-; X86: movl $0, (%esp)
+; X86: pushl $0
; X86: calll _may_throw
;
; X86: movl $3, [[state]](%ebp)
-; X86: movl $1, (%esp)
+; X86: pushl $1
; X86: calll _may_throw
;
; X86: movl $2, [[state]](%ebp)
-; X86: movl $2, (%esp)
+; X86: pushl $2
; X86: calll _may_throw
; X64-LABEL: g:
diff --git a/test/CodeGen/X86/win32-eh.ll b/test/CodeGen/X86/win32-eh.ll
index 73c7b486a55a..88403c687403 100644
--- a/test/CodeGen/X86/win32-eh.ll
+++ b/test/CodeGen/X86/win32-eh.ll
@@ -88,12 +88,58 @@ catch:
; CHECK-LABEL: L__ehtable$use_except_handler4:
; CHECK-NEXT: .long -2
; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long 9999
+; CHECK-NEXT: .long -40
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long -2
; CHECK-NEXT: .long _catchall_filt
; CHECK-NEXT: .long LBB2_2
+define void @use_except_handler4_ssp() sspstrong personality i32 (...)* @_except_handler4 {
+entry:
+ invoke void @may_throw_or_crash()
+ to label %cont unwind label %lpad
+cont:
+ ret void
+lpad:
+ %cs = catchswitch within none [label %catch] unwind to caller
+catch:
+ %p = catchpad within %cs [i8* bitcast (i32 ()* @catchall_filt to i8*)]
+ catchret from %p to label %cont
+}
+
+; CHECK-LABEL: _use_except_handler4_ssp:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: subl ${{[0-9]+}}, %esp
+; CHECK: movl %ebp, %[[ehguard:[^ ,]*]]
+; CHECK: movl %esp, -36(%ebp)
+; CHECK: movl $-2, -16(%ebp)
+; CHECK: movl $L__ehtable$use_except_handler4_ssp, %[[lsda:[^ ,]*]]
+; CHECK: xorl ___security_cookie, %[[lsda]]
+; CHECK: movl %[[lsda]], -20(%ebp)
+; CHECK: xorl ___security_cookie, %[[ehguard]]
+; CHECK: movl %[[ehguard]], -40(%ebp)
+; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK: movl $__except_handler4, -24(%ebp)
+; CHECK: movl %fs:0, %[[next:[^ ,]*]]
+; CHECK: movl %[[next]], -28(%ebp)
+; CHECK: movl %[[node]], %fs:0
+; CHECK: calll _may_throw_or_crash
+; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
+; CHECK: movl %[[next]], %fs:0
+; CHECK: retl
+; CHECK: [[catch:[^ ,]*]]: # %catch{{$}}
+
+; CHECK: .section .xdata,"dr"
+; CHECK-LABEL: L__ehtable$use_except_handler4_ssp:
+; CHECK-NEXT: .long -2
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long -40
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long -2
+; CHECK-NEXT: .long _catchall_filt
+; CHECK-NEXT: .long [[catch]]
+
define void @use_CxxFrameHandler3() personality i32 (...)* @__CxxFrameHandler3 {
invoke void @may_throw_or_crash()
to label %cont unwind label %catchall
@@ -125,7 +171,7 @@ catch:
; CHECK: retl
; CHECK: .section .xdata,"dr"
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK-LABEL: L__ehtable$use_CxxFrameHandler3:
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 2
diff --git a/test/CodeGen/X86/win32-seh-catchpad-realign.ll b/test/CodeGen/X86/win32-seh-catchpad-realign.ll
index 23aeea37c117..1ba0c1a0efe1 100644
--- a/test/CodeGen/X86/win32-seh-catchpad-realign.ll
+++ b/test/CodeGen/X86/win32-seh-catchpad-realign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s
; The aligned alloca means that we have to realign the stack, which forces the
; use of ESI to address local variables.
diff --git a/test/CodeGen/X86/win32-seh-catchpad.ll b/test/CodeGen/X86/win32-seh-catchpad.ll
index 224e96f8b8f0..995a4b9eaf2e 100644
--- a/test/CodeGen/X86/win32-seh-catchpad.ll
+++ b/test/CodeGen/X86/win32-seh-catchpad.ll
@@ -32,16 +32,16 @@ invoke.cont: ; preds = %entry
; CHECK-LABEL: _try_except:
; Store state #0
; CHECK: movl $0, -[[state:[0-9]+]](%ebp)
-; CHECK: movl $1, (%esp)
+; CHECK: pushl $1
; CHECK: calll _f
; CHECK: movl $-1, -[[state]](%ebp)
-; CHECK: movl $3, (%esp)
+; CHECK: pushl $3
; CHECK: calll _f
; CHECK: retl
; __except
; CHECK: movl $-1, -[[state]](%ebp)
-; CHECK: movl $2, (%esp)
+; CHECK: pushl $2
; CHECK: calll _f
; CHECK: .section .xdata,"dr"
@@ -205,7 +205,7 @@ __except:
; CHECK-NEXT: movl -24(%ebp), %esp
; CHECK-NEXT: addl $12, %ebp
; CHECK-NEXT: movl $-1, -16(%ebp)
-; CHECK-NEXT: movl $2, (%esp)
+; CHECK-NEXT: pushl $2
; CHECK-NEXT: calll _f
diff --git a/test/CodeGen/X86/win32-seh-nested-finally.ll b/test/CodeGen/X86/win32-seh-nested-finally.ll
index c283a35d70cf..b732815b8475 100644
--- a/test/CodeGen/X86/win32-seh-nested-finally.ll
+++ b/test/CodeGen/X86/win32-seh-nested-finally.ll
@@ -43,31 +43,35 @@ attributes #3 = { noinline }
; CHECK: movl $-1, -[[state:[0-9]+]](%ebp)
; CHECK: movl {{.*}}, %fs:0
; CHECK: movl $1, -[[state]](%ebp)
-; CHECK: movl $1, (%esp)
+; CHECK: pushl $1
; CHECK: calll _f
+; CHECK: addl $4, %esp
; CHECK: movl $0, -[[state]](%ebp)
-; CHECK: movl $2, (%esp)
+; CHECK: pushl $2
; CHECK: calll _f
+; CHECK: addl $4, %esp
; CHECK: movl $-1, -[[state]](%ebp)
-; CHECK: movl $3, (%esp)
+; CHECK: pushl $3
; CHECK: calll _f
+; CHECK: addl $4, %esp
; CHECK: retl
; CHECK: LBB0_[[inner:[0-9]+]]: # %ehcleanup
; CHECK: pushl %ebp
; CHECK: addl $12, %ebp
-; CHECK: movl $0, -[[state]](%ebp)
-; CHECK: movl $2, (%esp)
+; CHECK: pushl $2
; CHECK: calll _f
+; CHECK: addl $4, %esp
+; CHECK: addl $4, %esp
; CHECK: popl %ebp
; CHECK: retl
; CHECK: LBB0_[[outer:[0-9]+]]: # %ehcleanup.3
; CHECK: pushl %ebp
; CHECK: addl $12, %ebp
-; CHECK: movl $-1, -[[state]](%ebp)
-; CHECK: movl $3, (%esp)
+; CHECK: pushl $3
; CHECK: calll _f
+; CHECK: addl $8, %esp
; CHECK: popl %ebp
; CHECK: retl
diff --git a/test/CodeGen/X86/win32_sret.ll b/test/CodeGen/X86/win32_sret.ll
index b38273ad9594..56008e15910e 100644
--- a/test/CodeGen/X86/win32_sret.ll
+++ b/test/CodeGen/X86/win32_sret.ll
@@ -135,12 +135,11 @@ entry:
; Load the address of the result and put it onto stack
-; (through %ecx in the -O0 build).
-; WIN32: leal {{[0-9]+}}(%esp), %e{{[a-d]}}x
-; WIN32: movl %e{{[a-d]}}x, (%e{{([a-d]x)|(sp)}})
-
; The this pointer goes to ECX.
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; (through %ecx in the -O0 build).
+; WIN32: leal {{[0-9]*}}(%esp), %e{{[a-d]}}x
+; WIN32: leal {{[0-9]*}}(%esp), %ecx
+; WIN32: {{pushl %e[a-d]x|movl %e[a-d]x, \(%esp\)}}
; WIN32-NEXT: calll "?foo@C5@@QAE?AUS5@@XZ"
; WIN32: retl
ret void
@@ -155,25 +154,21 @@ define void @test6_f(%struct.test6* %x) nounwind {
; LINUX-LABEL: test6_f:
; The %x argument is moved to %ecx. It will be the this pointer.
-; WIN32: movl 20(%esp), %ecx
-
-; The %x argument is moved to (%esp). It will be the this pointer. With -O0
-; we copy esp to ecx and use (ecx) instead of (esp).
-; MINGW_X86: movl 20(%esp), %eax
-; MINGW_X86: movl %eax, (%e{{([a-d]x)|(sp)}})
+; WIN32: movl {{16|20}}(%esp), %ecx
-; CYGWIN: movl 20(%esp), %eax
-; CYGWIN: movl %eax, (%e{{([a-d]x)|(sp)}})
; The sret pointer is (%esp)
-; WIN32: leal 4(%esp), %[[REG:e[a-d]x]]
-; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+; WIN32: leal {{4?}}(%esp), %eax
+; WIN32-NEXT: {{pushl %eax|movl %eax, \(%esp\)}}
; The sret pointer is %ecx
-; MINGW_X86-NEXT: leal 4(%esp), %ecx
+; The %x argument is moved to (%esp). It will be the this pointer.
+; MINGW_X86: leal {{4?}}(%esp), %ecx
+; MINGW_X86-NEXT: {{pushl 16\(%esp\)|movl %eax, \(%esp\)}}
; MINGW_X86-NEXT: calll _test6_g
-; CYGWIN-NEXT: leal 4(%esp), %ecx
+; CYGWIN: leal {{4?}}(%esp), %ecx
+; CYGWIN-NEXT: {{pushl 16\(%esp\)|movl %eax, \(%esp\)}}
; CYGWIN-NEXT: calll _test6_g
%tmp = alloca %struct.test6, align 4
@@ -191,17 +186,17 @@ define void @test7_f(%struct.test7* %x) nounwind {
; LINUX-LABEL: test7_f:
; The %x argument is moved to %ecx on all OSs. It will be the this pointer.
-; WIN32: movl 20(%esp), %ecx
-; MINGW_X86: movl 20(%esp), %ecx
-; CYGWIN: movl 20(%esp), %ecx
+; WIN32: movl {{16|20}}(%esp), %ecx
+; MINGW_X86: movl {{16|20}}(%esp), %ecx
+; CYGWIN: movl {{16|20}}(%esp), %ecx
; The sret pointer is (%esp)
-; WIN32: leal 4(%esp), %[[REG:e[a-d]x]]
-; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
-; MINGW_X86: leal 4(%esp), %[[REG:e[a-d]x]]
-; MINGW_X86-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
-; CYGWIN: leal 4(%esp), %[[REG:e[a-d]x]]
-; CYGWIN-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+; WIN32: leal {{4?}}(%esp), %eax
+; WIN32-NEXT: {{pushl %eax|movl %eax, \(%esp\)}}
+; MINGW_X86: leal {{4?}}(%esp), %eax
+; MINGW_X86-NEXT: {{pushl %eax|movl %eax, \(%esp\)}}
+; CYGWIN: leal {{4?}}(%esp), %eax
+; CYGWIN-NEXT: {{pushl %eax|movl %eax, \(%esp\)}}
%tmp = alloca %struct.test7, align 4
call x86_thiscallcc void @test7_g(%struct.test7* %x, %struct.test7* sret %tmp)
diff --git a/test/CodeGen/X86/win64_eh.ll b/test/CodeGen/X86/win64_eh.ll
index cb9d026bec2d..9421f00c8107 100644
--- a/test/CodeGen/X86/win64_eh.ll
+++ b/test/CodeGen/X86/win64_eh.ll
@@ -47,7 +47,6 @@ entry:
; WIN64: .seh_endproc
-; Checks stack push
define i32 @foo3(i32 %f_arg, i32 %e_arg, i32 %d_arg, i32 %c_arg, i32 %b_arg, i32 %a_arg) uwtable {
entry:
%a = alloca i32
@@ -83,14 +82,11 @@ entry:
}
; WIN64-LABEL: foo3:
; WIN64: .seh_proc foo3
-; WIN64: pushq %rsi
-; WIN64: .seh_pushreg 6
; NORM: subq $24, %rsp
; ATOM: leaq -24(%rsp), %rsp
; WIN64: .seh_stackalloc 24
; WIN64: .seh_endprologue
; WIN64: addq $24, %rsp
-; WIN64: popq %rsi
; WIN64: ret
; WIN64: .seh_endproc
diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll
index 77c37b4d348e..a674d8c080af 100644
--- a/test/CodeGen/X86/win_cst_pool.ll
+++ b/test/CodeGen/X86/win_cst_pool.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=sse2 -mattr=avx | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"
@@ -7,7 +7,7 @@ define double @double() {
}
; CHECK: .globl __real@0000000000800000
; CHECK-NEXT: .section .rdata,"dr",discard,__real@0000000000800000
-; CHECK-NEXT: .align 8
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: __real@0000000000800000:
; CHECK-NEXT: .quad 8388608
; CHECK: double:
@@ -19,7 +19,7 @@ define <4 x i32> @vec1() {
}
; CHECK: .globl __xmm@00000000000000010000000200000003
; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000000000000010000000200000003
-; CHECK-NEXT: .align 16
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: __xmm@00000000000000010000000200000003:
; CHECK-NEXT: .long 3
; CHECK-NEXT: .long 2
@@ -34,7 +34,7 @@ define <8 x i16> @vec2() {
}
; CHECK: .globl __xmm@00000001000200030004000500060007
; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000001000200030004000500060007
-; CHECK-NEXT: .align 16
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: __xmm@00000001000200030004000500060007:
; CHECK-NEXT: .short 7
; CHECK-NEXT: .short 6
@@ -54,7 +54,7 @@ define <4 x float> @undef1() {
; CHECK: .globl __xmm@00000000000000003f8000003f800000
; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000000000000003f8000003f800000
-; CHECK-NEXT: .align 16
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: __xmm@00000000000000003f8000003f800000:
; CHECK-NEXT: .long 1065353216 # float 1
; CHECK-NEXT: .long 1065353216 # float 1
@@ -73,7 +73,21 @@ define float @pr23966(i32 %a) {
; CHECK: .globl __real@bf8000003f800000
; CHECK-NEXT: .section .rdata,"dr",discard,__real@bf8000003f800000
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: __real@bf8000003f800000:
; CHECK-NEXT: .long 1065353216
; CHECK-NEXT: .long 3212836864
+
+define <4 x i64> @ymm() {
+entry:
+ ret <4 x i64> <i64 8589934593, i64 17179869187, i64 8589934593, i64 17179869187>
+}
+
+; CHECK: .globl __ymm@0000000400000003000000020000000100000004000000030000000200000001
+; CHECK: .section .rdata,"dr",discard,__ymm@0000000400000003000000020000000100000004000000030000000200000001
+; CHECK: .p2align 5
+; CHECK: __ymm@0000000400000003000000020000000100000004000000030000000200000001:
+; CHECK: .quad 8589934593 # 0x200000001
+; CHECK: .quad 17179869187 # 0x400000003
+; CHECK: .quad 8589934593 # 0x200000001
+; CHECK: .quad 17179869187
diff --git a/test/CodeGen/X86/x86-16.ll b/test/CodeGen/X86/x86-16.ll
new file mode 100644
index 000000000000..775b2c447bbd
--- /dev/null
+++ b/test/CodeGen/X86/x86-16.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-code16"
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval, align 4
+ ret i32 0
+}
+
+; CHECK: .code16
+; CHECK-LABEL: main
+
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 (trunk 265439) (llvm/trunk 265567)"} \ No newline at end of file
diff --git a/test/CodeGen/X86/x86-32-intrcc.ll b/test/CodeGen/X86/x86-32-intrcc.ll
index 99d0044c6de6..9794f2cb3e46 100644
--- a/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/test/CodeGen/X86/x86-32-intrcc.ll
@@ -3,7 +3,7 @@
%struct.interrupt_frame = type { i32, i32, i32, i32, i32 }
-@llvm.used = appending global [3 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_clobbers to i8*)], section "llvm.metadata"
+@llvm.used = appending global [4 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_clobbers to i8*), i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_x87 to i8*)], section "llvm.metadata"
; Spills eax, putting original esp at +4.
; No stack adjustment if declared with no error code
@@ -77,3 +77,19 @@ define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i32 %
ret void
}
+@f80 = common global x86_fp80 0xK00000000000000000000, align 4
+
+; Test that the presence of x87 does not crash the FP stackifier
+define x86_intrcc void @test_isr_x87(%struct.interrupt_frame* %frame) {
+ ; CHECK-LABEL: test_isr_x87
+ ; CHECK-DAG: fldt f80
+ ; CHECK-DAG: fld1
+ ; CHECK: faddp
+ ; CHECK-NEXT: fstpt f80
+ ; CHECK-NEXT: iretl
+entry:
+ %ld = load x86_fp80, x86_fp80* @f80, align 4
+ %add = fadd x86_fp80 %ld, 0xK3FFF8000000000000000
+ store x86_fp80 %add, x86_fp80* @f80, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/x86-32-vector-calling-conv.ll b/test/CodeGen/X86/x86-32-vector-calling-conv.ll
index b2bda7ab8d01..e87f2b065d3a 100644
--- a/test/CodeGen/X86/x86-32-vector-calling-conv.ll
+++ b/test/CodeGen/X86/x86-32-vector-calling-conv.ll
@@ -2,11 +2,11 @@
; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=LINUX
; CHECK-LABEL: test_sse:
-; DARWIN-DAG: vpaddd %xmm1, %xmm0, %xmm0
-; DARWIN-DAG: vpaddd %xmm3, %xmm2, %xmm1
+; DARWIN: vpaddd %xmm3, %xmm2, %xmm2
+; DARWIN: vpaddd %xmm2, %xmm1, %xmm1
; DARWIN: vpaddd %xmm1, %xmm0, %xmm0
-; LINUX-DAG: vpaddd %xmm1, %xmm0, %xmm0
-; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %xmm2, %xmm1
+; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %xmm2, %xmm2
+; LINUX: vpaddd %xmm2, %xmm1, %xmm1
; LINUX: vpaddd %xmm1, %xmm0, %xmm0
define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
%r0 = add <4 x i32> %a, %b
@@ -16,11 +16,11 @@ define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %
}
; CHECK-LABEL: test_avx:
-; DARWIN-DAG: vpaddd %ymm1, %ymm0, %ymm0
-; DARWIN-DAG: vpaddd %ymm3, %ymm2, %ymm1
+; DARWIN: vpaddd %ymm3, %ymm2, %ymm2
+; DARWIN: vpaddd %ymm2, %ymm1, %ymm1
; DARWIN: vpaddd %ymm1, %ymm0, %ymm0
-; LINUX-DAG: vpaddd %ymm1, %ymm0, %ymm0
-; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %ymm2, %ymm1
+; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %ymm2, %ymm2
+; LINUX: vpaddd %ymm2, %ymm1, %ymm1
; LINUX: vpaddd %ymm1, %ymm0, %ymm0
define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind {
%r0 = add <8 x i32> %a, %b
@@ -30,11 +30,11 @@ define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %
}
; CHECK-LABEL: test_avx512:
-; DARWIN-DAG: vpaddd %zmm1, %zmm0, %zmm0
-; DARWIN-DAG: vpaddd %zmm3, %zmm2, %zmm1
+; DARWIN: vpaddd %zmm3, %zmm2, %zmm2
+; DARWIN: vpaddd %zmm2, %zmm1, %zmm1
; DARWIN: vpaddd %zmm1, %zmm0, %zmm0
-; LINUX-DAG: vpaddd %zmm1, %zmm0, %zmm0
-; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %zmm2, %zmm1
+; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %zmm2, %zmm2
+; LINUX: vpaddd %zmm2, %zmm1, %zmm1
; LINUX: vpaddd %zmm1, %zmm0, %zmm0
define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind {
%r0 = add <16 x i32> %a, %b
diff --git a/test/CodeGen/X86/x86-64-flags-intrinsics.ll b/test/CodeGen/X86/x86-64-flags-intrinsics.ll
index 4c5032aedbca..2852ef49e0a5 100644
--- a/test/CodeGen/X86/x86-64-flags-intrinsics.ll
+++ b/test/CodeGen/X86/x86-64-flags-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
target triple = "x86_64-pc-win32"
declare i64 @llvm.x86.flags.read.u64()
diff --git a/test/CodeGen/X86/x86-64-intrcc.ll b/test/CodeGen/X86/x86-64-intrcc.ll
index 429209c063ca..2bcf3cde478a 100644
--- a/test/CodeGen/X86/x86-64-intrcc.ll
+++ b/test/CodeGen/X86/x86-64-intrcc.ll
@@ -3,7 +3,7 @@
%struct.interrupt_frame = type { i64, i64, i64, i64, i64 }
-@llvm.used = appending global [3 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_clobbers to i8*)], section "llvm.metadata"
+@llvm.used = appending global [4 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_clobbers to i8*), i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_x87 to i8*)], section "llvm.metadata"
; Spills rax, putting original esp at +8.
; No stack adjustment if declared with no error code
@@ -83,4 +83,21 @@ define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i64 %
; CHECK0-SSE-NEXT: addq $8, %rsp
; CHECK0-SSE-NEXT: iretq
ret void
-}
\ No newline at end of file
+}
+
+@f80 = common global x86_fp80 0xK00000000000000000000, align 4
+
+; Test that the presence of x87 does not crash the FP stackifier
+define x86_intrcc void @test_isr_x87(%struct.interrupt_frame* %frame) {
+ ; CHECK-LABEL: test_isr_x87
+ ; CHECK-DAG: fldt f80
+ ; CHECK-DAG: fld1
+ ; CHECK: faddp
+ ; CHECK-NEXT: fstpt f80
+ ; CHECK-NEXT: iretq
+entry:
+ %ld = load x86_fp80, x86_fp80* @f80, align 4
+ %add = fadd x86_fp80 %ld, 0xK3FFF8000000000000000
+ store x86_fp80 %add, x86_fp80* @f80, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/x86-64-pic.ll b/test/CodeGen/X86/x86-64-pic.ll
new file mode 100644
index 000000000000..76ed8894b417
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-pic.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic | FileCheck %s
+
+@g1 = private global i8 1
+define i8* @get_g1() {
+; CHECK: get_g1:
+; CHECK: leaq .Lg1(%rip), %rax
+ ret i8* @g1
+}
diff --git a/test/CodeGen/X86/x86-64-plt-relative-reloc.ll b/test/CodeGen/X86/x86-64-plt-relative-reloc.ll
new file mode 100644
index 000000000000..8ba480d1e1d6
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-plt-relative-reloc.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=x86_64-unknown-linux -o - %s | FileCheck %s
+
+@vtable = constant [5 x i32] [i32 0,
+ i32 trunc (i64 sub (i64 ptrtoint (void ()* @fn1 to i64), i64 ptrtoint (i32* getelementptr ([5 x i32], [5 x i32]* @vtable, i32 0, i32 1) to i64)) to i32),
+ i32 trunc (i64 sub (i64 ptrtoint (void ()* @fn2 to i64), i64 ptrtoint (i32* getelementptr ([5 x i32], [5 x i32]* @vtable, i32 0, i32 1) to i64)) to i32),
+ i32 trunc (i64 sub (i64 ptrtoint (void ()* @fn3 to i64), i64 ptrtoint (i32* getelementptr ([5 x i32], [5 x i32]* @vtable, i32 0, i32 1) to i64)) to i32),
+ i32 trunc (i64 sub (i64 ptrtoint (i8* @global4 to i64), i64 ptrtoint (i32* getelementptr ([5 x i32], [5 x i32]* @vtable, i32 0, i32 1) to i64)) to i32)
+]
+
+declare void @fn1() unnamed_addr
+declare void @fn2() unnamed_addr
+declare void @fn3()
+@global4 = external unnamed_addr global i8
+
+; CHECK: .long 0
+; CHECK-NEXT: .long (fn1@PLT-vtable)-4
+; CHECK-NEXT: .long (fn2@PLT-vtable)-4
+; CHECK-NEXT: .long (fn3-vtable)-4
+; CHECK-NEXT: .long (global4-vtable)-4
diff --git a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
index c476ffd84053..b1f4ca562236 100644
--- a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
+++ b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
-; RUN: llc -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
; x32 uses %esp, %ebp as stack and frame pointers
diff --git a/test/CodeGen/X86/x86-big-ret.ll b/test/CodeGen/X86/x86-big-ret.ll
new file mode 100644
index 000000000000..b7fed33f396b
--- /dev/null
+++ b/test/CodeGen/X86/x86-big-ret.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc"
+
+define x86_fastcallcc i32 @test1(i32 inreg %V, [65533 x i8]* byval %p_arg) {
+ ret i32 %V
+}
+; CHECK-LABEL: @test1@65540:
+; CHECK: movl %ecx, %eax
+; CHECK-NEXT: popl %ecx
+; CHECK-NEXT: addl $65536, %esp
+; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: retl
+
+define x86_stdcallcc void @test2([65533 x i8]* byval %p_arg) {
+ ret void
+}
+; CHECK-LABEL: _test2@65536:
+; CHECK: popl %ecx
+; CHECK-NEXT: addl $65536, %esp
+; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/x86-flags-intrinsics.ll b/test/CodeGen/X86/x86-flags-intrinsics.ll
index 325de7d5f1e7..e2233aec22c7 100644
--- a/test/CodeGen/X86/x86-flags-intrinsics.ll
+++ b/test/CodeGen/X86/x86-flags-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
target triple = "i686-pc-win32"
declare i32 @llvm.x86.flags.read.u32()
diff --git a/test/CodeGen/X86/x86-interrupt_cc.ll b/test/CodeGen/X86/x86-interrupt_cc.ll
new file mode 100644
index 000000000000..b91b8fbfb76d
--- /dev/null
+++ b/test/CodeGen/X86/x86-interrupt_cc.ll
@@ -0,0 +1,33 @@
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx -show-mc-encoding -mattr=+avx512f < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK64
+; RUN: llc -verify-machineinstrs -mtriple=i386-apple-macosx -show-mc-encoding -mattr=+avx512f < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK32
+
+; Make sure we spill the high-numbered ZMM registers and K registers with the right encoding.
+; CHECK-LABEL: foo
+; CHECK: kmovq %k7, {{.+}}
+; CHECK64: encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x08,0x00,0x00]
+; CHECK32: encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x02,0x00,0x00]
+; k6 is used as an anchor for the previous regexp.
+; CHECK-NEXT: kmovq %k6
+
+; CHECK64: movups %zmm31, {{.+}}
+; CHECK64: encoding: [0x62,0x61,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x07,0x00,0x00]
+; zmm30 is used as an anchor for the previous regexp.
+; CHECK64-NEXT: movups %zmm30
+
+; CHECK32-NOT: zmm31
+; CHECK32-NOT: zmm8
+; CHECK32: movups %zmm7, {{.+}}
+; CHECK32: encoding: [0x62,0xf1,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x01,0x00,0x00]
+; zmm6 is used as an anchor for the previous regexp.
+; CHECK32-NEXT: movups %zmm6
+
+; CHECK: call
+; CHECK: iret
+
+define x86_intrcc void @foo(i8* %frame) {
+ call void @bar()
+ ret void
+}
+
+declare void @bar()
+
diff --git a/test/CodeGen/X86/x86-interrupt_cld.ll b/test/CodeGen/X86/x86-interrupt_cld.ll
new file mode 100644
index 000000000000..bbb109eb633e
--- /dev/null
+++ b/test/CodeGen/X86/x86-interrupt_cld.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Checks that interrupt handler code calls cld before calling an external
+;; function.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK: cld
+; CHECK: call
+
+define x86_intrcc void @foo(i8* %frame) {
+ call void @bar()
+ ret void
+}
+
+declare void @bar()
+
diff --git a/test/CodeGen/X86/x86-interrupt_vzeroupper.ll b/test/CodeGen/X86/x86-interrupt_vzeroupper.ll
new file mode 100644
index 000000000000..b735ae82bd52
--- /dev/null
+++ b/test/CodeGen/X86/x86-interrupt_vzeroupper.ll
@@ -0,0 +1,19 @@
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Checks that interrupt handler code does not emit the "vzeroupper" instruction
+;; before iret.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK: vzeroupper
+; CHECK-NEXT: call
+; CHECK-NOT: vzeroupper
+; CHECK: iret
+
+define x86_intrcc void @foo(i8* %frame) {
+ call void @bar()
+ ret void
+}
+
+declare void @bar()
+
diff --git a/test/CodeGen/X86/x86-plt-relative-reloc.ll b/test/CodeGen/X86/x86-plt-relative-reloc.ll
new file mode 100644
index 000000000000..733a4cb5f034
--- /dev/null
+++ b/test/CodeGen/X86/x86-plt-relative-reloc.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=i686-unknown-linux -o - %s | FileCheck %s
+
+@vtable = constant [4 x i32] [i32 0,
+ i32 sub (i32 ptrtoint (void ()* @fn1 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32)),
+ i32 sub (i32 ptrtoint (void ()* @fn2 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32)),
+ i32 sub (i32 ptrtoint (void ()* @fn3 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32))
+]
+
+declare void @fn1() unnamed_addr
+declare void @fn2() unnamed_addr
+declare void @fn3()
+
+; CHECK: .long 0
+; CHECK-NEXT: .long (fn1@PLT-vtable)-4
+; CHECK-NEXT: .long (fn2@PLT-vtable)-4
+; CHECK-NEXT: .long (fn3-vtable)-4
diff --git a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
index eb87f7101d7c..2899e38b71cd 100644
--- a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | FileCheck %s --check-prefix=CHECK
+; RUN: llc %s -o - | FileCheck %s
;
; Note: This test cannot be merged with the shrink-wrapping tests
; because the booleans set on the command line take precedence on
@@ -185,7 +185,7 @@ attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
; CHECK-NEXT: je [[STRINGS_EQUAL]]
;
; CHECK: [[STRINGS_EQUAL]]
-; CHECK-NEXT: popq
+; CHECK: popq
define zeroext i1 @segmentedStack(i8* readonly %vk1, i8* readonly %vk2, i64 %key_size) #5 {
entry:
%cmp.i = icmp eq i8* %vk1, null
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 609e2cc1158c..5b6e773fe5d4 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -811,8 +811,6 @@ end:
;
; Load the value of b.
; CHECK: movb _b(%rip), [[BOOL:%cl]]
-; Extract i1 from the loaded value.
-; CHECK-NEXT: andb $1, [[BOOL]]
; Create the zero value for the select assignment.
; CHECK-NEXT: xorl [[CMOVE_VAL:%eax]], [[CMOVE_VAL]]
; CHECK-NEXT: testb [[BOOL]], [[BOOL]]
diff --git a/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll b/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
index d885f1cd364f..d3a12862a9e4 100644
--- a/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
+++ b/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
@@ -1,41 +1,44 @@
-; RUN: llc -mattr=+avx < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mattr=+avx | FileCheck %s
; Check that we properly upgrade the AVX vbroadcast intrinsics to IR. The
; expectation is that we should still get the original instruction back that
; maps to the intrinsic.
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.9.0"
-; CHECK-LABEL: test_mm_broadcast_ss:
define <4 x float> @test_mm_broadcast_ss(float* readonly %__a){
+; CHECK-LABEL: test_mm_broadcast_ss:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
+; CHECK-NEXT: retq
entry:
%0 = bitcast float* %__a to i8*
-; CHECK: vbroadcastss (%{{.*}}), %xmm
%1 = tail call <4 x float> @llvm.x86.avx.vbroadcast.ss(i8* %0)
ret <4 x float> %1
}
+declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*)
-; CHECK-LABEL: test_mm256_broadcast_sd:
define <4 x double> @test_mm256_broadcast_sd(double* readonly %__a) {
+; CHECK-LABEL: test_mm256_broadcast_sd:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
+; CHECK-NEXT: retq
entry:
%0 = bitcast double* %__a to i8*
-; CHECK: vbroadcastsd (%{{.*}}), %ymm
%1 = tail call <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8* %0)
ret <4 x double> %1
}
+declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*)
-; CHECK-LABEL: test_mm256_broadcast_ss:
define <8 x float> @test_mm256_broadcast_ss(float* readonly %__a) {
+; CHECK-LABEL: test_mm256_broadcast_ss:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
+; CHECK-NEXT: retq
entry:
%0 = bitcast float* %__a to i8*
-; CHECK: vbroadcastss (%{{.*}}), %ymm
%1 = tail call <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8* %0)
ret <8 x float> %1
}
-
-declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*)
-
-declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*)
-
declare <4 x float> @llvm.x86.avx.vbroadcast.ss(i8*)
diff --git a/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll b/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
index d4813ea47a3d..8e081b9e4100 100644
--- a/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
@@ -1,13 +1,17 @@
-; RUN: llc -mattr=+avx2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx2 | FileCheck %s
; Check that we properly upgrade the AVX2 vbroadcast intrinsic to IR.
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.10.0"
define <4 x i64> @broadcast128(<2 x i64> %src) {
- ; CHECK-LABEL: broadcast128
- ; CHECK: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-LABEL: broadcast128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = alloca <2 x i64>, align 16
%2 = bitcast <2 x i64>* %1 to i8*
store <2 x i64> %src, <2 x i64>* %1, align 16
diff --git a/test/CodeGen/X86/x87.ll b/test/CodeGen/X86/x87.ll
new file mode 100644
index 000000000000..683d7b05cf8c
--- /dev/null
+++ b/test/CodeGen/X86/x87.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X87
+; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=X87
+; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+
+define void @test(i32 %i, i64 %l, float* %pf, double* %pd, fp128* %pld) nounwind readnone {
+; X87-LABEL: test:
+; NOX87-LABEL: test:
+; X87: fild
+; NOX87: __floatunsisf
+ %tmp = uitofp i32 %i to float
+
+; X87: fild
+; NOX87: __floatdisf
+ %tmp1 = sitofp i64 %l to float
+
+; X87: fadd
+; NOX87: __addsf3
+ %tmp2 = fadd float %tmp, %tmp1
+
+; X87: fstp
+ store float %tmp2, float* %pf
+
+; X87: fild
+; NOX87: __floatunsidf
+ %tmp3 = uitofp i32 %i to double
+
+; X87: fild
+; NOX87: __floatdidf
+ %tmp4 = sitofp i64 %l to double
+
+; X87: fadd
+; NOX87: __adddf3
+ %tmp5 = fadd double %tmp3, %tmp4
+
+; X87: fstp
+ store double %tmp5, double* %pd
+
+; X87: __floatsitf
+; NOX87: __floatsitf
+ %tmp6 = sitofp i32 %i to fp128
+
+; X87: __floatunditf
+; NOX87: __floatunditf
+ %tmp7 = uitofp i64 %l to fp128
+
+; X87: __addtf3
+; NOX87: __addtf3
+ %tmp8 = fadd fp128 %tmp6, %tmp7
+ store fp128 %tmp8, fp128* %pld
+
+ ret void
+}
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index 7c4b60d264c9..eb0fd8649868 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
-
+; RUN: llc -mtriple=x86_64-darwin-unknown -mcpu=knl < %s | FileCheck %s --check-prefix=KNL
;
; Get the actual value of the overflow bit.
;
@@ -295,7 +295,7 @@ entry:
define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) {
entry:
; CHECK-LABEL: smulo.i8
-; CHECK: movb %dil, %al
+; CHECK: movl %edi, %eax
; CHECK-NEXT: imulb %sil
; CHECK-NEXT: seto %cl
%t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -345,7 +345,7 @@ entry:
define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) {
entry:
; CHECK-LABEL: umulo.i8
-; CHECK: movb %dil, %al
+; CHECK: movl %edi, %eax
; CHECK-NEXT: mulb %sil
; CHECK-NEXT: seto %cl
%t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -734,6 +734,26 @@ continue:
ret i1 true
}
+define i1 @bug27873(i64 %c1, i1 %c2) {
+; KNL-LABEL: bug27873:
+; KNL: ## BB#0:
+; KNL-NEXT: andl $1, %esi
+; KNL-NEXT: kmovw %esi, %k0
+; KNL-NEXT: movl $160, %ecx
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: mulq %rcx
+; KNL-NEXT: seto %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: # kill
+; KNL-NEXT: retq
+ %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
+ %mul.overflow = extractvalue { i64, i1 } %mul, 1
+ %x1 = or i1 %c2, %mul.overflow
+ ret i1 %x1
+}
+
declare {i8, i1} @llvm.sadd.with.overflow.i8 (i8, i8 ) nounwind readnone
declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
index 825efa6361b5..76a7e72ca961 100644
--- a/test/CodeGen/X86/xmulo.ll
+++ b/test/CodeGen/X86/xmulo.ll
@@ -9,9 +9,9 @@ declare i32 @printf(i8*, ...)
define i32 @t1() nounwind {
; CHECK-LABEL: t1:
-; CHECK: movl $0, 12(%esp)
-; CHECK: movl $0, 8(%esp)
-; CHECK: movl $72, 4(%esp)
+; CHECK: pushl $0
+; CHECK: pushl $0
+; CHECK: pushl $72
%1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8)
%2 = extractvalue {i64, i1} %1, 0
@@ -23,9 +23,9 @@ define i32 @t1() nounwind {
define i32 @t2() nounwind {
; CHECK-LABEL: t2:
-; CHECK: movl $0, 12(%esp)
-; CHECK: movl $0, 8(%esp)
-; CHECK: movl $0, 4(%esp)
+; CHECK: pushl $0
+; CHECK: pushl $0
+; CHECK: pushl $0
%1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0)
%2 = extractvalue {i64, i1} %1, 0
@@ -37,9 +37,9 @@ define i32 @t2() nounwind {
define i32 @t3() nounwind {
; CHECK-LABEL: t3:
-; CHECK: movl $1, 12(%esp)
-; CHECK: movl $-1, 8(%esp)
-; CHECK: movl $-9, 4(%esp)
+; CHECK: pushl $1
+; CHECK: pushl $-1
+; CHECK: pushl $-9
%1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1)
%2 = extractvalue {i64, i1} %1, 0
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..a9287e7d8c91
--- /dev/null
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -0,0 +1,1111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/xop-builtins.c
+
+define <2 x i64> @test_mm_maccs_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_macc_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_macc_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_macc_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_maccsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccsd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccsd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %arg0, <8 x i16> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_maccd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %arg0, <8 x i16> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_maccs_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccs_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccs_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_macc_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_macc_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_macc_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_maccslo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccslo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccslo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_macclo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_macclo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_macclo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_maccshi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccshi_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccshi_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_macchi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_macchi_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_macchi_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_maddsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maddsd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maddsd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %arg0, <8 x i16> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_maddd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maddd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maddd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %arg0, <8 x i16> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_haddw_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddw_epi8:
+; X32: # BB#0:
+; X32-NEXT: vphaddbw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddw_epi8:
+; X64: # BB#0:
+; X64-NEXT: vphaddbw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %arg0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddd_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddd_epi8:
+; X32: # BB#0:
+; X32-NEXT: vphaddbd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddd_epi8:
+; X64: # BB#0:
+; X64-NEXT: vphaddbd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epi8:
+; X32: # BB#0:
+; X32-NEXT: vphaddbq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epi8:
+; X64: # BB#0:
+; X64-NEXT: vphaddbq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddd_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphaddwd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphaddwd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphaddwq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphaddwq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epi32:
+; X32: # BB#0:
+; X32-NEXT: vphadddq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epi32:
+; X64: # BB#0:
+; X64-NEXT: vphadddq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_haddw_epu8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddw_epu8:
+; X32: # BB#0:
+; X32-NEXT: vphaddubw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddw_epu8:
+; X64: # BB#0:
+; X64-NEXT: vphaddubw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %arg0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddd_epu8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddd_epu8:
+; X32: # BB#0:
+; X32-NEXT: vphaddubd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddd_epu8:
+; X64: # BB#0:
+; X64-NEXT: vphaddubd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epu8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epu8:
+; X32: # BB#0:
+; X32-NEXT: vphaddubq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epu8:
+; X64: # BB#0:
+; X64-NEXT: vphaddubq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddd_epu16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddd_epu16:
+; X32: # BB#0:
+; X32-NEXT: vphadduwd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddd_epu16:
+; X64: # BB#0:
+; X64-NEXT: vphadduwd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_mm_haddq_epu16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epu16:
+; X32: # BB#0:
+; X32-NEXT: vphadduwq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epu16:
+; X64: # BB#0:
+; X64-NEXT: vphadduwq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epu32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epu32:
+; X32: # BB#0:
+; X32-NEXT: vphaddudq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epu32:
+; X64: # BB#0:
+; X64-NEXT: vphaddudq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_hsubw_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_hsubw_epi8:
+; X32: # BB#0:
+; X32-NEXT: vphsubbw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_hsubw_epi8:
+; X64: # BB#0:
+; X64-NEXT: vphsubbw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %arg0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_hsubd_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_hsubd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphsubwd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_hsubd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphsubwd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_hsubq_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_hsubq_epi32:
+; X32: # BB#0:
+; X32-NEXT: vphsubdq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_hsubq_epi32:
+; X64: # BB#0:
+; X64-NEXT: vphsubdq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_cmov_si128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_cmov_si128:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; X32-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; X32-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpand %xmm3, %xmm1, %xmm1
+; X32-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmov_si128:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; X64-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpand %xmm3, %xmm1, %xmm1
+; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_cmov_si256:
+; X32: # BB#0:
+; X32-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmov_si256:
+; X64: # BB#0:
+; X64-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_perm_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_perm_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_perm_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_rot_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_rot_epi8:
+; X32: # BB#0:
+; X32-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rot_epi8:
+; X64: # BB#0:
+; X64-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_rot_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_rot_epi16:
+; X32: # BB#0:
+; X32-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rot_epi16:
+; X64: # BB#0:
+; X64-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_rot_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_rot_epi32:
+; X32: # BB#0:
+; X32-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rot_epi32:
+; X64: # BB#0:
+; X64-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_rot_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_rot_epi64:
+; X32: # BB#0:
+; X32-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rot_epi64:
+; X64: # BB#0:
+; X64-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_roti_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_roti_epi8:
+; X32: # BB#0:
+; X32-NEXT: vprotb $1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_roti_epi8:
+; X64: # BB#0:
+; X64-NEXT: vprotb $1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %arg0, i8 1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_roti_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_roti_epi16:
+; X32: # BB#0:
+; X32-NEXT: vprotw $50, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_roti_epi16:
+; X64: # BB#0:
+; X64-NEXT: vprotw $50, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %arg0, i8 50)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_roti_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_roti_epi32:
+; X32: # BB#0:
+; X32-NEXT: vprotd $226, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_roti_epi32:
+; X64: # BB#0:
+; X64-NEXT: vprotd $226, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %arg0, i8 -30)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_roti_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_roti_epi64:
+; X32: # BB#0:
+; X32-NEXT: vprotq $100, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_roti_epi64:
+; X64: # BB#0:
+; X64-NEXT: vprotq $100, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 100)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_shl_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_shl_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shl_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_shl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_shl_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shl_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_shl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_shl_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shl_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_shl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_shl_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpshlq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shl_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpshlq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_sha_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sha_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sha_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_sha_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sha_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sha_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_sha_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sha_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sha_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_sha_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sha_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sha_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_com_epu8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %arg0, <16 x i8> %arg1, i8 0)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %arg0, <8 x i16> %arg1, i8 0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %arg0, <4 x i32> %arg1, i8 0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epu64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epu64:
+; X32: # BB#0:
+; X32-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epu64:
+; X64: # BB#0:
+; X64-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %arg0, <16 x i8> %arg1, i8 0)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %arg0, <8 x i16> %arg1, i8 0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %arg0, <4 x i32> %arg1, i8 0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <2 x double> @test_mm_permute2_pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_permute2_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permute2_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
+
+define <4 x double> @test_mm256_permute2_pd(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_permute2_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
+
+define <4 x float> @test_mm_permute2_ps(<4 x float> %a0, <4 x float> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_permute2_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permute2_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %arg2, i8 0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
+
+define <8 x float> @test_mm256_permute2_ps(<8 x float> %a0, <8 x float> %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_permute2_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %arg2, i8 0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
+
+define <4 x float> @test_mm_frcz_ss(<4 x float> %a0) {
+; X32-LABEL: test_mm_frcz_ss:
+; X32: # BB#0:
+; X32-NEXT: vfrczss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_frcz_ss:
+; X64: # BB#0:
+; X64-NEXT: vfrczss %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_frcz_sd(<2 x double> %a0) {
+; X32-LABEL: test_mm_frcz_sd:
+; X32: # BB#0:
+; X32-NEXT: vfrczsd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_frcz_sd:
+; X64: # BB#0:
+; X64-NEXT: vfrczsd %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_frcz_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_frcz_ps:
+; X32: # BB#0:
+; X32-NEXT: vfrczps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_frcz_ps:
+; X64: # BB#0:
+; X64-NEXT: vfrczps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_frcz_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_frcz_pd:
+; X32: # BB#0:
+; X32-NEXT: vfrczpd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_frcz_pd:
+; X64: # BB#0:
+; X64-NEXT: vfrczpd %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_frcz_ps(<8 x float> %a0) {
+; X32-LABEL: test_mm256_frcz_ps:
+; X32: # BB#0:
+; X32-NEXT: vfrczps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_frcz_ps:
+; X64: # BB#0:
+; X64-NEXT: vfrczps %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_frcz_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_frcz_pd:
+; X32: # BB#0:
+; X32-NEXT: vfrczpd %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_frcz_pd:
+; X64: # BB#0:
+; X64-NEXT: vfrczpd %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double>) nounwind readnone
+
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
new file mode 100644
index 000000000000..6fba72f2681b
--- /dev/null
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
@@ -0,0 +1,727 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s
+
+define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1]
+ ret <2 x double> %res
+}
+define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %a1
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ; [#uses=1]
+ ret <2 x double> %res
+}
+define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %a2
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ; [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
+ ret <4 x double> %res
+}
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %a1
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ;
+ ret <4 x double> %res
+}
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %a2
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ;
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_int_x86_xop_vpcomeqb_mem(<16 x i8> %a0, <16 x i8>* %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqb_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqb (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %a1
+ %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %vec) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomequb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomequb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomequb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomequd(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomequd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomequd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomequq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomequq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomequq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomequw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomequw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomequw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalsed:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalsed %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalsew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalsew %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomged(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomged:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomged %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomgew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgew %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomleb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomled(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomled:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomled %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomleq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomleub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomleud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomlew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomlew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomlew %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomltb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomltd(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomltq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomltub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomltud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomltw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomneb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomneqb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomned(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomned:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomneqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomneq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomneqq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomneub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomnequb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomneud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomnequd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomnequq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomnequw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomnew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomnew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomneqw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrued:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrued %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtruew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtruew %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index 3b4c6ea12107..bb6ef50cdc6c 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -1,649 +1,263 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4,+xop | FileCheck %s
-
-define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: vpermil2pd
- %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s
+
+define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 1) ; [#uses=1]
ret <2 x double> %res
}
-define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpermil2pd
+define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <2 x double>, <2 x double>* %a1
- %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ; [#uses=1]
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x i64> %a2, i8 1) ; [#uses=1]
ret <2 x double> %res
}
-define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpermil2pd
- %vec = load <2 x double>, <2 x double>* %a2
- %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ; [#uses=1]
+define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %a2
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %vec, i8 1) ; [#uses=1]
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
-define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
- ; CHECK: vpermil2pd
- ; CHECK: ymm
- %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
+define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 2) ;
ret <4 x double> %res
}
-define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpermil2pd
- ; CHECK: ymm
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%vec = load <4 x double>, <4 x double>* %a1
- %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ;
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x i64> %a2, i8 2) ;
ret <4 x double> %res
}
-define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpermil2pd
- ; CHECK: ymm
- %vec = load <4 x double>, <4 x double>* %a2
- %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ;
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x i64>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %a2
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %vec, i8 2) ;
ret <4 x double> %res
}
-declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
-define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: vpermil2ps
- %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
+define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 3) ;
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
-define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
- ; CHECK: vpermil2ps
- ; CHECK: ymm
- %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
+define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 4) ;
ret <8 x float> %res
}
-declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
- ; CHECK: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_int_x86_xop_vpcmov:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
- ; CHECK: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
ret <4 x i64> %res
}
define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpcmov
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a1
%res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %vec, <4 x i64> %a2) ;
ret <4 x i64> %res
}
define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpcmov
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a2
%res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %vec) ;
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
-define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK:vpcomeqb
- %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-define <16 x i8> @test_int_x86_xop_vpcomeqb_mem(<16 x i8> %a0, <16 x i8>* %a1) {
- ; CHECK-NOT: vmovaps
- ; CHECK:vpcomeqb
- %vec = load <16 x i8>, <16 x i8>* %a1
- %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %vec) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomeqw
- %res = call <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomeqd
- %res = call <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomeqq
- %res = call <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomequb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomequb
- %res = call <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomequd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomequd
- %res = call <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomequq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomequq
- %res = call <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomequw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomequw
- %res = call <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomfalseb
- %res = call <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomfalsed
- %res = call <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomfalseq
- %res = call <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomfalseub
- %res = call <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomfalseud
- %res = call <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomfalseuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomfalseuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomfalsew
- %res = call <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomgeb
- %res = call <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomged(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomged
- %res = call <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomgeq
- %res = call <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomgeub
- %res = call <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomgeud
- %res = call <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomgeuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomgeuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomgew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomgew
- %res = call <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomgtb
- %res = call <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomgtd
- %res = call <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomgtq
- %res = call <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomgtub
- %res = call <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomgtud
- %res = call <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomgtuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomgtuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomgtw
- %res = call <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomleb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomleb
- %res = call <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomled(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomled
- %res = call <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomleq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomleq
- %res = call <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomleub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomleub
- %res = call <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomleud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomleud
- %res = call <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomleuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomleuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomlew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomlew
- %res = call <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomltb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomltb
- %res = call <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomltd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomltd
- %res = call <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomltq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomltq
- %res = call <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomltub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomltub
- %res = call <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomltud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomltud
- %res = call <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomltuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomltuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomltw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomltw
- %res = call <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomneb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomneqb
- %res = call <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomned(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomneqd
- %res = call <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomneq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomneqq
- %res = call <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomneub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomnequb
- %res = call <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomneud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomnequd
- %res = call <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomnequq
- %res = call <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomnequw
- %res = call <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomnew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomneqw
- %res = call <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomtrueb
- %res = call <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomtrued
- %res = call <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomtrueq
- %res = call <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomtrueub
- %res = call <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomtrueud
- %res = call <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomtrueuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomtrueuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomtruew
- %res = call <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readnone
-
define <4 x i32> @test_int_x86_xop_vphaddbd(<16 x i8> %a0) {
- ; CHECK: vphaddbd
+; CHECK-LABEL: test_int_x86_xop_vphaddbd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddbd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddbq(<16 x i8> %a0) {
- ; CHECK: vphaddbq
+; CHECK-LABEL: test_int_x86_xop_vphaddbq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddbq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphaddbw(<16 x i8> %a0) {
- ; CHECK: vphaddbw
+; CHECK-LABEL: test_int_x86_xop_vphaddbw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddbw %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %a0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphadddq(<4 x i32> %a0) {
- ; CHECK: vphadddq
+; CHECK-LABEL: test_int_x86_xop_vphadddq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphadddq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphaddubd(<16 x i8> %a0) {
- ; CHECK: vphaddubd
+; CHECK-LABEL: test_int_x86_xop_vphaddubd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddubd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddubq(<16 x i8> %a0) {
- ; CHECK: vphaddubq
+; CHECK-LABEL: test_int_x86_xop_vphaddubq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddubq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphaddubw(<16 x i8> %a0) {
- ; CHECK: vphaddubw
+; CHECK-LABEL: test_int_x86_xop_vphaddubw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddubw %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %a0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddudq(<4 x i32> %a0) {
- ; CHECK: vphaddudq
+; CHECK-LABEL: test_int_x86_xop_vphaddudq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddudq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphadduwd(<8 x i16> %a0) {
- ; CHECK: vphadduwd
+; CHECK-LABEL: test_int_x86_xop_vphadduwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphadduwd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphadduwq(<8 x i16> %a0) {
- ; CHECK: vphadduwq
+; CHECK-LABEL: test_int_x86_xop_vphadduwq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphadduwq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphaddwd(<8 x i16> %a0) {
- ; CHECK: vphaddwd
+; CHECK-LABEL: test_int_x86_xop_vphaddwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddwd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddwq(<8 x i16> %a0) {
- ; CHECK: vphaddwq
+; CHECK-LABEL: test_int_x86_xop_vphaddwq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddwq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphsubbw(<16 x i8> %a0) {
- ; CHECK: vphsubbw
+; CHECK-LABEL: test_int_x86_xop_vphsubbw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubbw %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %a0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphsubdq(<4 x i32> %a0) {
- ; CHECK: vphsubdq
+; CHECK-LABEL: test_int_x86_xop_vphsubdq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubdq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %a0) ;
ret <2 x i64> %res
}
define <2 x i64> @test_int_x86_xop_vphsubdq_mem(<4 x i32>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vphsubdq
+; CHECK-LABEL: test_int_x86_xop_vphsubdq_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubdq (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %a0
%res = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %vec) ;
ret <2 x i64> %res
@@ -651,13 +265,18 @@ define <2 x i64> @test_int_x86_xop_vphsubdq_mem(<4 x i32>* %a0) {
declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphsubwd(<8 x i16> %a0) {
- ; CHECK: vphsubwd
+; CHECK-LABEL: test_int_x86_xop_vphsubwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubwd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %a0) ;
ret <4 x i32> %res
}
define <4 x i32> @test_int_x86_xop_vphsubwd_mem(<8 x i16>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vphsubwd
+; CHECK-LABEL: test_int_x86_xop_vphsubwd_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubwd (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a0
%res = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %vec) ;
ret <4 x i32> %res
@@ -665,90 +284,128 @@ define <4 x i32> @test_int_x86_xop_vphsubwd_mem(<8 x i16>* %a0) {
declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
- ; CHECK: vpmacsdd
+; CHECK-LABEL: test_int_x86_xop_vpmacsdd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
- ; CHECK: vpmacsdqh
+; CHECK-LABEL: test_int_x86_xop_vpmacsdqh:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
- ; CHECK: vpmacsdql
+; CHECK-LABEL: test_int_x86_xop_vpmacsdql:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
- ; CHECK: vpmacssdd
+; CHECK-LABEL: test_int_x86_xop_vpmacssdd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
- ; CHECK: vpmacssdqh
+; CHECK-LABEL: test_int_x86_xop_vpmacssdqh:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
- ; CHECK: vpmacssdql
+; CHECK-LABEL: test_int_x86_xop_vpmacssdql:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
- ; CHECK: vpmacsswd
+; CHECK-LABEL: test_int_x86_xop_vpmacsswd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
- ; CHECK: vpmacssww
+; CHECK-LABEL: test_int_x86_xop_vpmacssww:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
- ; CHECK: vpmacswd
+; CHECK-LABEL: test_int_x86_xop_vpmacswd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
- ; CHECK: vpmacsww
+; CHECK-LABEL: test_int_x86_xop_vpmacsww:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
- ; CHECK: vpmadcsswd
+; CHECK-LABEL: test_int_x86_xop_vpmadcsswd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
- ; CHECK: vpmadcswd
+; CHECK-LABEL: test_int_x86_xop_vpmadcswd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
define <4 x i32> @test_int_x86_xop_vpmadcswd_mem(<8 x i16> %a0, <8 x i16>* %a1, <4 x i32> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpmadcswd
+; CHECK-LABEL: test_int_x86_xop_vpmadcswd_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a1
%res = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %a0, <8 x i16> %vec, <4 x i32> %a2) ;
ret <4 x i32> %res
@@ -756,20 +413,27 @@ define <4 x i32> @test_int_x86_xop_vpmadcswd_mem(<8 x i16> %a0, <8 x i16>* %a1,
declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
- ; CHECK: vpperm
+; CHECK-LABEL: test_int_x86_xop_vpperm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ;
ret <16 x i8> %res
}
define <16 x i8> @test_int_x86_xop_vpperm_rm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpperm
+; CHECK-LABEL: test_int_x86_xop_vpperm_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpperm (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %a2
%res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %vec) ;
ret <16 x i8> %res
}
define <16 x i8> @test_int_x86_xop_vpperm_mr(<16 x i8> %a0, <16 x i8>* %a1, <16 x i8> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpperm
+; CHECK-LABEL: test_int_x86_xop_vpperm_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpperm %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %a1
%res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %vec, <16 x i8> %a2) ;
ret <16 x i8> %res
@@ -777,125 +441,177 @@ define <16 x i8> @test_int_x86_xop_vpperm_mr(<16 x i8> %a0, <16 x i8>* %a1, <16
declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vprotb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vprotb
+; CHECK-LABEL: test_int_x86_xop_vprotb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a0, <16 x i8> %a1) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vprotd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vprotd
+; CHECK-LABEL: test_int_x86_xop_vprotd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a0, <4 x i32> %a1) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vprotq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vprotq
+; CHECK-LABEL: test_int_x86_xop_vprotq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vprotw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vprotw
+; CHECK-LABEL: test_int_x86_xop_vprotw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a0, <8 x i16> %a1) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vprotbi(<16 x i8> %a0) {
- ; CHECK: vprotb
+; CHECK-LABEL: test_int_x86_xop_vprotbi:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotb $1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %a0, i8 1) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
define <4 x i32> @test_int_x86_xop_vprotdi(<4 x i32> %a0) {
- ; CHECK: vprotd
+; CHECK-LABEL: test_int_x86_xop_vprotdi:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotd $254, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %a0, i8 -2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vprotqi(<2 x i64> %a0) {
- ; CHECK: vprotq
+; CHECK-LABEL: test_int_x86_xop_vprotqi:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotq $3, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 3) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
define <8 x i16> @test_int_x86_xop_vprotwi(<8 x i16> %a0) {
- ; CHECK: vprotw
+; CHECK-LABEL: test_int_x86_xop_vprotwi:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotw $252, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %a0, i8 -4) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpshab(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpshab
+; CHECK-LABEL: test_int_x86_xop_vpshab:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a0, <16 x i8> %a1) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpshad(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpshad
+; CHECK-LABEL: test_int_x86_xop_vpshad:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a0, <4 x i32> %a1) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpshaq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpshaq
+; CHECK-LABEL: test_int_x86_xop_vpshaq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpshaw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpshaw
+; CHECK-LABEL: test_int_x86_xop_vpshaw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a0, <8 x i16> %a1) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpshlb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpshlb
+; CHECK-LABEL: test_int_x86_xop_vpshlb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a0, <16 x i8> %a1) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpshld(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpshld
+; CHECK-LABEL: test_int_x86_xop_vpshld:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a0, <4 x i32> %a1) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpshlq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpshlq
+; CHECK-LABEL: test_int_x86_xop_vpshlq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpshlw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpshlw
+; CHECK-LABEL: test_int_x86_xop_vpshlw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a0, <8 x i16> %a1) ;
ret <8 x i16> %res
}
define <8 x i16> @test_int_x86_xop_vpshlw_rm(<8 x i16> %a0, <8 x i16>* %a1) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpshlw
+; CHECK-LABEL: test_int_x86_xop_vpshlw_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlw (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a1
%res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a0, <8 x i16> %vec) ;
ret <8 x i16> %res
}
define <8 x i16> @test_int_x86_xop_vpshlw_mr(<8 x i16>* %a0, <8 x i16> %a1) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpshlw
+; CHECK-LABEL: test_int_x86_xop_vpshlw_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlw %xmm0, (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a0
%res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %vec, <8 x i16> %a1) ;
ret <8 x i16> %res
@@ -903,14 +619,18 @@ define <8 x i16> @test_int_x86_xop_vpshlw_mr(<8 x i16>* %a0, <8 x i16> %a1) {
declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0) {
- ; CHECK-NOT: mov
- ; CHECK: vfrczss
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ss:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczss %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0) ;
ret <4 x float> %res
}
define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(float* %a0) {
- ; CHECK-NOT: mov
- ; CHECK: vfrczss
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ss_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczss (%rdi), %xmm0
+; CHECK-NEXT: retq
%elem = load float, float* %a0
%vec = insertelement <4 x float> undef, float %elem, i32 0
%res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %vec) ;
@@ -919,14 +639,18 @@ define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(float* %a0) {
declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0) {
- ; CHECK-NOT: mov
- ; CHECK: vfrczsd
+; CHECK-LABEL: test_int_x86_xop_vfrcz_sd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczsd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0) ;
ret <2 x double> %res
}
define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(double* %a0) {
- ; CHECK-NOT: mov
- ; CHECK: vfrczsd
+; CHECK-LABEL: test_int_x86_xop_vfrcz_sd_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczsd (%rdi), %xmm0
+; CHECK-NEXT: retq
%elem = load double, double* %a0
%vec = insertelement <2 x double> undef, double %elem, i32 0
%res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %vec) ;
@@ -935,13 +659,18 @@ define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(double* %a0) {
declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
define <2 x double> @test_int_x86_xop_vfrcz_pd(<2 x double> %a0) {
- ; CHECK: vfrczpd
+; CHECK-LABEL: test_int_x86_xop_vfrcz_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczpd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0) ;
ret <2 x double> %res
}
define <2 x double> @test_int_x86_xop_vfrcz_pd_mem(<2 x double>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vfrczpd
+; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczpd (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <2 x double>, <2 x double>* %a0
%res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %vec) ;
ret <2 x double> %res
@@ -949,15 +678,18 @@ define <2 x double> @test_int_x86_xop_vfrcz_pd_mem(<2 x double>* %a0) {
declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
define <4 x double> @test_int_x86_xop_vfrcz_pd_256(<4 x double> %a0) {
- ; CHECK: vfrczpd
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczpd %ymm0, %ymm0
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0) ;
ret <4 x double> %res
}
define <4 x double> @test_int_x86_xop_vfrcz_pd_256_mem(<4 x double>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vfrczpd
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_256_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczpd (%rdi), %ymm0
+; CHECK-NEXT: retq
%vec = load <4 x double>, <4 x double>* %a0
%res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %vec) ;
ret <4 x double> %res
@@ -965,13 +697,18 @@ define <4 x double> @test_int_x86_xop_vfrcz_pd_256_mem(<4 x double>* %a0) {
declare <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double>) nounwind readnone
define <4 x float> @test_int_x86_xop_vfrcz_ps(<4 x float> %a0) {
- ; CHECK: vfrczps
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczps %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0) ;
ret <4 x float> %res
}
define <4 x float> @test_int_x86_xop_vfrcz_ps_mem(<4 x float>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vfrczps
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczps (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <4 x float>, <4 x float>* %a0
%res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %vec) ;
ret <4 x float> %res
@@ -979,15 +716,18 @@ define <4 x float> @test_int_x86_xop_vfrcz_ps_mem(<4 x float>* %a0) {
declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
define <8 x float> @test_int_x86_xop_vfrcz_ps_256(<8 x float> %a0) {
- ; CHECK: vfrczps
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczps %ymm0, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0) ;
ret <8 x float> %res
}
define <8 x float> @test_int_x86_xop_vfrcz_ps_256_mem(<8 x float>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vfrczps
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_256_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczps (%rdi), %ymm0
+; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %a0
%res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %vec) ;
ret <8 x float> %res
@@ -995,56 +735,80 @@ define <8 x float> @test_int_x86_xop_vfrcz_ps_256_mem(<8 x float>* %a0) {
declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK:vpcomb
+; CHECK-LABEL: test_int_x86_xop_vpcomb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomw
+; CHECK-LABEL: test_int_x86_xop_vpcomw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomd
+; CHECK-LABEL: test_int_x86_xop_vpcomd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomq
+; CHECK-LABEL: test_int_x86_xop_vpcomq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK:vpcomub
+; CHECK-LABEL: test_int_x86_xop_vpcomub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomuw
+; CHECK-LABEL: test_int_x86_xop_vpcomuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomud
+; CHECK-LABEL: test_int_x86_xop_vpcomud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomuq
+; CHECK-LABEL: test_int_x86_xop_vpcomuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
ret <2 x i64> %res
}
diff --git a/test/CodeGen/X86/xop-mask-comments.ll b/test/CodeGen/X86/xop-mask-comments.ll
new file mode 100644
index 000000000000..e4cc9101777d
--- /dev/null
+++ b/test/CodeGen/X86/xop-mask-comments.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X64
+
+;
+; VPPERM
+;
+
+define <16 x i8> @vpperm_shuffle_unary(<16 x i8> %a0) {
+; X32-LABEL: vpperm_shuffle_unary:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_unary:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @vpperm_shuffle_unary_undef(<16 x i8> %a0) {
+; X32-LABEL: vpperm_shuffle_unary_undef:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_unary_undef:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> undef, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @vpperm_shuffle_unary_zero(<16 x i8> %a0) {
+; X32-LABEL: vpperm_shuffle_unary_zero:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3],zero,xmm0[1],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_unary_zero:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3],zero,xmm0[1],zero
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 130, i8 17, i8 128>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @vpperm_shuffle_binary(<16 x i8> %a0, <16 x i8> %a1) {
+; X32-LABEL: vpperm_shuffle_binary:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],xmm1[3],xmm0[2],xmm1[1],xmm0[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_binary:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],xmm1[3],xmm0[2],xmm1[1],xmm0[0]
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @vpperm_shuffle_binary_zero(<16 x i8> %a0, <16 x i8> %a1) {
+; X32-LABEL: vpperm_shuffle_binary_zero:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_binary_zero:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],zero,zero,zero,zero
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 147, i8 130, i8 145, i8 128>)
+ ret <16 x i8> %1
+}
+
+; we can't decode vpperm's other permute ops
+define <16 x i8> @vpperm_shuffle_general(<16 x i8> %a0, <16 x i8> %a1) {
+; X32-LABEL: vpperm_shuffle_general:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_general:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 179, i8 162, i8 177, i8 160>)
+ ret <16 x i8> %1
+}
+
+;
+; VPERMIL2
+;
+
+define <2 x double> @vpermil2pd_21(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: vpermil2pd_21:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2pd_21:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> <i64 4, i64 2>, i8 0)
+ ret <2 x double> %1
+}
+
+define <4 x double> @vpermil2pd256_0062(<4 x double> %a0, <4 x double> %a1) {
+; X32-LABEL: vpermil2pd256_0062:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0,0],ymm1[2],ymm0[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2pd256_0062:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0,0],ymm1[2],ymm0[2]
+; X64-NEXT: retq
+ %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> <i64 0, i64 0, i64 4, i64 0>, i8 0)
+ ret <4 x double> %1
+}
+
+define <4 x double> @vpermil2pd256_zz73(<4 x double> %a0, <4 x double> %a1) {
+; X32-LABEL: vpermil2pd256_zz73:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = zero,zero,ymm1[3],ymm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2pd256_zz73:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = zero,zero,ymm1[3],ymm0[3]
+; X64-NEXT: retq
+ %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> <i64 0, i64 0, i64 14, i64 10>, i8 3)
+ ret <4 x double> %1
+}
+
+define <4 x float> @vpermil2ps_0561(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: vpermil2ps_0561:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2ps_0561:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[1]
+; X64-NEXT: retq
+ %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 1>, i8 0)
+ ret <4 x float> %1
+}
+
+define <8 x float> @vpermil2ps256_098144FE(<8 x float> %a0, <8 x float> %a1) {
+; X32-LABEL: vpermil2ps256_098144FE:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[1,0],ymm0[1,4,4],ymm1[7,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2ps256_098144FE:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[1,0],ymm0[1,4,4],ymm1[7,6]
+; X64-NEXT: retq
+ %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 5, i32 4, i32 1, i32 0, i32 0, i32 7, i32 6>, i8 0)
+ ret <8 x float> %1
+}
+
+define <8 x float> @vpermil2ps256_0zz8BzzA(<8 x float> %a0, <8 x float> %a1) {
+; X32-LABEL: vpermil2ps256_0zz8BzzA:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],zero,zero,ymm1[0,7],zero,zero,ymm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2ps256_0zz8BzzA:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],zero,zero,ymm1[0,7],zero,zero,ymm1[6]
+; X64-NEXT: retq
+ %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 8, i32 4, i32 7, i32 8, i32 8, i32 6>, i8 2)
+ ret <8 x float> %1
+}
+
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
+
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
diff --git a/test/CodeGen/X86/xray-attribute-instrumentation.ll b/test/CodeGen/X86/xray-attribute-instrumentation.ll
new file mode 100644
index 000000000000..9e2d8934e98f
--- /dev/null
+++ b/test/CodeGen/X86/xray-attribute-instrumentation.ll
@@ -0,0 +1,13 @@
+; RUN: llc -filetype=asm -o - -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" {
+; CHECK-LABEL: Lxray_sled_0:
+; CHECK-NEXT: .p2align 2, 0x90
+; CHECK-NEXT: .ascii "\353\t"
+; CHECK-NEXT: nopw 512(%rax,%rax)
+; CHECK-LABEL: Ltmp0:
+ ret i32 0
+; CHECK-LABEL: Lxray_sled_1:
+; CHECK-NEXT: retq
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+}
diff --git a/test/CodeGen/X86/xray-selective-instrumentation-miss.ll b/test/CodeGen/X86/xray-selective-instrumentation-miss.ll
new file mode 100644
index 000000000000..5b57e2541156
--- /dev/null
+++ b/test/CodeGen/X86/xray-selective-instrumentation-miss.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mcpu=nehalem < %s | not grep xray_sled_
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-apple-darwin8"
+
+define i32 @foo() nounwind uwtable "xray-instruction-threshold"="3" {
+entry:
+ ret i32 0
+}
diff --git a/test/CodeGen/X86/xray-selective-instrumentation.ll b/test/CodeGen/X86/xray-selective-instrumentation.ll
new file mode 100644
index 000000000000..4368161a2b30
--- /dev/null
+++ b/test/CodeGen/X86/xray-selective-instrumentation.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mcpu=nehalem < %s | grep xray_sled_
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-apple-darwin8"
+
+define i32 @foo() nounwind uwtable "xray-instruction-threshold"="1" {
+entry:
+ ret i32 0
+}
diff --git a/test/CodeGen/X86/zext-fold.ll b/test/CodeGen/X86/zext-fold.ll
index a10923f7a80f..6aca4f40f0aa 100644
--- a/test/CodeGen/X86/zext-fold.ll
+++ b/test/CodeGen/X86/zext-fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -enable-misched=false | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux -enable-misched=false | FileCheck %s
;; Simple case
define i32 @test1(i8 %x) nounwind readnone {
@@ -35,7 +35,8 @@ define void @test3(i8 %x) nounwind readnone {
}
; CHECK: test3
; CHECK: movzbl {{[0-9]+}}(%esp), [[REGISTER:%e[a-z]{2}]]
-; CHECK-NEXT: movl [[REGISTER]], 4(%esp)
+; CHECK: subl $8, %esp
+; CHECK-NEXT: pushl [[REGISTER]]
; CHECK-NEXT: andl $224, [[REGISTER]]
-; CHECK-NEXT: movl [[REGISTER]], (%esp)
+; CHECK-NEXT: pushl [[REGISTER]]
; CHECK-NEXT: call{{.*}}use
diff --git a/test/CodeGen/XCore/align.ll b/test/CodeGen/XCore/align.ll
index 2878a648e09b..53efa3962909 100644
--- a/test/CodeGen/XCore/align.ll
+++ b/test/CodeGen/XCore/align.ll
@@ -1,13 +1,13 @@
; RUN: llc < %s -march=xcore | FileCheck %s
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK-LABEL: f:
define void @f() nounwind {
entry:
ret void
}
-; CHECK: .align 2
+; CHECK: .p2align 1
; CHECK-LABEL: g:
define void @g() nounwind optsize {
entry:
diff --git a/test/CodeGen/XCore/dwarf_debug.ll b/test/CodeGen/XCore/dwarf_debug.ll
index 6c8f389e8a98..d9b2f24d9970 100644
--- a/test/CodeGen/XCore/dwarf_debug.ll
+++ b/test/CodeGen/XCore/dwarf_debug.ll
@@ -23,11 +23,10 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, isOptimized: false, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "test.c", directory: "")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "f", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, scopeLine: 2, file: !1, scope: !1, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "f", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 2, file: !1, scope: !1, type: !6, variables: !2)
!6 = !DISubroutineType(types: !7)
!7 = !{!8, !8}
!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/XCore/epilogue_prologue.ll b/test/CodeGen/XCore/epilogue_prologue.ll
index 923cc4a09e05..b6813f28558e 100644
--- a/test/CodeGen/XCore/epilogue_prologue.ll
+++ b/test/CodeGen/XCore/epilogue_prologue.ll
@@ -101,7 +101,7 @@ entry:
; FP + large frame: spill FP+SR+R4+LR = entsp 3 + 200000 + extsp 1
; CHECKFP: .section .cp.rodata.cst4,"aMc",@progbits,4
-; CHECKFP-NEXT: .align 4
+; CHECKFP-NEXT: .p2align 2
; CHECKFP-NEXT: .LCPI[[CNST0:[0-9_]+]]:
; CHECKFP-NEXT: .long 200002
; CHECKFP-NEXT: .LCPI[[CNST1:[0-9_]+]]:
@@ -154,7 +154,7 @@ entry:
;
; !FP + large frame: spill SR+SR+R4+LR = entsp 4 + 200000
; CHECK: .section .cp.rodata.cst4,"aMc",@progbits,4
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LCPI[[CNST0:[0-9_]+]]:
; CHECK-NEXT: .long 200003
; CHECK-NEXT: .LCPI[[CNST1:[0-9_]+]]:
diff --git a/test/CodeGen/XCore/linkage.ll b/test/CodeGen/XCore/linkage.ll
index 7384fe7bcf09..ff07a261fc50 100644
--- a/test/CodeGen/XCore/linkage.ll
+++ b/test/CodeGen/XCore/linkage.ll
@@ -42,9 +42,8 @@ define protected void @test_protected() {
; CHECK-NOT: .hidden test_hidden_declaration
-; CHECK: .weak gr
-@gr = extern_weak global i32
-
; CHECK: .weak fr
declare extern_weak void @fr(i32*, i32*)
+; CHECK: .weak gr
+@gr = extern_weak global i32
diff --git a/test/CodeGen/XCore/scavenging.ll b/test/CodeGen/XCore/scavenging.ll
index 7b6f54ebec24..b46c75a4aaf6 100644
--- a/test/CodeGen/XCore/scavenging.ll
+++ b/test/CodeGen/XCore/scavenging.ll
@@ -53,7 +53,7 @@ declare void @g(i32*, i32*)
; CHECK: .section .cp.rodata.cst4,"aMc",@progbits,4
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK: [[ARG5:.LCPI[0-9_]+]]:
; CHECK: .long 100003
; CHECK: [[INDEX0:.LCPI[0-9_]+]]:
diff --git a/test/CodeGen/XCore/threads.ll b/test/CodeGen/XCore/threads.ll
index 0c25314295d7..30dda143e084 100644
--- a/test/CodeGen/XCore/threads.ll
+++ b/test/CodeGen/XCore/threads.ll
@@ -87,7 +87,7 @@ define i32* @f_tle() {
; CHECK: shl [[R0:r[0-9]]], r11, 3
; CHECK: ldaw [[R1:r[0-9]]], dp[tle]
; r0 = &tl + id*8
-; CHECK: add r0, [[R1]], [[R0]]
+; CHECK: add r0, [[R0]], [[R1]]
ret i32* getelementptr inbounds ([2 x i32], [2 x i32]* @tle, i32 0, i32 0)
}
@@ -96,7 +96,7 @@ define i32 @f_tlExpr () {
; CHECK: get r11, id
; CHECK: shl [[R0:r[0-9]]], r11, 3
; CHECK: ldaw [[R1:r[0-9]]], dp[tle]
-; CHECK: add [[R2:r[0-9]]], [[R1]], [[R0]]
+; CHECK: add [[R2:r[0-9]]], [[R0]], [[R1]]
; CHECK: add r0, [[R2]], [[R2]]
ret i32 add(
i32 ptrtoint( i32* getelementptr inbounds ([2 x i32], [2 x i32]* @tle, i32 0, i32 0) to i32),